diff --git a/.ci/docker/Dockerfile b/.ci/docker/Dockerfile index a0b137efe8d..cb591488a76 100644 --- a/.ci/docker/Dockerfile +++ b/.ci/docker/Dockerfile @@ -5,16 +5,9 @@ FROM nvidia/cuda:12.1.0-runtime-ubuntu20.04@sha256:c1869c30f46fff478a37ed58d9dace7e08519541274f03424d0b78bd35b2c73a AS python_base_cuda LABEL maintainer="OpenVINO Training Extensions Development Team" -ARG HTTP_PROXY -ARG HTTPS_PROXY -ARG NO_PROXY ARG uid ARG gid -# Setup proxies -ENV http_proxy=$HTTP_PROXY -ENV https_proxy=$HTTPS_PROXY -ENV no_proxy=$NO_PROXY ENV DEBIAN_FRONTEND="noninteractive" # hadolint ignore=DL3008 diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 063c765b4ac..28d1c40f3f8 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -53,9 +53,6 @@ fi TAG=$1 docker build -f ./Dockerfile \ ---build-arg HTTP_PROXY="${http_proxy:?}" \ ---build-arg HTTPS_PROXY="${https_proxy:?}" \ ---build-arg NO_PROXY="${no_proxy:?}" \ --build-arg ACTIONS_RUNNER_VER="$ACTIONS_RUNNER_VER" \ --build-arg gid="$(id -g)" \ --build-arg uid="$UID" \ diff --git a/.ci/docker/start-runner.sh b/.ci/docker/start-runner.sh index 2e38efcd0d3..dd1fbf5cc31 100755 --- a/.ci/docker/start-runner.sh +++ b/.ci/docker/start-runner.sh @@ -1,7 +1,7 @@ #!/bin/bash GPU_ID="all" -VER_CUDA="11.7.1" +VER_CUDA="12.1.0" TAG_RUNNER="latest" ADDITIONAL_LABELS="" MOUNT_PATH="" @@ -149,9 +149,6 @@ if [ "$DEBUG_CONTAINER" = true ]; then --name "$CONTAINER_NAME" \ -e NVIDIA_VISIBLE_DEVICES="$GPU_ID" \ ${ENV_FLAGS} \ - -e http_proxy=http://proxy-chain.intel.com:911 \ - -e https_proxy=http://proxy-chain.intel.com:912 \ - -e no_proxy=intel.com,.intel.com,localhost,127.0.0.0/8 \ ${MOUNT_FLAGS} \ ${CACHE_MOUNT_FLAGS} \ "$DOCKER_REG_ADDR"/ote/ci/cu"$VER_CUDA"/runner:"$TAG_RUNNER"; RET=$? @@ -172,9 +169,6 @@ else --name "$CONTAINER_NAME" \ -e NVIDIA_VISIBLE_DEVICES="$GPU_ID" \ ${ENV_FLAGS} \ - -e http_proxy=http://proxy-chain.intel.com:911 \ - -e https_proxy=http://proxy-chain.intel.com:912 \ - -e no_proxy=intel.com,.intel.com,localhost,127.0.0.0/8 \ ${MOUNT_FLAGS} \ ${CACHE_MOUNT_FLAGS} \ "$DOCKER_REG_ADDR"/ote/ci/cu"$VER_CUDA"/runner:"$TAG_RUNNER"; RET=$? 
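Context for the CI changes above: with the proxy build args and the hard-coded `-e http_proxy=.../https_proxy=.../no_proxy=...` flags removed, proxy settings are no longer baked into the image or the runner scripts. On a runner host that still sits behind a proxy, the standard Docker client configuration can supply them instead; a minimal sketch of `~/.docker/config.json` (the proxy URLs below are placeholders, not values from this repository):

{
  "proxies": {
    "default": {
      "httpProxy": "http://proxy.example.com:911",
      "httpsProxy": "http://proxy.example.com:912",
      "noProxy": "localhost,127.0.0.0/8"
    }
  }
}

With this client-side configuration, Docker injects the corresponding http_proxy/https_proxy/no_proxy environment variables into builds and started containers automatically, so neither the removed Dockerfile ARG/ENV block nor per-container `-e` flags in start-runner.sh are needed.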
diff --git a/.github/workflows/pre_merge.yaml b/.github/workflows/pre_merge.yaml index b84dbb2b93e..f1964e4b64b 100644 --- a/.github/workflows/pre_merge.yaml +++ b/.github/workflows/pre_merge.yaml @@ -101,9 +101,6 @@ jobs: - task: "multi_cls_classification" - task: "multi_label_classification" - task: "hlabel_classification" - - task: "detection" - - task: "instance_segmentation" - - task: "semantic_segmentation" - task: "visual_prompting" - task: "zero_shot_visual_prompting" - task: "anomaly_classification" @@ -127,3 +124,32 @@ jobs: rm /tmp/requirements.txt - name: Run Integration Test run: tox -vv -e integration-test-${{ matrix.task }} + Integration-Test-Large: + if: | + github.event.pull_request.draft == false && + !(startsWith(github.event.pull_request.title, '[WIP]')) + runs-on: [self-hosted, linux, x64, dev, dmount] + needs: Unit-Test + strategy: + fail-fast: false + matrix: + include: + - task: "detection" + - task: "instance_segmentation" + - task: "semantic_segmentation" + name: Integration-Test-Large-${{ matrix.task }}-py310 + steps: + - name: Checkout repository + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: Install Python + uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f # v5.1.1 + with: + python-version: "3.10" + - name: Install tox + run: | + python -m pip install --require-hashes --no-deps -r .ci/requirements.txt + pip-compile --generate-hashes --output-file=/tmp/requirements.txt --extra=ci_tox pyproject.toml + python -m pip install --require-hashes --no-deps -r /tmp/requirements.txt + rm /tmp/requirements.txt + - name: Run Integration Test + run: tox -vv -e integration-test-${{ matrix.task }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 15130d89995..f39500d2c8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,78 +7,82 @@ All notable changes to this project will be documented in this file. 
### New features - Turn on/off classification augmentations - (https://github.com/openvinotoolkit/training_extensions/pull/4039) + () ### Enhancements - Update visual prompting pipeline for multi-label zero-shot learning support - (https://github.com/openvinotoolkit/training_extensions/pull/3993) + () - Update to work torch compile in detection - (https://github.com/openvinotoolkit/training_extensions/pull/4003) + () - Refactor MaskDINO - (https://github.com/openvinotoolkit/training_extensions/pull/4006) + () - Fix MaskRCNN/RTMDet-Inst/MaskRCNNTV Explain Mode - (https://github.com/openvinotoolkit/training_extensions/pull/4053) + () ## \[2.3.0\] ### New features - Add YOLOv9 model for Object Detection - (https://github.com/openvinotoolkit/training_extensions/pull/3917) + (, ) - Add OV inference for keypoint detection - (https://github.com/openvinotoolkit/training_extensions/pull/3970) + () - Add tiling for semantic segmentation - (https://github.com/openvinotoolkit/training_extensions/pull/3954) + () - Add 3D Object Detection task with MonoDETR model - (https://github.com/openvinotoolkit/training_extensions/pull/3979) + () +- Add OpenVINO inference for 3D Object Detection task + () ### Enhancements - Upgrade OV, MAPI, and NNCF dependencies - (https://github.com/openvinotoolkit/training_extensions/pull/3967) + () - Instance Segmentation Model refactoring - (https://github.com/openvinotoolkit/training_extensions/pull/3865) + () - Bump torch and lightning to 2.4.0 versions - (https://github.com/openvinotoolkit/training_extensions/pull/3843) + () - Add mAP metric to evaluate multilabel classification - (https://github.com/openvinotoolkit/training_extensions/pull/3985) + () ### Bug fixes - Fix a wrong HPO log - (https://github.com/openvinotoolkit/training_extensions/pull/3972) + () +- Update model name in rotated detection recipes + () ## \[2.2.0\] ### New features - Add RT-DETR model for Object Detection - (https://github.com/openvinotoolkit/training_extensions/pull/3741) + () - Add Multi-Label & H-label Classification with torchvision models - (https://github.com/openvinotoolkit/training_extensions/pull/3697) + () - Add Hugging-Face Model Wrapper for Classification - (https://github.com/openvinotoolkit/training_extensions/pull/3710) + () - Add LoRA finetuning capability for ViT Architectures - (https://github.com/openvinotoolkit/training_extensions/pull/3729) + () - Add Hugging-Face Model Wrapper for Object Detection - (https://github.com/openvinotoolkit/training_extensions/pull/3747) + () - Add Hugging-Face Model Wrapper for Semantic Segmentation - (https://github.com/openvinotoolkit/training_extensions/pull/3749) + () - Enable torch.compile to work with classification - (https://github.com/openvinotoolkit/training_extensions/pull/3758) + () - Add `otx benchmark` subcommand - (https://github.com/openvinotoolkit/training_extensions/pull/3762) + () - Add RTMPose for Keypoint Detection Task - (https://github.com/openvinotoolkit/training_extensions/pull/3781) + (, ) - Add Semi-SL MeanTeacher algorithm for Semantic Segmentation - (https://github.com/openvinotoolkit/training_extensions/pull/3801) + () - Update head and h-label format for hierarchical label classification - (https://github.com/openvinotoolkit/training_extensions/pull/3810) + () - Support configurable input size - (https://github.com/openvinotoolkit/training_extensions/pull/3788) + () - Add diffusion task - (https://github.com/openvinotoolkit/training_extensions/pull/3875) + () ### Enhancements @@ -106,9 +110,21 @@ All notable 
changes to this project will be documented in this file. () - Change sematic segmentation to consider bbox only annotations () +- Relieve memory usage criteria on batch size 2 during adaptive batch size + () +- Remove background label from RT Info for segmentation task + () +- Prevent using too low confidence thresholds in detection + () +- Update HPO interface + () +- Bump onnx to 1.17.0 to omit CVE-2024-5187 + () ### Bug fixes +- Update anomaly base transforms to use square resizing + () - Fix Combined Dataloader & unlabeled warmup loss in Semi-SL () - Revert #3579 to fix issues with replacing coco_instance with a different format in some dataset @@ -121,6 +137,30 @@ All notable changes to this project will be documented in this file. () - Fix config converter for tiling () +- Fix `BboxOverlaps2D` handling of empty ground-truth annotations in datasets. + () +- Fix num_trials calculation on dataset length less than num_class + () +- Fix out_features in HierarchicalCBAMClsHead + () +- Fix multilabel_accuracy of MixedHLabelAccuracy + () +- Fix wrong indices setting in HLabelInfo + () +- Add legacy template LiteHRNet_18 template + () +- Model templates: rename model_status value 'DISCONTINUED' to 'OBSOLETE' + () +- Enable export of feature vectors for semantic segmentation task + () +- Update MRCNN model export to include feature vector and saliency map + () +- Upgrade MAPI in 2.2 + () +- Fix applying model's hparams when loading model from checkpoint + () +- Fix incorrect all_groups order configuration in HLabelInfo + () ## \[v2.1.0\] diff --git a/README.md b/README.md index f42741bb689..435415abb0f 100644 --- a/README.md +++ b/README.md @@ -197,6 +197,9 @@ In addition to the examples above, please refer to the documentation for tutoria - Include full image with anno in case there's no tile in tile dataset - Add type checker in converter for callable functions (optimizer, scheduler) - Change sematic segmentation to consider bbox only annotations +- Relieve memory usage criteria on batch size 2 during adaptive batch size +- Remove background label from RT Info for segmentation task +- Prevent using too low confidence thresholds in detection ### Bug fixes @@ -206,6 +209,10 @@ In addition to the examples above, please refer to the documentation for tutoria - Add missing tile recipes and various tile recipe changes - Change categories mapping logic - Fix config converter for tiling +- Fix num_trials calculation on dataset length less than num_class +- Fix out_features in HierarchicalCBAMClsHead +- Fix multilabel_accuracy of MixedHLabelAccuracy +- Fix wrong indices setting in HLabelInfo ### Known issues diff --git a/docs/source/guide/release_notes/index.rst b/docs/source/guide/release_notes/index.rst index a0e0954c2a8..e0b8dc86383 100644 --- a/docs/source/guide/release_notes/index.rst +++ b/docs/source/guide/release_notes/index.rst @@ -4,7 +4,7 @@ Releases .. 
toctree:: :maxdepth: 1 -v2.2.0 (2024.09) +v2.2.0 (2024.10) ---------------- New features @@ -38,6 +38,9 @@ Enhancements - Include full image with anno in case there's no tile in tile dataset - Add type checker in converter for callable functions (optimizer, scheduler) - Change sematic segmentation to consider bbox only annotations +- Relieve memory usage criteria on batch size 2 during adaptive batch size +- Remove background label from RT Info for segmentation task +- Prevent using too low confidence thresholds in detection Bug fixes ^^^^^^^^^ @@ -48,6 +51,10 @@ Bug fixes - Add missing tile recipes and various tile recipe changes - Change categories mapping logic - Fix config converter for tiling +- Fix num_trials calculation on dataset length less than num_class +- Fix out_features in HierarchicalCBAMClsHead +- Fix multilabel_accuracy of MixedHLabelAccuracy +- Fix wrong indices setting in HLabelInfo v2.1.0 (2024.07) ---------------- diff --git a/pyproject.toml b/pyproject.toml index c9eba50c14f..d32427f7149 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -81,8 +81,8 @@ xpu = [ "timm==1.0.3", "openvino==2024.4", "openvino-dev==2024.4", - "openvino-model-api==0.2.4", - "onnx==1.16.2", + "openvino-model-api==0.2.5", + "onnx==1.17.0", "onnxconverter-common==1.14.0", "nncf==2.13.0", "anomalib[core]==1.1.0", @@ -96,7 +96,7 @@ base = [ "openvino==2024.4", "openvino-dev==2024.4", "openvino-model-api==0.2.4", - "onnx==1.16.2", + "onnx==1.17.0", "onnxconverter-common==1.14.0", "nncf==2.13.0", "anomalib[core]==1.1.0", diff --git a/src/otx/algo/classification/heads/hlabel_cls_head.py b/src/otx/algo/classification/heads/hlabel_cls_head.py index f1041d06079..71268bb9ea0 100644 --- a/src/otx/algo/classification/heads/hlabel_cls_head.py +++ b/src/otx/algo/classification/heads/hlabel_cls_head.py @@ -355,7 +355,7 @@ def __init__( self.fc_superclass = nn.Linear(in_channels * self.step_size[0] * self.step_size[1], num_multiclass_heads) self.attention_fc = nn.Linear(num_multiclass_heads, in_channels * self.step_size[0] * self.step_size[1]) self.cbam = CBAM(in_channels) - self.fc_subclass = nn.Linear(in_channels * self.step_size[0] * self.step_size[1], num_single_label_classes) + self.fc_subclass = nn.Linear(in_channels * self.step_size[0] * self.step_size[1], num_classes) self._init_layers() diff --git a/src/otx/algo/common/layers/position_embed.py b/src/otx/algo/common/layers/position_embed.py index 5afe6010a5d..d875e68a25a 100644 --- a/src/otx/algo/common/layers/position_embed.py +++ b/src/otx/algo/common/layers/position_embed.py @@ -70,60 +70,6 @@ def forward(self, tensor_list: NestedTensor | torch.Tensor) -> torch.Tensor: return torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) -class PositionEmbeddingLearned(nn.Module): - """Absolute pos embedding, learned.""" - - def __init__(self, num_pos_feats: int = 256): - """Positional embedding.""" - super().__init__() - self.row_embed = nn.Embedding(50, num_pos_feats) - self.col_embed = nn.Embedding(50, num_pos_feats) - - def forward(self, tensor_list: NestedTensor) -> torch.Tensor: - """Forward pass of the PositionEmbeddingLearned module. - - Args: - tensor_list (NestedTensor): Input tensor. - - Returns: - torch.Tensor: Position embeddings. 
- """ - x = tensor_list.tensors - h, w = x.shape[-2:] - i = torch.arange(w, device=x.device) / w * 49 - j = torch.arange(h, device=x.device) / h * 49 - x_emb = self.get_embed(i, self.col_embed) - y_emb = self.get_embed(j, self.row_embed) - return ( - torch.cat( - [ - x_emb.unsqueeze(0).repeat(h, 1, 1), - y_emb.unsqueeze(1).repeat(1, w, 1), - ], - dim=-1, - ) - .permute(2, 0, 1) - .unsqueeze(0) - .repeat(x.shape[0], 1, 1, 1) - ) - - def get_embed(self, coord: torch.Tensor, embed: nn.Embedding) -> torch.Tensor: - """Get the embedding for the given coordinates. - - Args: - coord (torch.Tensor): The coordinates. - embed (nn.Embedding): The embedding layer. - - Returns: - torch.Tensor: The embedding for the coordinates. - """ - floor_coord = coord.floor() - delta = (coord - floor_coord).unsqueeze(-1) - floor_coord = floor_coord.long() - ceil_coord = (floor_coord + 1).clamp(max=49) - return embed(floor_coord) * (1 - delta) + embed(ceil_coord) * delta - - def gen_sineembed_for_position(pos_tensor: torch.Tensor) -> torch.Tensor: """Generate sine embeddings for position tensor. diff --git a/src/otx/algo/common/losses/cross_focal_loss.py b/src/otx/algo/common/losses/cross_focal_loss.py index bfec15c0c84..457876a5986 100644 --- a/src/otx/algo/common/losses/cross_focal_loss.py +++ b/src/otx/algo/common/losses/cross_focal_loss.py @@ -8,7 +8,7 @@ import torch import torch.nn.functional from torch import Tensor, nn -from torch.cuda.amp import custom_fwd +from torch.amp import custom_fwd from .focal_loss import py_sigmoid_focal_loss @@ -79,7 +79,7 @@ def __init__( self.cls_criterion = cross_sigmoid_focal_loss - @custom_fwd(cast_inputs=torch.float32) + @custom_fwd(cast_inputs=torch.float32, device_type="cuda") def forward( self, pred: Tensor, diff --git a/src/otx/algo/instance_segmentation/rtmdet_inst.py b/src/otx/algo/instance_segmentation/rtmdet_inst.py index 9bf85fbcbe9..3625d46f874 100644 --- a/src/otx/algo/instance_segmentation/rtmdet_inst.py +++ b/src/otx/algo/instance_segmentation/rtmdet_inst.py @@ -130,6 +130,7 @@ def _exporter(self) -> OTXModelExporter: "opset_version": 11, "autograd_inlining": False, }, + # TODO(Eugene): Add XAI support for RTMDetInst output_names=["bboxes", "labels", "masks", "feature_vector", "saliency_map"] if self.explain_mode else None, ) diff --git a/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py b/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py index 02be0e943c7..0911d26050d 100644 --- a/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py +++ b/src/otx/algo/object_detection_3d/backbones/monodetr_resnet.py @@ -9,38 +9,14 @@ import torch import torchvision from torch import nn +from torchvision.models import get_model_weights from torchvision.models._utils import IntermediateLayerGetter -from otx.algo.common.layers.position_embed import PositionEmbeddingLearned, PositionEmbeddingSine +from otx.algo.common.layers.position_embed import PositionEmbeddingSine from otx.algo.modules.norm import FrozenBatchNorm2d from otx.algo.object_detection_3d.utils.utils import NestedTensor -def build_position_encoding( - hidden_dim: int, - position_embedding: str | PositionEmbeddingSine | PositionEmbeddingLearned, -) -> PositionEmbeddingSine | PositionEmbeddingLearned: - """Build the position encoding module. - - Args: - hidden_dim (int): The hidden dimension. - position_embedding (Union[str, PositionEmbeddingSine, PositionEmbeddingLearned]): The position embedding type. 
- - Returns: - Union[PositionEmbeddingSine, PositionEmbeddingLearned]: The position encoding module. - """ - n_steps = hidden_dim // 2 - if position_embedding in ("v2", "sine"): - position_embedding = PositionEmbeddingSine(n_steps, normalize=True) - elif position_embedding in ("v3", "learned"): - position_embedding = PositionEmbeddingLearned(n_steps) - else: - msg = f"not supported {position_embedding}" - raise ValueError(msg) - - return position_embedding - - class BackboneBase(nn.Module): """BackboneBase module.""" @@ -85,7 +61,7 @@ def __init__(self, name: str, train_backbone: bool, return_interm_layers: bool, norm_layer = FrozenBatchNorm2d backbone = getattr(torchvision.models, name)( replace_stride_with_dilation=[False, False, dilation], - pretrained=True, + weights=get_model_weights(name).IMAGENET1K_V1, # the same as pretrained=True norm_layer=norm_layer, ) super().__init__(backbone, train_backbone, return_interm_layers) @@ -99,13 +75,13 @@ class Joiner(nn.Sequential): def __init__( self, backbone: nn.Module, - position_embedding: PositionEmbeddingSine | PositionEmbeddingLearned, + position_embedding: PositionEmbeddingSine, ) -> None: """Initialize the Joiner module. Args: backbone (nn.Module): The backbone module. - position_embedding (Union[PositionEmbeddingSine, PositionEmbeddingLearned]): The position embedding module. + position_embedding (PositionEmbeddingSine): The position embedding module. """ super().__init__(backbone, position_embedding) self.strides = backbone.strides @@ -135,7 +111,6 @@ class BackboneBuilder: "return_interm_layers": True, "positional_encoding": { "hidden_dim": 256, - "position_embedding": "sine", }, }, } @@ -144,5 +119,6 @@ def __new__(cls, model_name: str) -> Joiner: """Constructor for Backbone MonoDetr.""" # TODO (Kirill): change backbone to already implemented in OTX backbone = Backbone(**cls.CFG[model_name]) - position_embedding = build_position_encoding(**cls.CFG[model_name]["positional_encoding"]) + n_steps = cls.CFG[model_name]["positional_encoding"]["hidden_dim"] // 2 + position_embedding = PositionEmbeddingSine(n_steps, normalize=True) return Joiner(backbone, position_embedding) diff --git a/src/otx/algo/object_detection_3d/detectors/monodetr.py b/src/otx/algo/object_detection_3d/detectors/monodetr.py index b102a054fe3..3b05a90827a 100644 --- a/src/otx/algo/object_detection_3d/detectors/monodetr.py +++ b/src/otx/algo/object_detection_3d/detectors/monodetr.py @@ -25,10 +25,10 @@ def __init__( backbone: nn.Module, depthaware_transformer: nn.Module, depth_predictor: nn.Module, - criterion: nn.Module, num_classes: int, num_queries: int, num_feature_levels: int, + criterion: nn.Module | None = None, aux_loss: bool = True, with_box_refine: bool = False, init_box: bool = False, @@ -41,7 +41,7 @@ def __init__( backbone (nn.Module): torch module of the backbone to be used. See backbone.py depthaware_transformer (nn.Module): depth-aware transformer architecture. See depth_aware_transformer.py depth_predictor (nn.Module): depth predictor module - criterion (nn.Module): loss criterion module + criterion (nn.Module | None): loss criterion module num_classes (int): number of object classes num_queries (int): number of object queries, ie detection slot. This is the maximal number of objects DETR can detect in a single image. For KITTI, we recommend 50 queries. @@ -149,12 +149,17 @@ def forward( """Forward method of the MonoDETR model. 
Args: - images (list[Tensor]): images for each sample - calibs (Tensor): camera matrices for each sample - img_sizes (Tensor): image sizes for each sample - targets (list[dict[Tensor]): ground truth boxes and labels for each - sample + images (Tensor): images for each sample. + calibs (Tensor): camera matrices for each sample. + img_sizes (Tensor): image sizes for each sample. + targets (list[dict[str, Tensor]): ground truth boxes and labels for each + sample. Defaults to None. mode (str): The mode of operation. Defaults to "predict". + + Returns: + dict[str, Tensor]: A dictionary of tensors. If mode is "loss", the + tensors are the loss values. If mode is "predict", the tensors are + the logits. """ features, pos = self.backbone(images) @@ -230,7 +235,7 @@ def forward( # depth_geo box2d_height_norm = outputs_coord[:, :, 4] + outputs_coord[:, :, 5] - box2d_height = torch.clamp(box2d_height_norm * img_sizes[:, 1:2], min=1.0) + box2d_height = torch.clamp(box2d_height_norm * img_sizes[:, :1], min=1.0) depth_geo = size3d[:, :, 0] / box2d_height * calibs[:, 0, 0].unsqueeze(1) # depth_reg @@ -285,6 +290,9 @@ def forward( ) if mode == "loss": + if self.criterion is None: + msg = "Criterion is not set for the model" + raise ValueError(msg) return self.criterion(outputs=out, targets=targets) return out diff --git a/src/otx/algo/object_detection_3d/heads/depth_predictor.py b/src/otx/algo/object_detection_3d/heads/depth_predictor.py index 4e5037c96d8..87827144b21 100644 --- a/src/otx/algo/object_detection_3d/heads/depth_predictor.py +++ b/src/otx/algo/object_detection_3d/heads/depth_predictor.py @@ -32,6 +32,8 @@ def __init__( depth_min (float): The minimum depth value. depth_max (float): The maximum depth value. hidden_dim (int): The dimension of the hidden layer. + activation (Callable[..., nn.Module], optional): The activation function. + Defaults to nn.ReLU. """ super().__init__() self.depth_max = depth_max diff --git a/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py b/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py index 4269ba1950d..7592c312c05 100644 --- a/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py +++ b/src/otx/algo/object_detection_3d/heads/depthaware_transformer.py @@ -4,7 +4,6 @@ """depth aware transformer head for 3d object detection.""" from __future__ import annotations -import math from typing import Any, Callable, ClassVar import torch @@ -101,84 +100,6 @@ def _reset_parameters(self) -> None: constant_(self.reference_points.bias.data, 0.0) normal_(self.level_embed) - def get_proposal_pos_embed(self, proposals: Tensor) -> Tensor: - """Generate position embeddings for proposal tensor. - - Args: - proposals (Tensor): Proposal tensor of shape (N, L, 6). - - TODO (Kirill): Not used. Remove this function? - - Returns: - Tensor: Position embeddings for proposal tensor of shape (N, L, embedding_dim). 
- """ - num_pos_feats = 128 - temperature = 10000 - scale = 2 * math.pi - - dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=proposals.device) - dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) - # N, L, 6 - proposals = proposals.sigmoid() * scale - # N, L, 6, 128 - pos = proposals[:, :, :, None] / dim_t - # N, L, 6, 64, 2 - return torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2) - - def gen_encoder_output_proposals( - self, - memory: Tensor, - memory_padding_mask: Tensor, - spatial_shapes: list[tuple[int, int]], - ) -> tuple[Tensor, Tensor]: - """Generate encoder output and proposals. - - Args: - memory (Tensor): Memory tensor of shape (N, S, C). - memory_padding_mask (Tensor): Memory padding mask tensor of shape (N, S). - spatial_shapes (List[Tuple[int, int]]): List of spatial shapes. - - TODO (Kirill): Not used. Remove this function? - - Returns: - Tuple[Tensor, Tensor]: Encoder output tensor of shape (N, S, C) and proposals tensor of shape (N, L, 6). - """ - n_, _, _ = memory.shape - proposals = [] - _cur = 0 - for lvl, (h_, w_) in enumerate(spatial_shapes): - mask_flatten_ = memory_padding_mask[:, _cur : (_cur + h_ * w_)].view(n_, h_, w_, 1) - valid_h = torch.sum(~mask_flatten_[:, :, 0, 0], 1) - valid_w = torch.sum(~mask_flatten_[:, 0, :, 0], 1) - - grid_y, grid_x = torch.meshgrid( - torch.linspace(0, h_ - 1, h_, dtype=torch.float32, device=memory.device), - torch.linspace(0, w_ - 1, w_, dtype=torch.float32, device=memory.device), - ) - grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) - - scale = torch.cat([valid_w.unsqueeze(-1), valid_h.unsqueeze(-1)], 1).view(n_, 1, 1, 2) - grid = (grid.unsqueeze(0).expand(n_, -1, -1, -1) + 0.5) / scale - - lr = torch.ones_like(grid) * 0.05 * (2.0**lvl) - tb = torch.ones_like(grid) * 0.05 * (2.0**lvl) - wh = torch.cat((lr, tb), -1) - - proposal = torch.cat((grid, wh), -1).view(n_, -1, 6) - proposals.append(proposal) - _cur += h_ * w_ - output_proposals = torch.cat(proposals, 1) - output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) - output_proposals = torch.log(output_proposals / (1 - output_proposals)) - output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float("inf")) - output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) - - output_memory = memory - output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) - output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) - output_memory = self.enc_output_norm(self.enc_output(output_memory)) - return output_memory, output_proposals - def get_valid_ratio(self, mask: Tensor) -> Tensor: """Calculate the valid ratio of the mask. @@ -616,7 +537,7 @@ def forward( intermediate_reference_dims, ) - return output, reference_points + return output, reference_points, None class DepthAwareTransformerBuilder: diff --git a/src/otx/algo/object_detection_3d/losses/ddn_loss.py b/src/otx/algo/object_detection_3d/losses/ddn_loss.py index e3a4238be03..671033a347a 100644 --- a/src/otx/algo/object_detection_3d/losses/ddn_loss.py +++ b/src/otx/algo/object_detection_3d/losses/ddn_loss.py @@ -22,13 +22,13 @@ def compute_fg_mask( """Compute foreground mask for images. 
Args: - gt_boxes2d [torch.Tensor(B, N, 4)]: 2D box labels - shape [Tuple[int, int]]: Foreground mask desired shape - downsample_factor [int]: Downsample factor for image - device [torch.device]: Foreground mask desired device + gt_boxes2d (torch.Tensor): 2D box labels. + shape (Tuple[int, int]): Foreground mask desired shape. + downsample_factor (int): Downsample factor for image. + device (torch.device): Foreground mask desired device. Returns: - fg_mask [torch.Tensor(shape)]: Foreground mask + fg_mask (torch.Tensor(shape)]: Foreground mask. """ if device is None: device = torch.device("cpu") @@ -58,9 +58,9 @@ def __init__(self, fg_weight: float, bg_weight: float, downsample_factor: int = """Initialize fixed foreground/background loss balancer. Args: - fg_weight [float]: Foreground loss weight - bg_weight [float]: Background loss weight - downsample_factor [int]: Depth map downsample factor + fg_weight (float): Foreground loss weight. + bg_weight (float): Background loss weight. + downsample_factor (int): Depth map downsample factor. """ super().__init__() self.fg_weight = fg_weight @@ -76,12 +76,11 @@ def forward( """Forward pass. Args: - loss [torch.Tensor(B, H, W)]: Pixel-wise loss - gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing + loss (torch.Tensor): Pixel-wise loss. + gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing. Returns: - loss [torch.Tensor(1)]: Total loss after foreground/background balancing - tb_dict [dict[float]]: All losses to log in tensorboard + loss (torch.Tensor): Total loss after foreground/background balancing. """ # Compute masks fg_mask = compute_fg_mask( @@ -120,13 +119,11 @@ def __init__( """Initializes DDNLoss module. Args: - weight [float]: Loss function weight - alpha [float]: Alpha value for Focal Loss - gamma [float]: Gamma value for Focal Loss - disc_cfg [dict]: Depth discretiziation configuration - fg_weight [float]: Foreground loss weight - bg_weight [float]: Background loss weight - downsample_factor [int]: Depth map downsample factor + alpha (float): Alpha value for Focal Loss. + gamma (float): Gamma value for Focal Loss. + fg_weight (float): Foreground loss weight. + bg_weight (float): Background loss weight. + downsample_factor (int): Depth map downsample factor. """ super().__init__() self.balancer = Balancer(downsample_factor=downsample_factor, fg_weight=fg_weight, bg_weight=bg_weight) @@ -146,10 +143,10 @@ def build_target_depth_from_3dcenter( """Builds target depth map from 3D center depth. Args: - depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits - gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing - gt_center_depth [torch.Tensor(B, N)]: 3D center depth - num_gt_per_img: [int]: Number of ground truth boxes per image + depth_logits: (torch.Tensor): Predicted depth logits. + gt_boxes2d (torch.Tensor)): 2D box labels for foreground/background balancing. + gt_center_depth (torch.Tensor): 3D center depth. + num_gt_per_img: (int): Number of ground truth boxes per image. """ b, _, h, w = depth_logits.shape depth_maps = torch.zeros((b, h, w), device=depth_logits.device, dtype=depth_logits.dtype) @@ -185,18 +182,18 @@ def bin_depths( """Converts depth map into bin indices. 
Args: - depth_map [torch.Tensor(H, W)]: Depth Map - mode [string]: Discretiziation mode (See https://arxiv.org/pdf/2005.13423.pdf for more details) - UD: Uniform discretiziation - LID: Linear increasing discretiziation - SID: Spacing increasing discretiziation - depth_min [float]: Minimum depth value - depth_max [float]: Maximum depth value - num_bins [int]: Number of depth bins - target [bool]: Whether the depth bins indices will be used for a target tensor in loss comparison + depth_map (torch.Tensor): Depth Map. + mode (string): Discretiziation mode (See https://arxiv.org/pdf/2005.13423.pdf for more details). + UD: Uniform discretiziation. + LID: Linear increasing discretiziation. + SID: Spacing increasing discretiziation. + depth_min (float): Minimum depth value. + depth_max (float): Maximum depth value. + num_bins (int): Number of depth bins. + target (bool): Whether the depth bins indices will be used for a target tensor in loss comparison. Returns: - indices [torch.Tensor(H, W)]: Depth bin indices + indices (torch.Tensor): Depth bin indices. """ if mode == "UD": bin_size = (depth_max - depth_min) / num_bins @@ -233,13 +230,13 @@ def forward( """Gets depth_map loss. Args: - depth_logits: torch.Tensor(B, D+1, H, W)]: Predicted depth logits - gt_boxes2d [torch.Tensor (B, N, 4)]: 2D box labels for foreground/background balancing - num_gt_per_img: [int]: Number of ground truth boxes per image - gt_center_depth: [torch.Tensor(B, N)]: 3D center depth + depth_logits: (torch.Tensor): Predicted depth logits. + gt_boxes2d (torch.Tensor): 2D box labels for foreground/background balancing. + num_gt_per_img: (int): Number of ground truth boxes per image. + gt_center_depth: (torch.Tensor): 3D center depth. Returns: - loss [torch.Tensor(1)]: Depth classification network loss + loss (torch.Tensor): Depth classification network loss. """ # Bin depth map to create target depth_maps = self.build_target_depth_from_3dcenter(depth_logits, gt_boxes2d, gt_center_depth, num_gt_per_img) diff --git a/src/otx/algo/object_detection_3d/losses/monodetr_loss.py b/src/otx/algo/object_detection_3d/losses/monodetr_loss.py index ebc98d45a51..0f2d85d0565 100644 --- a/src/otx/algo/object_detection_3d/losses/monodetr_loss.py +++ b/src/otx/algo/object_detection_3d/losses/monodetr_loss.py @@ -29,11 +29,10 @@ def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, grou """MonoDETRCriterion. Args: - num_classes: number of object categories, omitting the special no-object category - matcher: module able to compute a matching between targets and proposals - weight_dict: dict containing as key the names of the losses and as values their relative weight. - focal_alpha: alpha in Focal Loss - group_num: number of groups for data parallelism + num_classes (int): number of object categories, omitting the special no-object category. + weight_dict (dict): dict containing as key the names of the losses and as values their relative weight. + focal_alpha (float): alpha in Focal Loss. + group_num (int): number of groups for data parallelism. """ super().__init__() self.num_classes = num_classes @@ -47,7 +46,15 @@ def __init__(self, num_classes: int, weight_dict: dict, focal_alpha: float, grou self.group_num = group_num def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Classification loss.""" + """Classification loss. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. 
+ targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ src_logits = outputs["scores"] idx = self._get_src_permutation_idx(indices) @@ -76,7 +83,15 @@ def loss_labels(self, outputs: dict, targets: list, indices: list, num_boxes: in return {"loss_ce": loss_ce} def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the loss for the 3D center prediction.""" + """Compute the loss for the 3D center prediction. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ idx = self._get_src_permutation_idx(indices) src_3dcenter = outputs["boxes_3d"][:, :, 0:2][idx] target_3dcenter = torch.cat([t["boxes_3d"][:, 0:2][i] for t, (_, i) in zip(targets, indices)], dim=0) @@ -85,7 +100,15 @@ def loss_3dcenter(self, outputs: dict, targets: list, indices: list, num_boxes: return {"loss_center": loss_3dcenter.sum() / num_boxes} def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute l1 loss.""" + """Compute l1 loss. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ idx = self._get_src_permutation_idx(indices) src_2dboxes = outputs["boxes_3d"][:, :, 2:6][idx] target_2dboxes = torch.cat([t["boxes_3d"][:, 2:6][i] for t, (_, i) in zip(targets, indices)], dim=0) @@ -95,7 +118,15 @@ def loss_boxes(self, outputs: dict, targets: list, indices: list, num_boxes: int return {"loss_bbox": loss_bbox.sum() / num_boxes} def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the GIoU loss.""" + """Compute the GIoU loss. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ # giou idx = self._get_src_permutation_idx(indices) src_boxes = outputs["boxes_3d"][idx] @@ -104,7 +135,15 @@ def loss_giou(self, outputs: dict, targets: list, indices: list, num_boxes: int) return {"loss_giou": loss_giou} def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the loss for the depth prediction.""" + """Compute the loss for the depth prediction. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. 
+ indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch + """ idx = self._get_src_permutation_idx(indices) src_depths = outputs["depth"][idx] @@ -117,7 +156,15 @@ def loss_depths(self, outputs: dict, targets: list, indices: list, num_boxes: in return {"loss_depth": depth_loss.sum() / num_boxes} def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the loss for the dimension prediction.""" + """Compute the loss for the dimension prediction. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ idx = self._get_src_permutation_idx(indices) src_dims = outputs["size_3d"][idx] target_dims = torch.cat([t["size_3d"][i] for t, (_, i) in zip(targets, indices)], dim=0) @@ -131,7 +178,15 @@ def loss_dims(self, outputs: dict, targets: list, indices: list, num_boxes: int) return {"loss_dim": dim_loss.sum() / num_boxes} def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Compute the loss for the angle prediction.""" + """Compute the loss for the angle prediction. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. + """ idx = self._get_src_permutation_idx(indices) heading_input = outputs["heading_angle"][idx] target_heading_angle = torch.cat([t["heading_angle"][i] for t, (_, i) in zip(targets, indices)], dim=0) @@ -158,7 +213,15 @@ def loss_angles(self, outputs: dict, targets: list, indices: list, num_boxes: in return {"loss_angle": angle_loss.sum() / num_boxes} def loss_depth_map(self, outputs: dict, targets: list, indices: list, num_boxes: int) -> dict[str, Tensor]: - """Depth map loss.""" + """Depth map loss. + + Args: + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. + indices (list): list of tuples, such that len(indices) == batch_size. + num_boxes (int): number of boxes in the batch. 
+ """ depth_map_logits = outputs["pred_depth_map_logits"] num_gt_per_img = [len(t["boxes"]) for t in targets] @@ -174,6 +237,7 @@ def _get_src_permutation_idx( self, indices: list[tuple[torch.Tensor, torch.Tensor]], ) -> tuple[torch.Tensor, torch.Tensor]: + """Get the indices necessary to compute the loss.""" # permute predictions following indices batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) src_idx = torch.cat([src for (src, _) in indices]) @@ -183,6 +247,7 @@ def _get_tgt_permutation_idx( self, indices: list[tuple[torch.Tensor, torch.Tensor]], ) -> tuple[torch.Tensor, torch.Tensor]: + """Get the indices necessary to compute the loss.""" # permute targets following indices batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) tgt_idx = torch.cat([tgt for (_, tgt) in indices]) @@ -210,9 +275,9 @@ def forward( """This performs the loss computation. Args: - outputs: dict of tensors, see the output specification of the model for the format - targets: list of dicts, such that len(targets) == batch_size. - The expected keys in each dict depends on the losses applied, see each loss' doc + outputs (dict): dict of tensors, see the output specification of the model for the format. + targets (list): list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc. """ outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} group_num = self.group_num if self.training else 1 diff --git a/src/otx/algo/object_detection_3d/monodetr3d.py b/src/otx/algo/object_detection_3d/monodetr3d.py index 2ea42e52f95..18d3c072556 100644 --- a/src/otx/algo/object_detection_3d/monodetr3d.py +++ b/src/otx/algo/object_detection_3d/monodetr3d.py @@ -7,19 +7,13 @@ from typing import Any -import numpy as np import torch -from torch import Tensor -from torchvision.ops import box_convert from otx.algo.object_detection_3d.backbones.monodetr_resnet import BackboneBuilder from otx.algo.object_detection_3d.detectors.monodetr import MonoDETR from otx.algo.object_detection_3d.heads.depth_predictor import DepthPredictor from otx.algo.object_detection_3d.heads.depthaware_transformer import DepthAwareTransformerBuilder from otx.algo.object_detection_3d.losses import MonoDETRCriterion -from otx.algo.object_detection_3d.utils.utils import box_cxcylrtb_to_xyxy -from otx.core.data.entity.base import OTXBatchLossEntity -from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DBatchPredEntity from otx.core.exporter.base import OTXModelExporter from otx.core.exporter.detection_3d import OTXObjectDetection3DExporter from otx.core.model.detection_3d import OTX3DDetectionModel @@ -28,9 +22,8 @@ class MonoDETR3D(OTX3DDetectionModel): """OTX Detection model class for MonoDETR3D.""" - mean: tuple[float, float, float] = (0.485, 0.456, 0.406) - std: tuple[float, float, float] = (0.229, 0.224, 0.225) - input_size: tuple[int, int] = (384, 1280) # HxW + mean: tuple[float, float, float] = (123.675, 116.28, 103.53) + std: tuple[float, float, float] = (58.395, 57.12, 57.375) load_from: str | None = None def _build_model(self, num_classes: int) -> MonoDETR: @@ -62,73 +55,6 @@ def _build_model(self, num_classes: int) -> MonoDETR: init_box=False, ) - def _customize_inputs( - self, - entity: Det3DBatchDataEntity, - ) -> dict[str, Any]: - # prepare bboxes for the model - targets_list = [] - img_sizes = torch.from_numpy(np.array([img_info.ori_shape for img_info in 
entity.imgs_info])).to( - device=entity.images.device, - ) - key_list = ["labels", "boxes", "depth", "size_3d", "heading_angle", "boxes_3d"] - for bz in range(len(entity.imgs_info)): - target_dict = {} - for key in key_list: - target_dict[key] = getattr(entity, key)[bz] - targets_list.append(target_dict) - - return { - "images": entity.images, - "calibs": torch.cat([p2.unsqueeze(0) for p2 in entity.calib_matrix], dim=0), - "targets": targets_list, - "img_sizes": img_sizes, - "mode": "loss" if self.training else "predict", - } - - def _customize_outputs( - self, - outputs: dict[str, torch.Tensor], - inputs: Det3DBatchDataEntity, - ) -> Det3DBatchPredEntity | OTXBatchLossEntity: - if self.training: - if not isinstance(outputs, dict): - raise TypeError(outputs) - - losses = OTXBatchLossEntity() - for k, v in outputs.items(): - if isinstance(v, list): - losses[k] = sum(v) - elif isinstance(v, Tensor): - losses[k] = v - else: - msg = "Loss output should be list or torch.tensor but got {type(v)}" - raise TypeError(msg) - return losses - - labels, scores, size_3d, heading_angle, boxes_3d, depth = self.extract_dets_from_outputs(outputs) - # bbox 2d decoding - boxes_2d = box_cxcylrtb_to_xyxy(boxes_3d) - xywh_2d = box_convert(boxes_2d, "xyxy", "cxcywh") - # size 2d decoding - size_2d = xywh_2d[:, :, 2:4] - - return Det3DBatchPredEntity( - batch_size=inputs.batch_size, - images=inputs.images, - imgs_info=inputs.imgs_info, - calib_matrix=inputs.calib_matrix, - boxes=boxes_2d, - labels=labels, - boxes_3d=boxes_3d, - size_2d=size_2d, - size_3d=size_3d, - depth=depth, - heading_angle=heading_angle, - scores=scores, - original_kitti_format=[None], - ) - def configure_optimizers(self) -> tuple[list[torch.optim.Optimizer], list[dict[str, Any]]]: """Configure an optimizer and learning-rate schedulers. @@ -240,7 +166,7 @@ def _exporter(self) -> OTXModelExporter: "opset_version": 16, }, input_names=["images", "calib_matrix", "img_sizes"], - output_names=["scores", "boxes_3d", "size_3d", "heading_angle", "depth"], + output_names=["scores", "boxes_3d", "size_3d", "depth", "heading_angle"], ) @property diff --git a/src/otx/algo/samplers/balanced_sampler.py b/src/otx/algo/samplers/balanced_sampler.py index 287bbf1dcf4..4b6cfb56caa 100644 --- a/src/otx/algo/samplers/balanced_sampler.py +++ b/src/otx/algo/samplers/balanced_sampler.py @@ -65,7 +65,7 @@ def __init__( self.img_indices = {k: torch.tensor(v, dtype=torch.int64) for k, v in ann_stats.items() if len(v) > 0} self.num_cls = len(self.img_indices.keys()) self.data_length = len(self.dataset) - self.num_trials = int(self.data_length / self.num_cls) + self.num_trials = max(int(self.data_length / self.num_cls), 1) if efficient_mode: # Reduce the # of sampling (sampling data for a single epoch) diff --git a/src/otx/algo/segmentation/huggingface_model.py b/src/otx/algo/segmentation/huggingface_model.py index 428a489e1fa..dc7f7f9b242 100644 --- a/src/otx/algo/segmentation/huggingface_model.py +++ b/src/otx/algo/segmentation/huggingface_model.py @@ -162,4 +162,8 @@ def _exporter(self) -> OTXModelExporter: def forward_for_tracing(self, image: torch.Tensor) -> torch.Tensor | dict[str, torch.Tensor]: """Model forward function used for the model tracing during model exportation.""" + if self.explain_mode: + msg = "Explain mode is not supported for this model." 
+ raise NotImplementedError(msg) + return self.model(image) diff --git a/src/otx/algo/segmentation/litehrnet.py b/src/otx/algo/segmentation/litehrnet.py index 33269e04532..fd153877ccd 100644 --- a/src/otx/algo/segmentation/litehrnet.py +++ b/src/otx/algo/segmentation/litehrnet.py @@ -81,7 +81,7 @@ def _exporter(self) -> OTXModelExporter: swap_rgb=False, via_onnx=False, onnx_export_configuration={"operator_export_type": OperatorExportTypes.ONNX_ATEN_FALLBACK}, - output_names=None, + output_names=["preds", "feature_vector"] if self.explain_mode else None, ) @property diff --git a/src/otx/algo/segmentation/segmentors/base_model.py b/src/otx/algo/segmentation/segmentors/base_model.py index 9cad16b45ea..370ac795d73 100644 --- a/src/otx/algo/segmentation/segmentors/base_model.py +++ b/src/otx/algo/segmentation/segmentors/base_model.py @@ -10,6 +10,8 @@ import torch.nn.functional as f from torch import Tensor, nn +from otx.algo.explain.explain_algo import feature_vector_fn + if TYPE_CHECKING: from otx.core.data.entity.base import ImageInfo @@ -58,7 +60,7 @@ def forward( - If mode is "predict", returns the predicted outputs. - Otherwise, returns the model outputs after interpolation. """ - outputs = self.extract_features(inputs) + enc_feats, outputs = self.extract_features(inputs) outputs = f.interpolate(outputs, size=inputs.size()[2:], mode="bilinear", align_corners=True) if mode == "tensor": @@ -76,12 +78,19 @@ def forward( if mode == "predict": return outputs.argmax(dim=1) + if mode == "explain": + feature_vector = feature_vector_fn(enc_feats) + return { + "preds": outputs, + "feature_vector": feature_vector, + } + return outputs - def extract_features(self, inputs: Tensor) -> Tensor: + def extract_features(self, inputs: Tensor) -> tuple[Tensor, Tensor]: """Extract features from the backbone and head.""" enc_feats = self.backbone(inputs) - return self.decode_head(enc_feats) + return enc_feats, self.decode_head(enc_feats) def calculate_loss( self, diff --git a/src/otx/core/config/data.py b/src/otx/core/config/data.py index 5af3016e0e8..111e76e9261 100644 --- a/src/otx/core/config/data.py +++ b/src/otx/core/config/data.py @@ -29,6 +29,8 @@ class SubsetConfig: (`TransformLibType.MMCV`, `TransformLibType.MMPRETRAIN`, ...). transform_lib_type (TransformLibType): Transform library type used by this subset. num_workers (int): Number of workers for the dataloader of this subset. + sampler (SamplerConfig | None): Sampler configuration for the dataloader of this subset. + to_tv_image (bool): Whether to convert image to torch tensor. input_size (int | tuple[int, int] | None) : input size model expects. If $(input_size) exists in transforms, it will be replaced with this value. diff --git a/src/otx/core/config/hpo.py b/src/otx/core/config/hpo.py index 8d4dd085955..29695631ef8 100644 --- a/src/otx/core/config/hpo.py +++ b/src/otx/core/config/hpo.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from pathlib import Path # noqa: TCH003 -from typing import Any, Literal +from typing import Any, Callable, Literal import torch @@ -23,7 +23,12 @@ @dataclass class HpoConfig: - """DTO for HPO configuration.""" + """DTO for HPO configuration. + + progress_update_callback (Callable[[int | float], None] | None): + callback to update progress. If it's given, it's called with progress every second. + callbacks_to_exclude (list[str] | str | None): List of name of callbacks to exclude during HPO. 
+ """ search_space: dict[str, dict[str, Any]] | str | Path | None = None save_path: str | None = None @@ -40,3 +45,5 @@ class HpoConfig: asynchronous_sha: bool = num_workers > 1 metric_name: str | None = None adapt_bs_search_space_max_val: Literal["None", "Safe", "Full"] = "None" + progress_update_callback: Callable[[int | float], None] | None = None + callbacks_to_exclude: list[str] | str | None = None diff --git a/src/otx/core/data/dataset/object_detection_3d.py b/src/otx/core/data/dataset/object_detection_3d.py index 7e7f294c58b..06df0136392 100644 --- a/src/otx/core/data/dataset/object_detection_3d.py +++ b/src/otx/core/data/dataset/object_detection_3d.py @@ -3,8 +3,6 @@ # """Module for OTX3DObjectDetectionDataset.""" -# mypy: ignore-errors - from __future__ import annotations from copy import deepcopy @@ -12,12 +10,8 @@ from typing import TYPE_CHECKING, Any, Callable, List, Union import numpy as np -import torch from datumaro import Image -from PIL import Image as PILImage -from torchvision import tv_tensors -from otx.core.data.dataset.utils.kitti_utils import Calibration, affine_transform, angle2class, get_affine_transform from otx.core.data.entity.base import ImageInfo from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DDataEntity from otx.core.data.mem_cache import NULL_MEM_CACHE_HANDLER, MemCacheHandlerBase @@ -27,7 +21,7 @@ from .base import OTXDataset if TYPE_CHECKING: - from datumaro import Bbox, DatasetSubset + from datumaro import DatasetSubset Transforms = Union[Compose, Callable, List[Callable], dict[str, Compose | Callable | List[Callable]]] @@ -45,10 +39,8 @@ def __init__( max_refetch: int = 1000, image_color_channel: ImageColorChannel = ImageColorChannel.RGB, stack_images: bool = True, - to_tv_image: bool = True, + to_tv_image: bool = False, max_objects: int = 50, - depth_threshold: int = 65, - resolution: tuple[int, int] = (1280, 384), # (W, H) ) -> None: super().__init__( dm_subset, @@ -61,239 +53,56 @@ def __init__( to_tv_image, ) self.max_objects = max_objects - self.depth_threshold = depth_threshold - self.resolution = np.array(resolution) # TODO(Kirill): make it configurable self.subset_type = list(self.dm_subset.get_subset_info())[-1].split(":")[0] def _get_item_impl(self, index: int) -> Det3DDataEntity | None: entity = self.dm_subset[index] image = entity.media_as(Image) - image = self._get_img_data_and_shape(image)[0] - calib = Calibration(entity.attributes["calib_path"]) - original_kitti_format = None # don't use for training - if self.subset_type != "train": - # TODO (Kirill): remove this or duplication of the inputs - annotations_copy = deepcopy(entity.annotations) - original_kitti_format = [obj.attributes for obj in annotations_copy] - # decode original kitti format for metric calculation - for i, anno_dict in enumerate(original_kitti_format): - anno_dict["name"] = self.label_info.label_names[annotations_copy[i].label] - anno_dict["bbox"] = annotations_copy[i].points - dimension = anno_dict["dimensions"] - anno_dict["dimensions"] = [dimension[2], dimension[0], dimension[1]] - original_kitti_format = self._reformate_for_kitti_metric(original_kitti_format) - # decode labels for training - inputs, targets, ori_img_shape = self._decode_item( - PILImage.fromarray(image), - entity.annotations, - calib, - ) - # normilize image - inputs = self._apply_transforms(torch.as_tensor(inputs, dtype=torch.float32)) - return Det3DDataEntity( - image=inputs, + image, ori_img_shape = self._get_img_data_and_shape(image) + calib = 
self.get_calib_from_file(entity.attributes["calib_path"]) + annotations_copy = deepcopy(entity.annotations) + datumaro_kitti_format = [obj.attributes for obj in annotations_copy] + + # decode original kitti format for metric calculation + for i, anno_dict in enumerate(datumaro_kitti_format): + anno_dict["name"] = ( + self.label_info.label_names[annotations_copy[i].label] + if self.subset_type != "train" + else annotations_copy[i].label + ) + anno_dict["bbox"] = annotations_copy[i].points + dimension = anno_dict["dimensions"] + anno_dict["dimensions"] = [dimension[2], dimension[0], dimension[1]] + original_kitti_format = self._reformate_for_kitti_metric(datumaro_kitti_format) + + entity = Det3DDataEntity( + image=image, img_info=ImageInfo( img_idx=index, - img_shape=inputs.shape[1:], - ori_shape=ori_img_shape, # TODO(Kirill): curently we use WxH here, make it HxW + img_shape=ori_img_shape, + ori_shape=ori_img_shape, image_color_channel=self.image_color_channel, ignored_labels=[], ), - boxes=tv_tensors.BoundingBoxes( - targets["boxes"], - format=tv_tensors.BoundingBoxFormat.XYXY, - canvas_size=inputs.shape[1:], - dtype=torch.float32, - ), - labels=torch.as_tensor(targets["labels"], dtype=torch.long), - calib_matrix=torch.as_tensor(calib.P2, dtype=torch.float32), - boxes_3d=torch.as_tensor(targets["boxes_3d"], dtype=torch.float32), - size_2d=torch.as_tensor(targets["size_2d"], dtype=torch.float32), - size_3d=torch.as_tensor(targets["size_3d"], dtype=torch.float32), - depth=torch.as_tensor(targets["depth"], dtype=torch.float32), - heading_angle=torch.as_tensor( - np.concatenate([targets["heading_bin"], targets["heading_res"]], axis=1), - dtype=torch.float32, - ), + boxes=np.zeros((self.max_objects, 4), dtype=np.float32), + labels=np.zeros((self.max_objects), dtype=np.int8), + calib_matrix=calib, + boxes_3d=np.zeros((self.max_objects, 6), dtype=np.float32), + size_2d=np.zeros((self.max_objects, 2), dtype=np.float32), + size_3d=np.zeros((self.max_objects, 3), dtype=np.float32), + depth=np.zeros((self.max_objects, 1), dtype=np.float32), + heading_angle=np.zeros((self.max_objects, 2), dtype=np.float32), original_kitti_format=original_kitti_format, ) + return self._apply_transforms(entity) + @property def collate_fn(self) -> Callable: """Collection function to collect DetDataEntity into DetBatchDataEntity in data loader.""" return partial(Det3DBatchDataEntity.collate_fn, stack_images=self.stack_images) - def _decode_item(self, img: PILImage, annotations: list[Bbox], calib: Calibration) -> tuple: # noqa: C901 - """Decode item for training.""" - # data augmentation for image - img_size = np.array(img.size) - bbox2d = np.array([ann.points for ann in annotations]) - center = img_size / 2 - crop_size, crop_scale = img_size, 1 - random_flip_flag = False - # TODO(Kirill): add data augmentation for 3d, remove them from here. - if self.subset_type == "train": - if np.random.random() < 0.5: - random_flip_flag = True - img = img.transpose(PILImage.FLIP_LEFT_RIGHT) - - if np.random.random() < 0.5: - scale = 0.05 - shift = 0.05 - crop_scale = np.clip(np.random.randn() * scale + 1, 1 - scale, 1 + scale) - crop_size = img_size * crop_scale - center[0] += img_size[0] * np.clip(np.random.randn() * shift, -2 * shift, 2 * shift) - center[1] += img_size[1] * np.clip(np.random.randn() * shift, -2 * shift, 2 * shift) - - # add affine transformation for 2d images. 
- trans, trans_inv = get_affine_transform(center, crop_size, 0, self.resolution, inv=1) - img = img.transform( - tuple(self.resolution.tolist()), - method=PILImage.AFFINE, - data=tuple(trans_inv.reshape(-1).tolist()), - resample=PILImage.BILINEAR, - ) - img = np.array(img).astype(np.float32) - img = img.transpose(2, 0, 1) # C * H * W -> (384 * 1280) - # ============================ get labels ============================== - # data augmentation for labels - annotations_list: list[dict[str, Any]] = [ann.attributes for ann in annotations] - for i, obj in enumerate(annotations_list): - obj["label"] = annotations[i].label - obj["location"] = np.array(obj["location"]) - - if random_flip_flag: - for i in range(bbox2d.shape[0]): - [x1, _, x2, _] = bbox2d[i] - bbox2d[i][0], bbox2d[i][2] = img_size[0] - x2, img_size[0] - x1 - annotations_list[i]["alpha"] = np.pi - annotations_list[i]["alpha"] - annotations_list[i]["rotation_y"] = np.pi - annotations_list[i]["rotation_y"] - if annotations_list[i]["alpha"] > np.pi: - annotations_list[i]["alpha"] -= 2 * np.pi # check range - if annotations_list[i]["alpha"] < -np.pi: - annotations_list[i]["alpha"] += 2 * np.pi - if annotations_list[i]["rotation_y"] > np.pi: - annotations_list[i]["rotation_y"] -= 2 * np.pi - if annotations_list[i]["rotation_y"] < -np.pi: - annotations_list[i]["rotation_y"] += 2 * np.pi - - # labels encoding - mask_2d = np.zeros((self.max_objects), dtype=bool) - labels = np.zeros((self.max_objects), dtype=np.int8) - depth = np.zeros((self.max_objects, 1), dtype=np.float32) - heading_bin = np.zeros((self.max_objects, 1), dtype=np.int64) - heading_res = np.zeros((self.max_objects, 1), dtype=np.float32) - size_2d = np.zeros((self.max_objects, 2), dtype=np.float32) - size_3d = np.zeros((self.max_objects, 3), dtype=np.float32) - src_size_3d = np.zeros((self.max_objects, 3), dtype=np.float32) - boxes = np.zeros((self.max_objects, 4), dtype=np.float32) - boxes_3d = np.zeros((self.max_objects, 6), dtype=np.float32) - - object_num = len(annotations) if len(annotations) < self.max_objects else self.max_objects - for i in range(object_num): - cur_obj = annotations_list[i] - # ignore the samples beyond the threshold [hard encoding] - if cur_obj["location"][-1] > self.depth_threshold and cur_obj["location"][-1] < 2: - continue - - # process 2d bbox & get 2d center - bbox_2d = bbox2d[i].copy() - - # add affine transformation for 2d boxes. 
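# --- Sketch: the 2D point/box warping pattern shared by the removed _decode_item
# and the affine_transform helper kept in the new torchvision transform: a 2x3
# affine matrix is applied to each box corner in homogeneous coordinates.
# The matrix values below are illustrative only.
import numpy as np

def affine_transform_pt(pt: np.ndarray, t: np.ndarray) -> np.ndarray:
    """Apply a 2x3 affine matrix t to one 2D point."""
    hom = np.array([pt[0], pt[1], 1.0], dtype=np.float32)
    return (t @ hom)[:2]

# e.g. a plain resize from a 1242x375 KITTI frame to a 1280x384 input
trans = np.array([[1280 / 1242, 0.0, 0.0],
                  [0.0, 384 / 375, 0.0]], dtype=np.float32)
bbox_xyxy = np.array([100.0, 120.0, 300.0, 240.0], dtype=np.float32)
warped = np.concatenate([affine_transform_pt(bbox_xyxy[:2], trans),
                         affine_transform_pt(bbox_xyxy[2:], trans)])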
- bbox_2d[:2] = affine_transform(bbox_2d[:2], trans) - bbox_2d[2:] = affine_transform(bbox_2d[2:], trans) - - # process 3d center - center_2d = np.array( - [(bbox_2d[0] + bbox_2d[2]) / 2, (bbox_2d[1] + bbox_2d[3]) / 2], - dtype=np.float32, - ) # W * H - corner_2d = bbox_2d.copy() - - center_3d = np.array( - cur_obj["location"] - + [ - 0, - -cur_obj["dimensions"][0] / 2, - 0, - ], - ) # real 3D center in 3D space - center_3d = center_3d.reshape(-1, 3) # shape adjustment (N, 3) - center_3d, _ = calib.rect_to_img(center_3d) # project 3D center to image plane - center_3d = center_3d[0] # shape adjustment - if random_flip_flag: # random flip for center3d - center_3d[0] = img_size[0] - center_3d[0] - center_3d = affine_transform(center_3d.reshape(-1), trans) - - # filter 3d center out of img - proj_inside_img = True - - if center_3d[0] < 0 or center_3d[0] >= self.resolution[0]: - proj_inside_img = False - if center_3d[1] < 0 or center_3d[1] >= self.resolution[1]: - proj_inside_img = False - - if not proj_inside_img: - continue - - # class - labels[i] = cur_obj["label"] - - # encoding 2d/3d boxes - w, h = bbox_2d[2] - bbox_2d[0], bbox_2d[3] - bbox_2d[1] - size_2d[i] = 1.0 * w, 1.0 * h - - center_2d_norm = center_2d / self.resolution - size_2d_norm = size_2d[i] / self.resolution - - corner_2d_norm = corner_2d - corner_2d_norm[0:2] = corner_2d[0:2] / self.resolution - corner_2d_norm[2:4] = corner_2d[2:4] / self.resolution - center_3d_norm = center_3d / self.resolution - - k, r = center_3d_norm[0] - corner_2d_norm[0], corner_2d_norm[2] - center_3d_norm[0] - t, b = center_3d_norm[1] - corner_2d_norm[1], corner_2d_norm[3] - center_3d_norm[1] - - if k < 0 or r < 0 or t < 0 or b < 0: - continue - - boxes[i] = center_2d_norm[0], center_2d_norm[1], size_2d_norm[0], size_2d_norm[1] - boxes_3d[i] = center_3d_norm[0], center_3d_norm[1], k, r, t, b - - # encoding depth - depth[i] = cur_obj["location"][-1] * crop_scale - - # encoding heading angle - heading_angle = calib.ry2alpha(cur_obj["rotation_y"], (bbox2d[i][0] + bbox2d[i][2]) / 2) - if heading_angle > np.pi: - heading_angle -= 2 * np.pi # check range - if heading_angle < -np.pi: - heading_angle += 2 * np.pi - heading_bin[i], heading_res[i] = angle2class(heading_angle) - - # encoding size_3d - src_size_3d[i] = np.array([cur_obj["dimensions"]], dtype=np.float32) - size_3d[i] = src_size_3d[i] - - # filter out the samples with truncated or occluded - if cur_obj["truncated"] <= 0.5 and cur_obj["occluded"] <= 2: - mask_2d[i] = 1 - - # collect return data - targets_for_train = { - "labels": labels[mask_2d], - "boxes": boxes[mask_2d], - "boxes_3d": boxes_3d[mask_2d], - "depth": depth[mask_2d], - "size_2d": size_2d[mask_2d], - "size_3d": size_3d[mask_2d], - "heading_bin": heading_bin[mask_2d], - "heading_res": heading_res[mask_2d], - } - - return img, targets_for_train, img_size - - def _reformate_for_kitti_metric(self, annotations: dict[str, Any]) -> dict[str, np.array]: + def _reformate_for_kitti_metric(self, annotations: list[Any]) -> dict[str, np.array]: """Reformat the annotation for KITTI metric.""" return { "name": np.array([obj["name"] for obj in annotations]), @@ -305,3 +114,13 @@ def _reformate_for_kitti_metric(self, annotations: dict[str, Any]) -> dict[str, "occluded": np.array([obj["occluded"] for obj in annotations]), "truncated": np.array([obj["truncated"] for obj in annotations]), } + + @staticmethod + def get_calib_from_file(calib_file: str) -> np.ndarray: + """Get calibration matrix from txt file (KITTI format).""" + with open(calib_file) as f: # 
noqa: PTH123 + lines = f.readlines() + + obj = lines[2].strip().split(" ")[1:] + + return np.array(obj, dtype=np.float32).reshape(3, 4) diff --git a/src/otx/core/data/dataset/segmentation.py b/src/otx/core/data/dataset/segmentation.py index c0a976b8e40..90cb166c3c3 100644 --- a/src/otx/core/data/dataset/segmentation.py +++ b/src/otx/core/data/dataset/segmentation.py @@ -99,7 +99,7 @@ def _extract_class_mask(item: DatasetItem, img_shape: tuple[int, int], ignore_in msg = "It is not currently support an ignore index which is more than 255." raise ValueError(msg, ignore_index) - # fill mask with background label if we have Polygon/Ellipse annotations + # fill mask with background label if we have Polygon/Ellipse/Bbox annotations fill_value = 0 if isinstance(item.annotations[0], (Ellipse, Polygon, Bbox, RotatedBbox)) else ignore_index class_mask = np.full(shape=img_shape[:2], fill_value=fill_value, dtype=np.uint8) @@ -180,9 +180,9 @@ def __init__( to_tv_image, ) - if self.has_polygons and "background" not in [label_name.lower() for label_name in self.label_info.label_names]: + if self.has_polygons: # insert background class at index 0 since polygons represent only objects - self.label_info.label_names.insert(0, "background") + self.label_info.label_names.insert(0, "otx_background_lbl") self.label_info = SegLabelInfo( label_names=self.label_info.label_names, diff --git a/src/otx/core/data/dataset/utils/kitti_utils.py b/src/otx/core/data/dataset/utils/kitti_utils.py deleted file mode 100644 index 1ee16c41733..00000000000 --- a/src/otx/core/data/dataset/utils/kitti_utils.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# -"""Module defines utils for KITTI Dataset.""" - -# flake8: noqa -# mypy: ignore-errors - -import cv2 -import numpy as np - - -def get_calib_from_file(calib_file): - with open(calib_file) as f: - lines = f.readlines() - - obj = lines[2].strip().split(" ")[1:] - P2 = np.array(obj, dtype=np.float32) - obj = lines[3].strip().split(" ")[1:] - P3 = np.array(obj, dtype=np.float32) - obj = lines[4].strip().split(" ")[1:] - R0 = np.array(obj, dtype=np.float32) - obj = lines[5].strip().split(" ")[1:] - Tr_velo_to_cam = np.array(obj, dtype=np.float32) - - return { - "P2": P2.reshape(3, 4), - "P3": P3.reshape(3, 4), - "R0": R0.reshape(3, 3), - "Tr_velo2cam": Tr_velo_to_cam.reshape(3, 4), - } - - -class Calibration: - def __init__(self, calib_file): - if isinstance(calib_file, str): - calib = get_calib_from_file(calib_file) - else: - calib = calib_file - - self.P2 = calib["P2"] # 3 x 4 - self.R0 = calib["R0"] # 3 x 3 - self.V2C = calib["Tr_velo2cam"] # 3 x 4 - self.C2V = self.inverse_rigid_trans(self.V2C) - - # Camera intrinsics and extrinsics - self.cu = self.P2[0, 2] - self.cv = self.P2[1, 2] - self.fu = self.P2[0, 0] - self.fv = self.P2[1, 1] - self.tx = self.P2[0, 3] / (-self.fu) - self.ty = self.P2[1, 3] / (-self.fv) - - def cart_to_hom(self, pts): - """:param pts: (N, 3 or 2) - :return pts_hom: (N, 4 or 3) - """ - pts_hom = np.hstack((pts, np.ones((pts.shape[0], 1), dtype=np.float32))) - return pts_hom - - def lidar_to_rect(self, pts_lidar): - """:param pts_lidar: (N, 3) - :return pts_rect: (N, 3) - """ - pts_lidar_hom = self.cart_to_hom(pts_lidar) - pts_rect = np.dot(pts_lidar_hom, np.dot(self.V2C.T, self.R0.T)) - # pts_rect = reduce(np.dot, (pts_lidar_hom, self.V2C.T, self.R0.T)) - return pts_rect - - def rect_to_lidar(self, pts_rect): - pts_ref = np.transpose(np.dot(np.linalg.inv(self.R0), np.transpose(pts_rect))) - 
pts_ref = self.cart_to_hom(pts_ref) # nx4 - return np.dot(pts_ref, np.transpose(self.C2V)) - - def rect_to_img(self, pts_rect): - """:param pts_rect: (N, 3) - :return pts_img: (N, 2) - """ - pts_rect_hom = self.cart_to_hom(pts_rect) - pts_2d_hom = np.dot(pts_rect_hom, self.P2.T) - pts_img = (pts_2d_hom[:, 0:2].T / pts_rect_hom[:, 2]).T # (N, 2) - pts_rect_depth = pts_2d_hom[:, 2] - self.P2.T[3, 2] # depth in rect camera coord - return pts_img, pts_rect_depth - - def lidar_to_img(self, pts_lidar): - """:param pts_lidar: (N, 3) - :return pts_img: (N, 2) - """ - pts_rect = self.lidar_to_rect(pts_lidar) - pts_img, pts_depth = self.rect_to_img(pts_rect) - return pts_img, pts_depth - - def img_to_rect(self, u, v, depth_rect): - """:param u: (N) - :param v: (N) - :param depth_rect: (N) - :return: - """ - x = ((u - self.cu) * depth_rect) / self.fu + self.tx - y = ((v - self.cv) * depth_rect) / self.fv + self.ty - pts_rect = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), depth_rect.reshape(-1, 1)), axis=1) - return pts_rect - - def depthmap_to_rect(self, depth_map): - """:param depth_map: (H, W), depth_map - :return: - """ - x_range = np.arange(0, depth_map.shape[1]) - y_range = np.arange(0, depth_map.shape[0]) - x_idxs, y_idxs = np.meshgrid(x_range, y_range) - x_idxs, y_idxs = x_idxs.reshape(-1), y_idxs.reshape(-1) - depth = depth_map[y_idxs, x_idxs] - pts_rect = self.img_to_rect(x_idxs, y_idxs, depth) - return pts_rect, x_idxs, y_idxs - - def corners3d_to_img_boxes(self, corners3d): - """:param corners3d: (N, 8, 3) corners in rect coordinate - :return: boxes: (None, 4) [x1, y1, x2, y2] in rgb coordinate - :return: boxes_corner: (None, 8) [xi, yi] in rgb coordinate - """ - sample_num = corners3d.shape[0] - corners3d_hom = np.concatenate((corners3d, np.ones((sample_num, 8, 1))), axis=2) # (N, 8, 4) - - img_pts = np.matmul(corners3d_hom, self.P2.T) # (N, 8, 3) - - x, y = img_pts[:, :, 0] / img_pts[:, :, 2], img_pts[:, :, 1] / img_pts[:, :, 2] - x1, y1 = np.min(x, axis=1), np.min(y, axis=1) - x2, y2 = np.max(x, axis=1), np.max(y, axis=1) - - boxes = np.concatenate((x1.reshape(-1, 1), y1.reshape(-1, 1), x2.reshape(-1, 1), y2.reshape(-1, 1)), axis=1) - boxes_corner = np.concatenate((x.reshape(-1, 8, 1), y.reshape(-1, 8, 1)), axis=2) - - return boxes, boxes_corner - - def camera_dis_to_rect(self, u, v, d): - """Can only process valid u, v, d, which means u, v can not beyond the image shape, reprojection error 0.02 - :param u: (N) - :param v: (N) - :param d: (N), the distance between camera and 3d points, d^2 = x^2 + y^2 + z^2 - :return: - """ - assert self.fu == self.fv, "%.8f != %.8f" % (self.fu, self.fv) - fd = np.sqrt((u - self.cu) ** 2 + (v - self.cv) ** 2 + self.fu**2) - x = ((u - self.cu) * d) / fd + self.tx - y = ((v - self.cv) * d) / fd + self.ty - z = np.sqrt(d**2 - x**2 - y**2) - pts_rect = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), z.reshape(-1, 1)), axis=1) - return pts_rect - - def inverse_rigid_trans(self, Tr): - """Inverse a rigid body transform matrix (3x4 as [R|t]) - [R'|-R't; 0|1] - """ - inv_Tr = np.zeros_like(Tr) # 3x4 - inv_Tr[0:3, 0:3] = np.transpose(Tr[0:3, 0:3]) - inv_Tr[0:3, 3] = np.dot(-np.transpose(Tr[0:3, 0:3]), Tr[0:3, 3]) - return inv_Tr - - def alpha2ry(self, alpha, u): - """Get rotation_y by alpha + theta - 180 - alpha : Observation angle of object, ranging [-pi..pi] - x : Object center x to the camera center (x-W/2), in pixels - rotation_y : Rotation ry around Y-axis in camera coordinates [-pi..pi] - """ - ry = alpha + np.arctan2(u - self.cu, self.fu) - - 
if ry > np.pi: - ry -= 2 * np.pi - if ry < -np.pi: - ry += 2 * np.pi - - return ry - - def ry2alpha(self, ry, u): - alpha = ry - np.arctan2(u - self.cu, self.fu) - - if alpha > np.pi: - alpha -= 2 * np.pi - if alpha < -np.pi: - alpha += 2 * np.pi - - return alpha - - def flip(self, img_size): - wsize = 4 - hsize = 2 - p2ds = ( - np.concatenate( - [ - np.expand_dims(np.tile(np.expand_dims(np.linspace(0, img_size[0], wsize), 0), [hsize, 1]), -1), - np.expand_dims(np.tile(np.expand_dims(np.linspace(0, img_size[1], hsize), 1), [1, wsize]), -1), - np.linspace(2, 78, wsize * hsize).reshape(hsize, wsize, 1), - ], - -1, - ) - ).reshape(-1, 3) - p3ds = self.img_to_rect(p2ds[:, 0:1], p2ds[:, 1:2], p2ds[:, 2:3]) - p3ds[:, 0] *= -1 - p2ds[:, 0] = img_size[0] - p2ds[:, 0] - - # self.P2[0,3] *= -1 - cos_matrix = np.zeros([wsize * hsize, 2, 7]) - cos_matrix[:, 0, 0] = p3ds[:, 0] - cos_matrix[:, 0, 1] = cos_matrix[:, 1, 2] = p3ds[:, 2] - cos_matrix[:, 1, 0] = p3ds[:, 1] - cos_matrix[:, 0, 3] = cos_matrix[:, 1, 4] = 1 - cos_matrix[:, :, -2] = -p2ds[:, :2] - cos_matrix[:, :, -1] = -p2ds[:, :2] * p3ds[:, 2:3] - new_calib = np.linalg.svd(cos_matrix.reshape(-1, 7))[-1][-1] - new_calib /= new_calib[-1] - - new_calib_matrix = np.zeros([4, 3]).astype(np.float32) - new_calib_matrix[0, 0] = new_calib_matrix[1, 1] = new_calib[0] - new_calib_matrix[2, 0:2] = new_calib[1:3] - new_calib_matrix[3, :] = new_calib[3:6] - new_calib_matrix[-1, -1] = self.P2[-1, -1] - self.P2 = new_calib_matrix.T - self.cu = self.P2[0, 2] - self.cv = self.P2[1, 2] - self.fu = self.P2[0, 0] - self.fv = self.P2[1, 1] - self.tx = self.P2[0, 3] / (-self.fu) - self.ty = self.P2[1, 3] / (-self.fv) - - -def get_dir(src_point, rot_rad): - sn, cs = np.sin(rot_rad), np.cos(rot_rad) - - src_result = [0, 0] - src_result[0] = src_point[0] * cs - src_point[1] * sn - src_result[1] = src_point[0] * sn + src_point[1] * cs - - return src_result - - -def get_3rd_point(a, b): - direct = a - b - return b + np.array([-direct[1], direct[0]], dtype=np.float32) - - -def get_affine_transform(center, scale, rot, output_size, shift=np.array([0, 0], dtype=np.float32), inv=0): - if not isinstance(scale, np.ndarray) and not isinstance(scale, list): - scale = np.array([scale, scale], dtype=np.float32) - - scale_tmp = scale - src_w = scale_tmp[0] - dst_w = output_size[0] - dst_h = output_size[1] - - rot_rad = np.pi * rot / 180 - src_dir = get_dir([0, src_w * -0.5], rot_rad) - dst_dir = np.array([0, dst_w * -0.5], np.float32) - - src = np.zeros((3, 2), dtype=np.float32) - dst = np.zeros((3, 2), dtype=np.float32) - src[0, :] = center + scale_tmp * shift - src[1, :] = center + src_dir + scale_tmp * shift - dst[0, :] = [dst_w * 0.5, dst_h * 0.5] - dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5], np.float32) + dst_dir - - src[2:, :] = get_3rd_point(src[0, :], src[1, :]) - dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) - - if inv: - trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) - trans_inv = cv2.getAffineTransform(np.float32(dst), np.float32(src)) - return trans, trans_inv - else: - trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) - return trans - - -def affine_transform(pt, t): - new_pt = np.array([pt[0], pt[1], 1.0], dtype=np.float32).T - new_pt = np.dot(t, new_pt) - return new_pt[:2] - - -def angle2class(angle): - """Convert continuous angle to discrete class and residual.""" - num_heading_bin = 12 - angle = angle % (2 * np.pi) - assert angle >= 0 and angle <= 2 * np.pi - angle_per_class = 2 * np.pi / float(num_heading_bin) - shifted_angle 
= (angle + angle_per_class / 2) % (2 * np.pi) - class_id = int(shifted_angle / angle_per_class) - residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) - return class_id, residual_angle - - -def class2angle(cls: int, residual: float, to_label_format: bool = False) -> float: - """Inverse function to angle2class.""" - num_heading_bin = 12 - angle_per_class = 2 * np.pi / float(num_heading_bin) - angle_center = cls * angle_per_class - angle = angle_center + residual - if to_label_format and angle > np.pi: - angle = angle - 2 * np.pi - return angle diff --git a/src/otx/core/data/entity/object_detection_3d.py b/src/otx/core/data/entity/object_detection_3d.py index 564ea283a60..17d5d2687b8 100644 --- a/src/otx/core/data/entity/object_detection_3d.py +++ b/src/otx/core/data/entity/object_detection_3d.py @@ -20,17 +20,25 @@ from otx.core.types.task import OTXTaskType if TYPE_CHECKING: + from numpy import ndarray from torch import LongTensor, Tensor @register_pytree_node @dataclass class Det3DDataEntity(OTXDataEntity): - """Data entity for detection task. + """Data entity for 3d object detection task. + + : param boxes (tv_tensors.BoundingBoxes): The bounding boxes for the objects in the image. + : param calib_matrix (Tensor): The calibration matrix for the 3D object detection. + : param boxes_3d (Tensor): The 3D bounding boxes for the objects. + : param size_2d (Tensor): The 2D size of the objects. + : param size_3d (Tensor): The 3D size of the objects. + : param depth (Tensor): The depth of the objects. + : param heading_angle (Tensor): The heading angle of the objects. + : param labels (LongTensor): The labels of the objects. + : param original_kitti_format (list[dict[str, Any]] | None): The original KITTI format of the objects, if available. - :param bboxes: Bbox annotations as top-left-bottom-right - (x1, y1, x2, y2) format with absolute coordinate values - :param labels: Bbox labels as integer indices """ @property @@ -38,30 +46,37 @@ def task(self) -> OTXTaskType: """OTX Task type definition.""" return OTXTaskType.OBJECT_DETECTION_3D - boxes: tv_tensors.BoundingBoxes - calib_matrix: Tensor - boxes_3d: Tensor - size_2d: Tensor - size_3d: Tensor - depth: Tensor - heading_angle: Tensor - labels: LongTensor - original_kitti_format: list[dict[str, Any]] | None + boxes: tv_tensors.BoundingBoxes | ndarray + calib_matrix: Tensor | ndarray + boxes_3d: Tensor | ndarray + size_2d: Tensor | ndarray + size_3d: Tensor | ndarray + depth: Tensor | ndarray + heading_angle: Tensor | ndarray + labels: LongTensor | ndarray + original_kitti_format: dict[str, Any] | None @dataclass class Det3DPredEntity(OTXPredEntity, Det3DDataEntity): - """Data entity to represent the detection model output prediction.""" + """Data entity to represent the 3d object detection model output prediction.""" @dataclass class Det3DBatchDataEntity(OTXBatchDataEntity[Det3DDataEntity]): - """Data entity for detection task. - - :param bboxes: A list of bbox annotations as top-left-bottom-right - (x1, y1, x2, y2) format with absolute coordinate values - :param labels: A list of bbox labels as integer indices - """ # TODO(Kirill): UPDATE! + """Data entity for 3d object detection task. + + : param boxes list[tv_tensors.BoundingBoxes]: The bounding boxes for the objects in the image. + : param calib_matrix list[Tensor]: The calibration matrix for the 3D object detection. + : param boxes_3d list[Tensor]: The 3D bounding boxes for the objects. + : param size_2d list[Tensor]: The 2D size of the objects. 
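# --- Sketch: the 12-bin heading-angle encoding used by both the removed
# kitti_utils.angle2class/class2angle pair and the new static angle2class;
# class2angle is reproduced here only to show the round trip, and the toy
# value at the end is mine.
import numpy as np

NUM_HEADING_BIN = 12

def angle2class(angle: float) -> tuple[int, float]:
    """Continuous heading angle -> (bin id, residual) over 12 bins of 2*pi."""
    angle = angle % (2 * np.pi)
    angle_per_class = 2 * np.pi / NUM_HEADING_BIN
    shifted = (angle + angle_per_class / 2) % (2 * np.pi)
    class_id = int(shifted / angle_per_class)
    residual = shifted - (class_id * angle_per_class + angle_per_class / 2)
    return class_id, residual

def class2angle(cls_id: int, residual: float, to_label_format: bool = False) -> float:
    """Inverse of angle2class."""
    angle_per_class = 2 * np.pi / NUM_HEADING_BIN
    angle = cls_id * angle_per_class + residual
    if to_label_format and angle > np.pi:
        angle -= 2 * np.pi
    return angle

cls_id, res = angle2class(-1.2)
assert abs(class2angle(cls_id, res, to_label_format=True) - (-1.2)) < 1e-6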
+ : param size_3d list[Tensor]: The 3D size of the objects. + : param depth list[Tensor]: The depth of the objects. + : param heading_angle list[Tensor]: The heading angle of the objects. + : param labels list[LongTensor]: The labels of the objects. + : param original_kitti_format list[list[dict[str, Any]] | None]: The original KITTI format of the objects, + if available. Needed for validation and KITTI metric. + """ images: Tensor boxes: list[tv_tensors.BoundingBoxes] @@ -72,7 +87,7 @@ class Det3DBatchDataEntity(OTXBatchDataEntity[Det3DDataEntity]): depth: list[Tensor] heading_angle: list[Tensor] labels: list[LongTensor] - original_kitti_format: list[list[dict[str, Any]] | None] + original_kitti_format: list[dict[str, Any] | None] @property def task(self) -> OTXTaskType: @@ -135,7 +150,7 @@ def pin_memory(self) -> Det3DBatchDataEntity: @dataclass class Det3DBatchPredEntity(OTXBatchPredEntity, Det3DBatchDataEntity): - """Data entity to represent model output predictions for detection task.""" + """Data entity to represent model output predictions for 3d object detection task.""" boxes: tv_tensors.BoundingBoxes scores: Tensor diff --git a/src/otx/core/data/entity/utils.py b/src/otx/core/data/entity/utils.py index 446f91eeee0..fe876d54672 100644 --- a/src/otx/core/data/entity/utils.py +++ b/src/otx/core/data/entity/utils.py @@ -34,7 +34,7 @@ class MulticlassClsDataEntity(OTXDataEntity): """ flatten_fn = lambda obj: (list(obj.values()), list(obj.keys())) unflatten_fn = lambda values, context: cls(**dict(zip(context, values))) - pytree._register_pytree_node( # noqa: SLF001 + pytree.register_pytree_node( cls, flatten_fn=flatten_fn, unflatten_fn=unflatten_fn, diff --git a/src/otx/core/data/pre_filtering.py b/src/otx/core/data/pre_filtering.py index a3924c86752..13fc08c7ebc 100644 --- a/src/otx/core/data/pre_filtering.py +++ b/src/otx/core/data/pre_filtering.py @@ -105,5 +105,4 @@ def remove_unused_labels( mapping = {original_categories[idx]: original_categories[idx] for idx in used_labels} msg = "There are unused labels in dataset, they will be filtered out before training." warnings.warn(msg, stacklevel=2) - return dataset.transform("remap_labels", mapping=mapping, default="delete") diff --git a/src/otx/core/data/transform_libs/torchvision.py b/src/otx/core/data/transform_libs/torchvision.py index 1c77ca2eb8e..1a6da801156 100644 --- a/src/otx/core/data/transform_libs/torchvision.py +++ b/src/otx/core/data/transform_libs/torchvision.py @@ -3406,7 +3406,7 @@ def _get_warp_image( ) -> torch.Tensor: numpy_image: np.ndarray = to_np_image(image) warped_image = cv2.warpAffine(numpy_image, warp_mat, warp_size, flags=cv2.INTER_LINEAR) - return torch.from_numpy(warped_image).permute(2, 0, 1) + return torch.from_numpy(warped_image).to(dtype=torch.float32).permute(2, 0, 1) def __call__(self, *_inputs: T_OTXDataEntity) -> T_OTXDataEntity | None: """Transform function to affine image through warp matrix.""" @@ -3449,6 +3449,335 @@ def __repr__(self) -> str: return repr_str +class Decode3DInputsAffineTransforms(TopdownAffine): + """Transform function for 3D Object Detection to affine image through warp matrix. + + This transform decode the input annotations and apply affine transforms. + + Args: + input_size (tuple[int, int]): Input image size. + random_horizontal_flip (bool): Randomly flip the image horizontally. + random_crop (bool): Randomly crop the image. + decode_annotations (bool): Whether to decode the annotations. + p_crop (float): Probability of cropping. 
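# --- Sketch: the clipped Gaussian crop jitter applied by this transform when
# random_crop fires (with probability p_crop). The clipping ranges follow the
# random_scale / random_shift parameters of the transform; the function name
# and rng handling are mine.
import numpy as np

def jitter_crop_params(ori_size_wh, scale: float = 0.05, shift: float = 0.05, rng=None):
    """Return (center, crop_size, crop_scale) for one randomly jittered crop."""
    rng = np.random.default_rng() if rng is None else rng
    ori = np.asarray(ori_size_wh, dtype=np.float32)  # (W, H)
    center = ori / 2
    crop_scale = float(np.clip(rng.standard_normal() * scale + 1, 1 - scale, 1 + scale))
    crop_size = ori * crop_scale
    center[0] += ori[0] * np.clip(rng.standard_normal() * shift, -2 * shift, 2 * shift)
    center[1] += ori[1] * np.clip(rng.standard_normal() * shift, -2 * shift, 2 * shift)
    return center, crop_size, crop_scale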
+ random_scale (float): Randomly scale the image. + random_shift (float): Randomly shift the image. + depth_threshold (int): Threshold of depth. + max_objects (int): Maximum number of objects. + """ + + def __init__( + self, + input_size: tuple[int, int] | None = None, # (H, W), + random_horizontal_flip: bool = False, + random_crop: bool = False, + decode_annotations: bool = True, + p_crop: float = 0.5, + p_flip: float = 0.5, + random_scale: float = 0.05, + random_shift: float = 0.05, + depth_threshold: int = 65, + max_objects: int = 50, + ) -> None: + self.input_size = input_size # type: ignore[assignment] + self.random_horizontal_flip = random_horizontal_flip + self.random_crop = random_crop + self.decode_annotations = decode_annotations + self.p_crop = p_crop + self.p_flip = p_flip + self.random_scale = random_scale + self.random_shift = random_shift + self.depth_threshold = depth_threshold + self.max_objects = max_objects + + def _affine_transforms( + self, + image: np.ndarray, + ori_img_size: np.ndarray, + warp_size: tuple[int, int], + ) -> tuple[np.ndarray, np.ndarray, np.ndarray, bool]: + """Get affine matrix and warp image. + + Args: + image (np.ndarray): Input image. + ori_img_size (np.ndarray): Original image size. + warp_size (tuple[int, int]): Output image size. + + Returns: + tuple[np.ndarray, np.ndarray, np.ndarray, bool]: + Affine matrix, warped image, and random flip flag. + """ + center = ori_img_size / 2 + crop_size, crop_scale = ori_img_size, 1 + random_flip_flag = False + if self.random_crop and (np.random.random() <= self.p_crop): + crop_scale = np.clip( + np.random.randn() * self.random_scale + 1, + 1 - self.random_scale, + 1 + self.random_scale, + ) + crop_size = ori_img_size * crop_scale + center[0] += ori_img_size[0] * np.clip( + np.random.randn() * self.random_shift, + -2 * self.random_shift, + 2 * self.random_shift, + ) + center[1] += ori_img_size[1] * np.clip( + np.random.randn() * self.random_shift, + -2 * self.random_shift, + 2 * self.random_shift, + ) + + if self.random_horizontal_flip and (np.random.random() <= self.p_flip): + random_flip_flag = True + image = np.fliplr(image) + + trans = self._get_warp_matrix(center, crop_size, 0, warp_size) + return self._get_warp_image(image, trans, warp_size), crop_scale, trans, random_flip_flag + + def __call__(self, *_inputs: T_OTXDataEntity) -> T_OTXDataEntity | None: + """Transform __call__ function to affine image through warp matrix.""" + inputs = _inputs[0] + ori_img_size = np.array(inputs.img_info.ori_shape)[::-1] + # labels encoding + src_size_3d = np.zeros((self.max_objects, 3), dtype=np.float32) + mask_2d = np.zeros((self.max_objects), dtype=bool) + if self.input_size is None: + # No need to resize (OV IR) + inputs.img_info.img_shape = ori_img_size + return self.convert(inputs, mask_2d, image_to_tensor=True) + + annotations_list = inputs.original_kitti_format + h, w = self.input_size + warp_size = (int(w), int(h)) + # transform image + inputs.image, crop_scale, trans, random_flip_flag = self._affine_transforms( + inputs.image, + ori_img_size, + warp_size, + ) + + if not self.decode_annotations: + # resize only (val/test) + inputs.img_info.img_shape = self.input_size + return self.convert(inputs, mask_2d) + + # decode annotations + if random_flip_flag: + for i in range(len(annotations_list["bbox"])): + [x1, _, x2, _] = annotations_list["bbox"][i] + annotations_list["bbox"][i][0], annotations_list["bbox"][i][2] = ( + ori_img_size[0] - x2, + ori_img_size[0] - x1, + ) + annotations_list["alpha"][i] = np.pi - 
annotations_list["alpha"][i] + annotations_list["rotation_y"][i] = np.pi - annotations_list["rotation_y"][i] + if annotations_list["alpha"][i] > np.pi: + annotations_list["alpha"][i] -= 2 * np.pi # check range + if annotations_list["alpha"][i] < -np.pi: + annotations_list["alpha"][i] += 2 * np.pi + if annotations_list["rotation_y"][i] > np.pi: + annotations_list["rotation_y"][i] -= 2 * np.pi + if annotations_list["rotation_y"][i] < -np.pi: + annotations_list["rotation_y"][i] += 2 * np.pi + + object_num = ( + len(annotations_list["bbox"]) if len(annotations_list["bbox"]) < self.max_objects else self.max_objects + ) + for i in range(object_num): + # ignore the samples beyond the threshold [hard encoding] + if annotations_list["location"][i][-1] > self.depth_threshold and annotations_list["location"][i][-1] < 2: + continue + + # process 2d bbox & get 2d center + bbox_2d = annotations_list["bbox"][i].copy() + + # add affine transformation for 2d boxes. + bbox_2d[:2] = self.affine_transform(bbox_2d[:2], trans) + bbox_2d[2:] = self.affine_transform(bbox_2d[2:], trans) + + # process 3d center + center_2d = np.array( + [(bbox_2d[0] + bbox_2d[2]) / 2, (bbox_2d[1] + bbox_2d[3]) / 2], + dtype=np.float32, + ) # W * H + corner_2d = bbox_2d.copy() + + center_3d = np.array( + annotations_list["location"][i] + + [ + 0, + -annotations_list["dimensions"][i][1] / 2, + 0, + ], + ) # real 3D center in 3D space + center_3d = center_3d.reshape(-1, 3) # shape adjustment (N, 3) + center_3d, _ = self.rect_to_img(inputs.calib_matrix, center_3d) # project 3D center to image plane + center_3d = center_3d[0] # shape adjustment + if random_flip_flag: # random flip for center3d + center_3d[0] = ori_img_size[0] - center_3d[0] + center_3d = self.affine_transform(center_3d.reshape(-1), trans) + + # filter 3d center out of img + proj_inside_img = True + + if center_3d[0] < 0 or center_3d[0] >= warp_size[0]: + proj_inside_img = False + if center_3d[1] < 0 or center_3d[1] >= warp_size[1]: + proj_inside_img = False + + if not proj_inside_img: + continue + + # class + inputs.labels[i] = annotations_list["name"][i] + + # encoding 2d/3d boxes + w, h = bbox_2d[2] - bbox_2d[0], bbox_2d[3] - bbox_2d[1] + inputs.size_2d[i] = 1.0 * w, 1.0 * h + + center_2d_norm = center_2d / warp_size + size_2d_norm = inputs.size_2d[i] / warp_size + + corner_2d_norm = corner_2d + corner_2d_norm[0:2] = corner_2d[0:2] / warp_size + corner_2d_norm[2:4] = corner_2d[2:4] / warp_size + center_3d_norm = center_3d / warp_size + + k, r = center_3d_norm[0] - corner_2d_norm[0], corner_2d_norm[2] - center_3d_norm[0] + t, b = center_3d_norm[1] - corner_2d_norm[1], corner_2d_norm[3] - center_3d_norm[1] + + if k < 0 or r < 0 or t < 0 or b < 0: + continue + + inputs.boxes[i] = center_2d_norm[0], center_2d_norm[1], size_2d_norm[0], size_2d_norm[1] + inputs.boxes_3d[i] = center_3d_norm[0], center_3d_norm[1], k, r, t, b + + # encoding depth + inputs.depth[i] = annotations_list["location"][i][-1] * crop_scale + + # encoding heading angle + heading_angle = self.ry2alpha( + inputs.calib_matrix, + annotations_list["rotation_y"][i], + (annotations_list["bbox"][i][0] + annotations_list["bbox"][i][2]) / 2, + ) + if heading_angle > np.pi: + heading_angle -= 2 * np.pi # check range + if heading_angle < -np.pi: + heading_angle += 2 * np.pi + inputs.heading_angle[i] = self.angle2class(heading_angle) + + # encoding size_3d + src_size_3d[i] = np.array( + [ + annotations_list["dimensions"][i][1], + annotations_list["dimensions"][i][2], + annotations_list["dimensions"][i][0], + ], + 
dtype=np.float32, + ) + inputs.size_3d[i] = src_size_3d[i] + + # filter out the samples with truncated or occluded + if annotations_list["truncated"][i] <= 0.5 and annotations_list["occluded"][i] <= 2: + mask_2d[i] = 1 + + # update img_info + inputs.img_info.img_shape = self.input_size + + return self.convert(inputs, mask_2d) + + @staticmethod + def affine_transform(pt: np.ndarray, t: np.ndarray) -> np.ndarray: + """Apply an affine transformation to the points.""" + new_pt = np.array([pt[0], pt[1], 1.0], dtype=np.float32).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + + @staticmethod + def rect_to_img(p2: np.ndarray, pts_rect: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + """Convert camera coordinates to image coordinates. + + Args: + p2 (np.ndarray): Projection matrix with shape (3, 4). + pts_rect (np.ndarray): Rectangular coordinates with shape (N, 4). + + Returns: + tuple[np.ndarray, np.ndarray]: Image coordinates with shape (N, 2). + """ + + def cart_to_hom(pts: np.ndarray) -> np.ndarray: + """Convert Cartesian coordinates to homogeneous coordinates. + + Args: + pts (np.ndarray): Array of Cartesian coordinates with shape (N, D), + where N is the number of points and D is the number of dimensions. + + Returns: + np.ndarray: Array of homogeneous coordinates with shape (N, D+1), + where N is the number of points and D is the number of dimensions. + """ + return np.hstack((pts, np.ones((pts.shape[0], 1), dtype=np.float32))) + + pts_rect_hom = cart_to_hom(pts_rect) + pts_2d_hom = np.dot(pts_rect_hom, p2.T) + pts_img = (pts_2d_hom[:, 0:2].T / pts_rect_hom[:, 2]).T # (N, 2) + pts_rect_depth = pts_2d_hom[:, 2] - p2.T[3, 2] # depth in rect camera coord + return pts_img, pts_rect_depth + + @staticmethod + def ry2alpha(p2: np.ndarray, ry: np.ndarray, u: np.ndarray) -> np.ndarray: + """Get observation angle of object. + + Args: + p2 (np.ndarray): Projection matrix with shape (3, 4). + ry (np.ndarray): Observation angle of object with shape (N, ). + u (np.ndarray): Pixel coordinates with shape (N, 2). + + Returns: + np.ndarray: Observation angle of object with shape (N, ). 
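# --- Sketch: the camera-to-image projection implemented by rect_to_img.
# p2 is the 3x4 P2 matrix, pts_rect are (N, 3) points in the rectified camera
# frame; the function name and the toy call are mine.
import numpy as np

def project_rect_to_image(pts_rect: np.ndarray, p2: np.ndarray):
    pts_hom = np.hstack([pts_rect, np.ones((pts_rect.shape[0], 1), dtype=np.float32)])  # (N, 4)
    pts_2d_hom = pts_hom @ p2.T                    # (N, 3)
    pts_img = pts_2d_hom[:, :2] / pts_hom[:, 2:3]  # divide by the point depth z
    depth = pts_2d_hom[:, 2] - p2.T[3, 2]          # depth in the rect camera frame
    return pts_img, depth

# uv, z = project_rect_to_image(np.array([[1.0, 1.5, 10.0]], dtype=np.float32),
#                               np.eye(3, 4, dtype=np.float32))  # toy P2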
+ """ + alpha = ry - np.arctan2(u - p2[0, 2], p2[0, 0]) + + if alpha > np.pi: + alpha -= 2 * np.pi + if alpha < -np.pi: + alpha += 2 * np.pi + + return alpha + + @staticmethod + def angle2class(angle: float) -> tuple[int, float]: + """Convert continuous angle to discrete class and residual.""" + num_heading_bin = 12 + angle = angle % (2 * np.pi) + if not (angle >= 0 and angle <= 2 * np.pi): + msg = "angle not in 0 ~ 2pi" + raise ValueError(msg) + + angle_per_class = 2 * np.pi / float(num_heading_bin) + shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) + class_id = int(shifted_angle / angle_per_class) + residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) + return class_id, residual_angle + + def convert(self, inputs: T_OTXDataEntity, mask_2d: np.ndarray, image_to_tensor: bool = False) -> T_OTXDataEntity: # type: ignore[override] + """Convert the data entity to torchvision format.""" + if image_to_tensor: + inputs.image = torch.from_numpy(inputs.image).permute(2, 0, 1) + inputs.labels = torch.as_tensor(inputs.labels[mask_2d], dtype=torch.long) + inputs.boxes = tv_tensors.BoundingBoxes(inputs.boxes[mask_2d], format="XYXY", canvas_size=self.input_size) + inputs.boxes_3d = torch.as_tensor(inputs.boxes_3d[mask_2d], dtype=torch.float32) + inputs.size_2d = torch.as_tensor(inputs.size_2d[mask_2d], dtype=torch.float32) + inputs.size_3d = torch.as_tensor(inputs.size_3d[mask_2d], dtype=torch.float32) + inputs.depth = torch.as_tensor(inputs.depth[mask_2d], dtype=torch.float32) + inputs.heading_angle = torch.as_tensor(inputs.heading_angle[mask_2d], dtype=torch.float32) + inputs.calib_matrix = torch.as_tensor(inputs.calib_matrix, dtype=torch.float32) + + return inputs + + class TorchVisionTransformLib: """Helper to support TorchVision transforms (only V2) in OTX.""" diff --git a/src/otx/core/exporter/base.py b/src/otx/core/exporter/base.py index 85d77fe4799..cfbc670e58e 100644 --- a/src/otx/core/exporter/base.py +++ b/src/otx/core/exporter/base.py @@ -45,6 +45,9 @@ class OTXModelExporter: output_names (list[str] | None, optional): Names for model's outputs, which would be embedded into resulting model. Note, that order of the output names should be the same, as in the target model. + input_names (list[str] | None, optional): Names for model's inputs, which would be + embedded into resulting model. Note, that order of the input names should be the same, + as in the target model. """ def __init__( diff --git a/src/otx/core/exporter/detection_3d.py b/src/otx/core/exporter/detection_3d.py index 17b1377436a..1c33f5130ed 100644 --- a/src/otx/core/exporter/detection_3d.py +++ b/src/otx/core/exporter/detection_3d.py @@ -98,3 +98,26 @@ def to_onnx( log.info("Converting to ONNX is done.") return Path(save_path) + + def to_exportable_code( + self, + model: OTXModel, + output_dir: Path, + base_model_name: str = "exported_model", + precision: OTXPrecisionType = OTXPrecisionType.FP32, + ) -> Path: + """Export to zip folder final OV IR model with runable demo. + + NOT SUPPORTED FOR OD 3D. It will raise an error. + + Args: + model (OTXModel): OTXModel to be exported + output_dir (Path): path to the directory to store export artifacts + base_model_name (str, optional): exported model name + precision (OTXExportPrecisionType, optional): precision of the exported model's weights + + Returns: + Path: path to the exported model. + """ + msg = "Exportable code option is not supported for Object Detection 3D." 
+ raise NotImplementedError(msg) diff --git a/src/otx/core/exporter/exportable_code/demo/requirements.txt b/src/otx/core/exporter/exportable_code/demo/requirements.txt index c816a10f57c..cb535522808 100644 --- a/src/otx/core/exporter/exportable_code/demo/requirements.txt +++ b/src/otx/core/exporter/exportable_code/demo/requirements.txt @@ -1,3 +1,3 @@ openvino==2024.4.0 -openvino-model-api==0.2.4 +openvino-model-api==0.2.5 numpy==1.26.4 diff --git a/src/otx/core/metrics/accuracy.py b/src/otx/core/metrics/accuracy.py index 7c2aeb975bf..62c61da610b 100644 --- a/src/otx/core/metrics/accuracy.py +++ b/src/otx/core/metrics/accuracy.py @@ -290,12 +290,17 @@ def __init__( ] # Multilabel classification accuracy metrics - if self.num_multilabel_classes > 0: + # https://github.com/Lightning-AI/torchmetrics/blob/6377aa5b6fe2863761839e6b8b5a857ef1b8acfa/src/torchmetrics/functional/classification/stat_scores.py#L583-L584 + # MultilabelAccuracy is available when num_multilabel_classes is greater than 2. + self.multilabel_accuracy = None + if self.num_multilabel_classes > 1: self.multilabel_accuracy = TorchmetricMultilabelAcc( num_labels=self.num_multilabel_classes, threshold=0.5, average="macro", ) + elif self.num_multilabel_classes == 1: + self.multilabel_accuracy = TorchmetricAcc(task="binary", num_classes=self.num_multilabel_classes) def _apply(self, fn: Callable, exclude_state: Sequence[str] = "") -> nn.Module: self.multiclass_head_accuracy = [ @@ -305,7 +310,7 @@ def _apply(self, fn: Callable, exclude_state: Sequence[str] = "") -> nn.Module: ) for acc in self.multiclass_head_accuracy ] - if self.num_multilabel_classes > 0: + if self.multilabel_accuracy is not None: self.multilabel_accuracy = self.multilabel_accuracy._apply(fn, exclude_state) # noqa: SLF001 return self @@ -324,7 +329,7 @@ def update(self, preds: torch.Tensor, target: torch.Tensor) -> None: target_multiclass[multiclass_mask], ) - if self.num_multilabel_classes > 0: + if self.multilabel_accuracy is not None: # Split preds into multiclass and multilabel parts preds_multilabel = preds[:, self.num_multiclass_heads :] target_multilabel = target[:, self.num_multiclass_heads :] @@ -339,7 +344,7 @@ def compute(self) -> torch.Tensor: ), ) - if self.num_multilabel_classes > 0: + if self.multilabel_accuracy is not None: multilabel_acc = self.multilabel_accuracy.compute() return (multiclass_accs + multilabel_acc) / 2 diff --git a/src/otx/core/metrics/average_precision_3d.py b/src/otx/core/metrics/average_precision_3d.py index 7b8530ba684..2600200280b 100644 --- a/src/otx/core/metrics/average_precision_3d.py +++ b/src/otx/core/metrics/average_precision_3d.py @@ -7,8 +7,10 @@ from typing import TYPE_CHECKING +import torch from torch import Tensor from torchmetrics import Metric +from torchmetrics.detection.mean_ap import MeanAveragePrecision from otx.core.metrics.kitti_3d_eval import get_coco_eval_result @@ -32,6 +34,7 @@ def __init__( super().__init__() self.label_info: LabelInfo = label_info + self.mean_ap: MeanAveragePrecision = MeanAveragePrecision(box_format="xyxy", iou_type="bbox") self.reset() def reset(self) -> None: @@ -42,6 +45,7 @@ def reset(self) -> None: super().reset() self.preds: list[dict[str, np.array]] = [] self.targets: list[dict[str, np.array]] = [] + self.mean_ap.reset() def update(self, preds: list[dict[str, Tensor]], target: list[dict[str, Tensor]]) -> None: """Update total predictions and targets from given batch predicitons and targets.""" @@ -51,13 +55,35 @@ def update(self, preds: list[dict[str, Tensor]], target: 
list[dict[str, Tensor]] def compute(self) -> dict: """Compute metrics for 3d object detection.""" current_classes = self.label_info.label_names - map_bbox, map_3d = get_coco_eval_result( + preds_for_torchmetrics = self.prepare_inputs_for_map_coco(self.preds) + targets_for_torchmetrics = self.prepare_inputs_for_map_coco(self.targets) + ap_bbox_coco = self.mean_ap(preds_for_torchmetrics, targets_for_torchmetrics) + ap_3d = get_coco_eval_result( self.targets, self.preds, current_classes=[curcls.lower() for curcls in current_classes], ) - # use moderate difficulty as final score. Average across all calsses. - return {"mAP_bbox_3d": Tensor([map_3d[:, 1].mean()]), "mAP_bbox_2d": Tensor([map_bbox[:, 1].mean()])} + # Average across all classes. + return { + "AP_3d@0.5": Tensor([ap_3d[0]]), + "AP_2d@0.5": ap_bbox_coco["map_50"], + "mAP_3d": Tensor([ap_3d.mean()]), + "mAP_2d": ap_bbox_coco["map"], + } + + def prepare_inputs_for_map_coco(self, targets: list[dict[str, np.array]]) -> list[dict[str, Tensor]]: + """Prepare targets for torchmetrics.""" + return [ + { + "boxes": torch.tensor(target["bbox"]), + "scores": torch.tensor(target["score"]) if "score" in target else None, + "labels": torch.tensor( + [self.label_info.label_names.index(label) for label in target["name"]], + dtype=torch.long, + ), + } + for target in targets + ] def _kitti_metric_measure_callable(label_info: LabelInfo) -> KittiMetric: diff --git a/src/otx/core/metrics/kitti_3d_eval/eval.py b/src/otx/core/metrics/kitti_3d_eval/eval.py index 951cc96538d..34144fa4797 100644 --- a/src/otx/core/metrics/kitti_3d_eval/eval.py +++ b/src/otx/core/metrics/kitti_3d_eval/eval.py @@ -3,12 +3,10 @@ # """KITTI 3D eval for OTX.""" -# flake8: noqa -# mypy: ignore-errors from __future__ import annotations -import io as sysio +import logging from typing import Any import numba @@ -21,47 +19,11 @@ from .rotate_iou import rotate_iou_eval_cpu as rotate_iou_eval -@numba.jit(nopython=True) -def get_thresholds( - scores: np.ndarray, # 1D array of confidence scores - num_gt: int, # Number of ground truth objects - num_sample_pts: int = 41, # Number of sample points used to compute recall thresholds -) -> np.ndarray: # 1D array of recall thresholds - """Compute recall thresholds for a given score array. - - Args: - scores (np.ndarray): 1D array of confidence scores. - num_gt (int): Number of ground truth objects. - num_sample_pts (int, optional): Number of sample points used to - compute recall thresholds. Defaults to 41. - - Returns: - np.ndarray: 1D array of recall thresholds. - """ - scores.sort() - scores = scores[::-1] - current_recall = 0 - thresholds = [] - for i, score in enumerate(scores): - l_recall = (i + 1) / num_gt - if i < (len(scores) - 1): - r_recall = (i + 2) / num_gt - else: - r_recall = l_recall - if ((r_recall - current_recall) < (current_recall - l_recall)) and (i < (len(scores) - 1)): - continue - # recall = l_recall - thresholds.append(score) - current_recall += 1 / (num_sample_pts - 1.0) - return thresholds - - def clean_data( gt_anno: dict, # ground truth annotations dt_anno: dict, # detection results current_class: str, # the current class name - difficulty: int, # the difficulty level -) -> tuple: # (num_valid_gt, ignored_gt, ignored_dt, dc_bboxes) +) -> tuple: # (num_valid_gt, ignored_gt, ignored_dt) """Filter out the objects that are not in the current class. Args: @@ -71,12 +33,12 @@ def clean_data( difficulty (int): The difficulty level. Returns: - tuple: The number of valid objects, ignored_gt, ignored_dt, and dc_bboxes. 
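# --- Sketch: how the 2D AP side of the new KittiMetric is obtained. The
# prepared prediction/target dicts are fed to torchmetrics' MeanAveragePrecision
# exactly as above; the tensors here are toy values.
import torch
from torchmetrics.detection.mean_ap import MeanAveragePrecision

metric = MeanAveragePrecision(box_format="xyxy", iou_type="bbox")
preds = [{
    "boxes": torch.tensor([[10.0, 10.0, 60.0, 80.0]]),
    "scores": torch.tensor([0.9]),
    "labels": torch.tensor([0]),
}]
target = [{
    "boxes": torch.tensor([[12.0, 8.0, 58.0, 82.0]]),
    "labels": torch.tensor([0]),
}]
metric.update(preds, target)
result = metric.compute()  # dict containing "map", "map_50", ... as used above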
+ tuple: The number of valid objects, ignored_gt, ignored_dt. """ - MIN_HEIGHT = [40, 25, 25] - MAX_OCCLUSION = [0, 1, 2] - MAX_TRUNCATION = [0.15, 0.3, 0.5] - dc_bboxes, ignored_gt, ignored_dt = [], [], [] + min_height = 20 + max_occlusion = 2 + max_truncation = 0.5 + ignored_gt, ignored_dt = [], [] num_gt = len(gt_anno["name"]) num_dt = len(dt_anno["name"]) num_valid_gt = 0 @@ -87,19 +49,18 @@ def clean_data( valid_class = -1 if gt_name == current_class: valid_class = 1 - elif current_class == "Pedestrian".lower() and "Person_sitting".lower() == gt_name: - valid_class = 0 - elif current_class == "Car".lower() and "Van".lower() == gt_name: + elif (current_class == "Pedestrian".lower() and "Person_sitting".lower() == gt_name) or ( + current_class == "Car".lower() and "Van".lower() == gt_name + ): valid_class = 0 else: valid_class = -1 ignore = False if ( - (gt_anno["occluded"][i] > MAX_OCCLUSION[difficulty]) - or (gt_anno["truncated"][i] > MAX_TRUNCATION[difficulty]) - or (height <= MIN_HEIGHT[difficulty]) - ): - # if gt_anno["difficulty"][i] > difficulty or gt_anno["difficulty"][i] == -1: + (gt_anno["occluded"][i] > max_occlusion) + or (gt_anno["truncated"][i] > max_truncation) + or (height <= min_height) + ): # filter extrim cases ignore = True if valid_class == 1 and not ignore: ignored_gt.append(0) @@ -108,74 +69,35 @@ def clean_data( ignored_gt.append(1) else: ignored_gt.append(-1) - # for i in range(num_gt): - if gt_anno["name"][i] == "DontCare": - dc_bboxes.append(gt_anno["bbox"][i]) + for i in range(num_dt): - if dt_anno["name"][i].lower() == current_class: - valid_class = 1 - else: - valid_class = -1 + valid_class = 1 if dt_anno["name"][i].lower() == current_class else -1 height = abs(dt_anno["bbox"][i, 3] - dt_anno["bbox"][i, 1]) - if height < MIN_HEIGHT[difficulty]: + if height < min_height: ignored_dt.append(1) elif valid_class == 1: ignored_dt.append(0) else: ignored_dt.append(-1) - return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes - - -@numba.jit(nopython=True) -def image_box_overlap( - boxes: np.ndarray, # shape: (N, 4) - query_boxes: np.ndarray, # shape: (K, 4) - criterion: int = -1, # default overlap criterion, -1: intersection over union, 0: intersection over box area, 1: intersection over query box area -) -> np.ndarray: # shape: (N, K) - """Args: - boxes (np.ndarray): shape: (N, 4), 2D boxes, (x1, y1, x2, y2) - query_boxes (np.ndarray): shape: (K, 4), 2D boxes, (x1, y1, x2, y2) - criterion (int, optional): overlap criterion, -1: intersection over union, 0: intersection over box area, 1: intersection over query box area. Defaults to -1. 
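# --- Sketch: the single-difficulty filtering rule that clean_data now applies
# (height <= 20 px, occlusion > 2, or truncation > 0.5 marks a ground truth as
# ignored). The neighbouring-class handling (Van for Car, Person_sitting for
# Pedestrian) is omitted here; names are mine.
MIN_HEIGHT_PX = 20
MAX_OCCLUSION = 2
MAX_TRUNCATION = 0.5

def gt_ignore_flag(bbox_xyxy, occluded: int, truncated: float, is_current_class: bool) -> int:
    """Return 0 (evaluate), 1 (ignore), or -1 (other class)."""
    height = abs(bbox_xyxy[3] - bbox_xyxy[1])
    too_hard = occluded > MAX_OCCLUSION or truncated > MAX_TRUNCATION or height <= MIN_HEIGHT_PX
    if is_current_class and not too_hard:
        return 0
    if is_current_class:  # right class, but filtered out as too hard
        return 1
    return -1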
- - Returns: - np.ndarray: shape: (N, K), overlap between boxes and query_boxes - """ - N = boxes.shape[0] - K = query_boxes.shape[0] - overlaps = np.zeros((N, K), dtype=boxes.dtype) - for k in range(K): - qbox_area = (query_boxes[k, 2] - query_boxes[k, 0]) * (query_boxes[k, 3] - query_boxes[k, 1]) - for n in range(N): - iw = min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) - if iw > 0: - ih = min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) - if ih > 0: - if criterion == -1: - ua = (boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1]) + qbox_area - iw * ih - elif criterion == 0: - ua = (boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1]) - elif criterion == 1: - ua = qbox_area - else: - ua = 1.0 - overlaps[n, k] = iw * ih / ua - return overlaps + return num_valid_gt, ignored_gt, ignored_dt @numba.jit(nopython=True) def d3_box_overlap_kernel( - boxes: np.ndarray, # shape: (N, 7) - qboxes: np.ndarray, # shape: (K, 7) - rinc: np.ndarray, # shape: (N, K) + boxes: np.ndarray, # shape: (n, 7) + qboxes: np.ndarray, # shape: (k, 7) + rinc: np.ndarray, # shape: (n, k) criterion: int = -1, # default overlap criterion ) -> None: - """Args: - boxes: Array of shape (N, 7) representing N 3D boxes. - qboxes: Array of shape (K, 7) representing K 3D boxes. - rinc: Array of shape (N, K) representing the overlap between boxes + """Calculate 3D box overlap. + + Args: + boxes (np.ndarray): Array of shape (n, 7) representing n 3D boxes. + qboxes (np.ndarray): Array of shape (k, 7) representing k 3D boxes. + rinc (np.ndarray): Array of shape (n, k) representing the overlap between boxes and qboxes. - criterion: Overlap criterion. Defaults to -1. If -1, uses the + criterion (int, optional): Overlap criterion. Defaults to -1. If -1, uses the intersection-over-union (IoU) criterion. If 0, uses the intersection-over-area1 criterion. If 1, uses the intersection-over-area2 criterion. @@ -184,12 +106,10 @@ def d3_box_overlap_kernel( None """ # ONLY support overlap in CAMERA, not lidar. - N, K = boxes.shape[0], qboxes.shape[0] - for i in range(N): - for j in range(K): + n, k = boxes.shape[0], qboxes.shape[0] + for i in range(n): + for j in range(k): if rinc[i, j] > 0: - # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] + - # qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1])) iw = min(boxes[i, 1], qboxes[j, 1]) - max(boxes[i, 1] - boxes[i, 4], qboxes[j, 1] - qboxes[j, 4]) if iw > 0: @@ -216,12 +136,9 @@ def compute_statistics_jit( dt_datas: np.ndarray, # shape: (total_dt_num, 7) ignored_gt: list[int], # shape: (total_gt_num) ignored_det: list[int], # shape: (total_dt_num) - dc_bboxes: np.ndarray, # shape: (total_dc_num, 4) - metric: int, min_overlap: float, thresh: float = 0, compute_fp: bool = False, - compute_aos: bool = False, ) -> tuple[int, int, int, float, np.ndarray]: """This function computes statistics of an evaluation. @@ -231,12 +148,9 @@ def compute_statistics_jit( dt_datas (np.ndarray): Detection data. ignored_gt (List[int]): Ignore ground truth indices. ignored_det (List[int]): Ignore detection indices. - dc_bboxes (np.ndarray): Don't care bboxes. - metric (int): Evaluation metric. min_overlap (float): Minimum overlap between dt and gt bboxes. thresh (float): Detection score threshold. Defaults to 0. compute_fp (bool): Whether to compute false positives. Defaults to False. - compute_aos (bool): Whether to compute average orientation similarity. Defaults to False. 
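# --- Sketch: a vectorized NumPy equivalent of the removed image_box_overlap
# helper for its default criterion (plain IoU between (N, 4) and (K, 4) xyxy
# boxes); 2D AP is computed through torchmetrics after this patch, so this is
# shown only for reference.
import numpy as np

def image_box_iou(boxes: np.ndarray, query_boxes: np.ndarray) -> np.ndarray:
    """Return the (N, K) IoU matrix."""
    x1 = np.maximum(boxes[:, None, 0], query_boxes[None, :, 0])
    y1 = np.maximum(boxes[:, None, 1], query_boxes[None, :, 1])
    x2 = np.minimum(boxes[:, None, 2], query_boxes[None, :, 2])
    y2 = np.minimum(boxes[:, None, 3], query_boxes[None, :, 3])
    inter = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)
    area_a = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    area_b = (query_boxes[:, 2] - query_boxes[:, 0]) * (query_boxes[:, 3] - query_boxes[:, 1])
    return inter / (area_a[:, None] + area_b[None, :] - inter)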
Returns: Tuple[int, int, int, float, np.ndarray]: tp, fp, fn, similarity, thresholds @@ -244,29 +158,22 @@ def compute_statistics_jit( det_size = dt_datas.shape[0] gt_size = gt_datas.shape[0] dt_scores = dt_datas[:, -1] - dt_alphas = dt_datas[:, 4] - gt_alphas = gt_datas[:, 4] - dt_bboxes = dt_datas[:, :4] assigned_detection = [False] * det_size - ignored_threshold = [False] * det_size + ignored_obj_by_threshold = [False] * det_size if compute_fp: for i in range(det_size): if dt_scores[i] < thresh: - ignored_threshold[i] = True - NO_DETECTION = -10000000 + ignored_obj_by_threshold[i] = True + no_detection = -10000000 tp, fp, fn, similarity = 0, 0, 0, 0 - # thresholds = [0.0] - # delta = [0.0] - thresholds = np.zeros((gt_size,)) + tp_scores = np.zeros((gt_size,)) thresh_idx = 0 - delta = np.zeros((gt_size,)) - delta_idx = 0 for i in range(gt_size): if ignored_gt[i] == -1: continue det_idx = -1 - valid_detection = NO_DETECTION + valid_detection = no_detection max_overlap = 0 assigned_ignored_det = False @@ -275,7 +182,7 @@ def compute_statistics_jit( continue if assigned_detection[j]: continue - if ignored_threshold[j]: + if ignored_obj_by_threshold[j]: continue overlap = overlaps[j, i] dt_score = dt_scores[j] @@ -292,58 +199,32 @@ def compute_statistics_jit( det_idx = j valid_detection = 1 assigned_ignored_det = False - elif compute_fp and (overlap > min_overlap) and (valid_detection == NO_DETECTION) and ignored_det[j] == 1: + elif compute_fp and (overlap > min_overlap) and (valid_detection == no_detection) and ignored_det[j] == 1: det_idx = j valid_detection = 1 assigned_ignored_det = True - if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0: + if (valid_detection == no_detection) and ignored_gt[i] == 0: fn += 1 - elif (valid_detection != NO_DETECTION) and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1): + elif (valid_detection != no_detection) and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1): assigned_detection[det_idx] = True - elif valid_detection != NO_DETECTION: + elif valid_detection != no_detection: tp += 1 - # thresholds.append(dt_scores[det_idx]) - thresholds[thresh_idx] = dt_scores[det_idx] + + tp_scores[thresh_idx] = dt_scores[det_idx] thresh_idx += 1 - if compute_aos: - # delta.append(gt_alphas[i] - dt_alphas[det_idx]) - delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx] - delta_idx += 1 assigned_detection[det_idx] = True if compute_fp: for i in range(det_size): - if not (assigned_detection[i] or ignored_det[i] == -1 or ignored_det[i] == 1 or ignored_threshold[i]): + if not ( + assigned_detection[i] or ignored_det[i] == -1 or ignored_det[i] == 1 or ignored_obj_by_threshold[i] + ): fp += 1 nstuff = 0 - if metric == 0: - overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0) - for i in range(dc_bboxes.shape[0]): - for j in range(det_size): - if assigned_detection[j]: - continue - if ignored_det[j] == -1 or ignored_det[j] == 1: - continue - if ignored_threshold[j]: - continue - if overlaps_dt_dc[j, i] > min_overlap: - assigned_detection[j] = True - nstuff += 1 fp -= nstuff - if compute_aos: - tmp = np.zeros((fp + delta_idx,)) - # tmp = [0] * fp - for i in range(delta_idx): - tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0 - # tmp.append((1.0 + np.cos(delta[i])) / 2.0) - # assert len(tmp) == fp + tp - # assert len(delta) == tp - if tp > 0 or fp > 0: - similarity = np.sum(tmp) - else: - similarity = -1 - return tp, fp, fn, similarity, thresholds[:thresh_idx] + + return tp, fp, fn, similarity, tp_scores[:thresh_idx] @numba.jit(nopython=True) @@ -364,8 +245,7 @@ 
def get_split_parts(num: int, num_part: int) -> list[int]: if remain_num == 0: return [same_part] * num_part - else: - return [same_part] * num_part + [remain_num] + return [same_part] * num_part + [remain_num] @numba.jit(nopython=True) @@ -374,53 +254,42 @@ def fused_compute_statistics( pr: np.ndarray, # shape: (num_thresholds, 4) gt_nums: np.ndarray, # shape: (num_samples) dt_nums: np.ndarray, # shape: (num_samples) - dc_nums: np.ndarray, # shape: (num_samples) gt_datas: np.ndarray, # shape: (total_gt_num, 7) dt_datas: np.ndarray, # shape: (total_dt_num, 7) - dontcares: np.ndarray, # shape: (total_dc_num, 4) ignored_gts: np.ndarray, # shape: (total_gt_num) ignored_dets: np.ndarray, # shape: (total_dt_num) - metric: int, min_overlap: float, thresholds: np.ndarray, # shape: (num_thresholds) - compute_aos: bool = False, ) -> None: """Fast compute statistics. Must be used in CAMERA coordinate system. Args: - overlaps: 2D array of shape (total_dt_num, total_gt_num) - [dt_num, gt_num] is the overlap between dt_num-th detection - and gt_num-th ground truth - pr: 2D array of shape (num_thresholds, 4) - [t, 0] is the number of true positives at threshold t - [t, 1] is the number of false positives at threshold t - [t, 2] is the number of false negatives at threshold t - [t, 3] is the similarity at threshold t - gt_nums: 1D array of shape (num_samples) - gt_nums[i] is the number of ground truths in i-th sample - dt_nums: 1D array of shape (num_samples) - dt_nums[i] is the number of detections in i-th sample - dc_nums: 1D array of shape (num_samples) - dc_nums[i] is the number of dontcare areas in i-th sample - gt_datas: 2D array of shape (total_gt_num, 7) - gt_datas[i] is the i-th ground truth box - dt_datas: 2D array of shape (total_dt_num, 7) - dt_datas[i] is the i-th detection box - dontcares: 2D array of shape (total_dc_num, 4) - dontcares[i] is the i-th dontcare area - ignored_gts: 1D array of shape (total_gt_num) - ignored_gts[i] is 1 if the i-th ground truth is ignored, 0 otherwise - ignored_dets: 1D array of shape (total_dt_num) - ignored_dets[i] is 1 if the i-th detection is ignored, 0 otherwise - metric: Eval type. 
0: bbox, 1: bev, 2: 3d - min_overlap: Min overlap - thresholds: 1D array of shape (num_thresholds) - thresholds[i] is the i-th threshold - compute_aos: Whether to compute aos + overlaps (np.ndarray): 2D array of shape (total_dt_num, total_gt_num), + [dt_num, gt_num] is the overlap between dt_num-th detection + and gt_num-th ground truth + pr (np.ndarray): 2D array of shape (num_thresholds, 4) + [t, 0] is the number of true positives at threshold t + [t, 1] is the number of false positives at threshold t + [t, 2] is the number of false negatives at threshold t + [t, 3] is the similarity at threshold t + gt_nums (np.ndarray): 1D array of shape (num_samples), + gt_nums[i] is the number of ground truths in i-th sample + dt_nums (np.ndarray): 1D array of shape (num_samples), + dt_nums[i] is the number of detections in i-th sample + gt_datas (np.ndarray): 2D array of shape (total_gt_num, 7), + gt_datas[i] is the i-th ground truth box + dt_datas (np.ndarray): 2D array of shape (total_dt_num, 7), + dt_datas[i] is the i-th detection box + ignored_gts (np.ndarray): 1D array of shape (total_gt_num), + ignored_gts[i] is 1 if the i-th ground truth is ignored, 0 otherwise + ignored_dets (np.ndarray): 1D array of shape (total_dt_num), + ignored_dets[i] is 1 if the i-th detection is ignored, 0 otherwise + min_overlap (float): Min overlap + thresholds (np.ndarray): 1D array of shape (num_thresholds), + thresholds[i] is the i-th threshold """ gt_num = 0 dt_num = 0 - dc_num = 0 for i in range(gt_nums.shape[0]): for t, thresh in enumerate(thresholds): overlap = overlaps[dt_num : dt_num + dt_nums[i], gt_num : gt_num + gt_nums[i]] @@ -428,19 +297,15 @@ def fused_compute_statistics( dt_data = dt_datas[dt_num : dt_num + dt_nums[i]] ignored_gt = ignored_gts[gt_num : gt_num + gt_nums[i]] ignored_det = ignored_dets[dt_num : dt_num + dt_nums[i]] - dontcare = dontcares[dc_num : dc_num + dc_nums[i]] tp, fp, fn, similarity, _ = compute_statistics_jit( overlap, gt_data, dt_data, ignored_gt, ignored_det, - dontcare, - metric, min_overlap=min_overlap, thresh=thresh, compute_fp=True, - compute_aos=compute_aos, ) pr[t, 0] += tp pr[t, 1] += fp @@ -449,22 +314,21 @@ def fused_compute_statistics( pr[t, 3] += similarity gt_num += gt_nums[i] dt_num += dt_nums[i] - dc_num += dc_nums[i] def calculate_iou_partly( gt_annos: list[dict[str, Any]], dt_annos: list[dict[str, Any]], - metric: int, num_parts: int = 50, ) -> tuple[list[np.ndarray], list[np.ndarray], np.ndarray, np.ndarray]: - """Fast iou algorithm. This function can be used independently to - do result analysis. Must be used in CAMERA coordinate system. + """Fast iou algorithm. + + This function can be used independently to do result analysis. + Must be used in CAMERA coordinate system. Args: gt_annos: List of dict, must from get_label_annos() in kitti_common.py dt_annos: List of dict, must from get_label_annos() in kitti_common.py - metric: Eval type. 0: bbox, 1: bev, 2: 3d num_parts: Int, a parameter for fast calculate algorithm Returns: @@ -475,12 +339,28 @@ def calculate_iou_partly( total_dt_num: Numpy array, shape (num_images,) """ - def d3_box_overlap(boxes, qboxes, criterion=-1): + def d3_box_overlap(boxes: np.ndarray, qboxes: np.ndarray, criterion: int = -1) -> np.ndarray: + """Calculate 3D box overlap. + + Args: + boxes (np.ndarray): Array of shape (n, 7) representing n 3D boxes. + qboxes (np.ndarray): Array of shape (k, 7) representing k 3D boxes. + criterion (int, optional): Overlap criterion. Defaults to -1. 
If -1, uses the + intersection-over-union (IoU) criterion. If 0, uses the + intersection-over-area1 criterion. If 1, uses the + intersection-over-area2 criterion. + + Returns: + np.ndarray: 1D array of shape (k, ) + """ rinc = rotate_iou_eval(boxes[:, [0, 2, 3, 5, 6]], qboxes[:, [0, 2, 3, 5, 6]], 2) d3_box_overlap_kernel(boxes, qboxes, rinc, criterion) return rinc - assert len(gt_annos) == len(dt_annos) + if len(gt_annos) != len(dt_annos): + msg = "gt_annos and dt_annos must have same length" + raise ValueError(msg) + total_dt_num = np.stack([len(a["name"]) for a in dt_annos], 0) total_gt_num = np.stack([len(a["name"]) for a in gt_annos], 0) num_examples = len(gt_annos) @@ -491,22 +371,17 @@ def d3_box_overlap(boxes, qboxes, criterion=-1): for num_part in split_parts: gt_annos_part = gt_annos[example_idx : example_idx + num_part] dt_annos_part = dt_annos[example_idx : example_idx + num_part] - if metric == 0: - gt_boxes = np.concatenate([a["bbox"] for a in gt_annos_part], 0) - dt_boxes = np.concatenate([a["bbox"] for a in dt_annos_part], 0) - overlap_part = image_box_overlap(gt_boxes, dt_boxes) - elif metric == 2: - loc = np.concatenate([a["location"] for a in gt_annos_part], 0) - dims = np.concatenate([a["dimensions"] for a in gt_annos_part], 0) - rots = np.concatenate([a["rotation_y"] for a in gt_annos_part], 0) - gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) - loc = np.concatenate([a["location"] for a in dt_annos_part], 0) - dims = np.concatenate([a["dimensions"] for a in dt_annos_part], 0) - rots = np.concatenate([a["rotation_y"] for a in dt_annos_part], 0) - dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) - overlap_part = d3_box_overlap(gt_boxes, dt_boxes).astype(np.float64) - else: - raise ValueError("unknown metric") + + loc = np.concatenate([a["location"] for a in gt_annos_part], 0) + dims = np.concatenate([a["dimensions"] for a in gt_annos_part], 0) + rots = np.concatenate([a["rotation_y"] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) + loc = np.concatenate([a["location"] for a in dt_annos_part], 0) + dims = np.concatenate([a["dimensions"] for a in dt_annos_part], 0) + rots = np.concatenate([a["rotation_y"] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) + overlap_part = d3_box_overlap(gt_boxes, dt_boxes).astype(np.float64) + parted_overlaps.append(overlap_part) example_idx += num_part overlaps = [] @@ -532,36 +407,30 @@ def _prepare_data( gt_annos: list[dict[str, Any]], dt_annos: list[dict[str, Any]], current_class: str, - difficulty: int, -) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], np.ndarray, int]: +) -> tuple[list[np.ndarray], list[np.ndarray], list[np.ndarray], list[np.ndarray], int]: """Prepare data for evaluation. Args: gt_annos (List[Dict[str, Any]]): Ground truth annotations. dt_annos (List[Dict[str, Any]]): Detection annotations. current_class (str): Current class name. - difficulty (int): Difficulty level. 
Returns: - Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray], np.ndarray, int]: - gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, total_num_valid_gt + Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], + List[np.ndarray], int]: + gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, + total_num_valid_gt """ gt_datas_list = [] dt_datas_list = [] - total_dc_num = [] - ignored_gts, ignored_dets, dontcares = [], [], [] + ignored_gts, ignored_dets = [], [] total_num_valid_gt = 0 for i in range(len(gt_annos)): - rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty) - num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets + rets = clean_data(gt_annos[i], dt_annos[i], current_class) + num_valid_gt, ignored_gt, ignored_det = rets ignored_gts.append(np.array(ignored_gt, dtype=np.int64)) ignored_dets.append(np.array(ignored_det, dtype=np.int64)) - if len(dc_bboxes) == 0: - dc_bboxes = np.zeros((0, 4)).astype(np.float64) - else: - dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64) - total_dc_num.append(dc_bboxes.shape[0]) - dontcares.append(dc_bboxes) + total_num_valid_gt += num_valid_gt gt_datas = np.concatenate([gt_annos[i]["bbox"], gt_annos[i]["alpha"][..., np.newaxis]], 1) dt_datas = np.concatenate( @@ -574,185 +443,134 @@ def _prepare_data( ) gt_datas_list.append(gt_datas) dt_datas_list.append(dt_datas) - total_dc_num = np.stack(total_dc_num, axis=0) - return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, total_num_valid_gt) + + return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, total_num_valid_gt) def eval_class( gt_annos: list[dict[str, Any]], dt_annos: list[dict[str, Any]], current_classes: list[str], - difficultys: list[int], - metric: int, min_overlaps: np.ndarray, - compute_aos: bool = False, num_parts: int = 50, + num_samples_pts: int = 41, ) -> dict[str, np.ndarray]: """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP. Args: - gt_annos: dict, must from get_label_annos() in kitti_common.py - dt_annos: dict, must from get_label_annos() in kitti_common.py - current_classes: list of label names - difficultys: list of int. eval difficulty, 0: easy, 1: normal, 2: hard - metric: eval type. 0: bbox, 1: bev, 2: 3d - min_overlaps: float, min overlap. format: [num_overlap, metric, class]. - num_parts: int. a parameter for fast calculate algorithm + gt_annos (dict): must from get_label_annos() in kitti_common.py + dt_annos (dict): must from get_label_annos() in kitti_common.py + current_classes (list): label names + min_overlaps (float): min overlap. format: [num_overlap, class]. 
+ num_parts (int): a parameter for fast calculate algorithm + num_samples_pts (int): number of points for precision-recall curve Returns: - dict of recall, precision and aos + dict of recall, precision """ - assert len(gt_annos) == len(dt_annos) num_examples = len(gt_annos) split_parts = get_split_parts(num_examples, num_parts) - rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts) + rets = calculate_iou_partly(dt_annos, gt_annos, num_parts) overlaps, parted_overlaps, total_dt_num, total_gt_num = rets - N_SAMPLE_PTS = 41 num_minoverlap = len(min_overlaps) num_class = len(current_classes) - num_difficulty = len(difficultys) - precision = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) - recall = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) - aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + precision = np.zeros([num_class, num_minoverlap, num_samples_pts]) + recall = np.zeros([num_class, num_minoverlap, num_samples_pts]) for m, current_class in enumerate(current_classes): - for l, difficulty in enumerate(difficultys): - rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty) - ( - gt_datas_list, - dt_datas_list, - ignored_gts, - ignored_dets, - dontcares, - total_dc_num, - total_num_valid_gt, - ) = rets - for k, min_overlap in enumerate(min_overlaps[:, metric, m]): - thresholdss = [] - for i in range(len(gt_annos)): - rets = compute_statistics_jit( - overlaps[i], - gt_datas_list[i], - dt_datas_list[i], - ignored_gts[i], - ignored_dets[i], - dontcares[i], - metric, - min_overlap=min_overlap, - thresh=0.0, - compute_fp=False, - ) - tp, fp, fn, similarity, thresholds = rets - thresholdss += thresholds.tolist() - thresholdss = np.array(thresholdss) - thresholds = get_thresholds(thresholdss, total_num_valid_gt) - thresholds = np.array(thresholds) - pr = np.zeros([len(thresholds), 4]) - idx = 0 - for j, num_part in enumerate(split_parts): - gt_datas_part = np.concatenate(gt_datas_list[idx : idx + num_part], 0) - dt_datas_part = np.concatenate(dt_datas_list[idx : idx + num_part], 0) - dc_datas_part = np.concatenate(dontcares[idx : idx + num_part], 0) - ignored_dets_part = np.concatenate(ignored_dets[idx : idx + num_part], 0) - ignored_gts_part = np.concatenate(ignored_gts[idx : idx + num_part], 0) - fused_compute_statistics( - parted_overlaps[j], - pr, - total_gt_num[idx : idx + num_part], - total_dt_num[idx : idx + num_part], - total_dc_num[idx : idx + num_part], - gt_datas_part, - dt_datas_part, - dc_datas_part, - ignored_gts_part, - ignored_dets_part, - metric, - min_overlap=min_overlap, - thresholds=thresholds, - compute_aos=compute_aos, - ) - idx += num_part - for i in range(len(thresholds)): - recall[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) - precision[m, l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 1]) - if compute_aos: - aos[m, l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1]) - for i in range(len(thresholds)): - precision[m, l, k, i] = np.max(precision[m, l, k, i:], axis=-1) - recall[m, l, k, i] = np.max(recall[m, l, k, i:], axis=-1) - if compute_aos: - aos[m, l, k, i] = np.max(aos[m, l, k, i:], axis=-1) - ret_dict = { + ( + gt_datas_list, + dt_datas_list, + ignored_gts, + ignored_dets, + total_num_valid_gt, + ) = _prepare_data(gt_annos, dt_annos, current_class) + for k, min_overlap in enumerate(min_overlaps[:, m]): + thresholdss = [] + for i in range(len(gt_annos)): + tp, fp, fn, similarity, thresholds = compute_statistics_jit( + overlaps[i], + gt_datas_list[i], + dt_datas_list[i], + ignored_gts[i], 
+ ignored_dets[i], + min_overlap=min_overlap, + thresh=0.0, + compute_fp=False, + ) + thresholdss += thresholds.tolist() + if not thresholdss: + continue # no tp -> 0 precision and recall + # create thresholds between 0 and the max threshold, len(thresholds) == num_samples_pts + thresholds = np.linspace(0.0, np.max(thresholdss), num_samples_pts) + pr = np.zeros([len(thresholds), 4]) + idx = 0 + for j, num_part in enumerate(split_parts): + gt_datas_part = np.concatenate(gt_datas_list[idx : idx + num_part], 0) + dt_datas_part = np.concatenate(dt_datas_list[idx : idx + num_part], 0) + ignored_dets_part = np.concatenate(ignored_dets[idx : idx + num_part], 0) + ignored_gts_part = np.concatenate(ignored_gts[idx : idx + num_part], 0) + fused_compute_statistics( + parted_overlaps[j], + pr, + total_gt_num[idx : idx + num_part], + total_dt_num[idx : idx + num_part], + gt_datas_part, + dt_datas_part, + ignored_gts_part, + ignored_dets_part, + min_overlap=min_overlap, + thresholds=thresholds, + ) + idx += num_part + + for i in range(len(thresholds)): + recall[m, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) + precision[m, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 1]) + + return { "recall": recall, "precision": precision, - "orientation": aos, } - return ret_dict - - -def print_str(value, *arg, sstream=None): - if sstream is None: - sstream = sysio.StringIO() - sstream.truncate(0) - sstream.seek(0) - print(value, *arg, file=sstream) - return sstream.getvalue() def do_eval_cut_version( - gt_annos: list[dict[str, Any]], # type hint - dt_annos: list[dict[str, Any]], # type hint - current_classes: list[str], # type hint - min_overlaps: np.ndarray, # type hint - compute_aos: bool = False, # type hint -) -> tuple[float, float]: # type hint + gt_annos: list[dict[str, Any]], + dt_annos: list[dict[str, Any]], + current_classes: list[str], + min_overlaps: np.ndarray, +) -> np.ndarray: """Evaluates detections with COCO style AP. Args: - gt_annos (List[dict]): Ground truth annotations. - dt_annos (List[dict]): Detection results. - current_classes (List[str]): Classes to evaluate. + gt_annos (list[dict]): Ground truth annotations. + dt_annos (list[dict]): Detection results. + current_classes (list[str]): Classes to evaluate. min_overlaps (np.ndarray): Overlap ranges. - compute_aos (bool): Whether to compute aos. Returns: - Tuple[float, float]: Bounding box and 3D bounding box AP. + np.ndarray: 3D bounding box AP. """ - - def _get_mAP(prec: np.ndarray) -> np.ndarray: - sums = 0 - for i in range(0, prec.shape[-1], 4): - sums = sums + prec[..., i] - return sums / 11 * 100 - - # min_overlaps: [num_minoverlap, metric, num_class] - difficultys = [0, 1, 2] - ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 0, min_overlaps, compute_aos) - # ret: [num_class, num_diff, num_minoverlap, num_sample_points] - # get 2D bbox mAP - mAP_bbox = _get_mAP(ret["precision"]) - + # min_overlaps: [num_minoverlap, num_class] # get 3D bbox mAP - ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2, min_overlaps) - mAP_3d = _get_mAP(ret["precision"]) - - return mAP_bbox, mAP_3d + ret = eval_class(gt_annos, dt_annos, current_classes, min_overlaps) + return np.mean(ret["precision"], axis=2) def get_coco_eval_result( gt_annos: list[dict], dt_annos: list[dict], current_classes: list[str], -) -> tuple[np.ndarray, np.ndarray]: +) -> np.ndarray: """Evaluates detections with COCO style AP. Args: - gt_annos (List[dict]): Ground truth annotations. - dt_annos (List[dict]): Detection results. 
- current_classes (List[str]): Classes to evaluate. + gt_annos (list[dict]): Ground truth annotations. + dt_annos (list[dict]): Detection results. + current_classes (list[str]): Classes to evaluate. Returns: - Tuple[np.ndarray, np.ndarray]: Bounding box and 3D bounding box AP. + np.ndarray: 3D bounding box AP. """ def do_coco_style_eval( @@ -760,52 +578,43 @@ def do_coco_style_eval( dt_annos: list[dict], current_classes: list[str], overlap_ranges: np.ndarray, - compute_aos: bool, - ) -> tuple[np.ndarray, np.ndarray]: + ) -> np.ndarray: """Evaluates detections with COCO style AP. Args: - gt_annos (List[dict]): Ground truth annotations. - dt_annos (List[dict]): Detection results. - current_classes (List[str]): Classes to evaluate. + gt_annos (list[dict]): Ground truth annotations. + dt_annos (list[dict]): Detection results. + current_classes (list[str]): Classes to evaluate. overlap_ranges (np.ndarray): Overlap ranges. - compute_aos (bool): Whether to compute aos. Returns: - Tuple[np.ndarray, np.ndarray]: Bounding box and 3D bounding box AP. + np.ndarray: 3D bounding box AP. """ min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]]) for i in range(overlap_ranges.shape[1]): - for j in range(overlap_ranges.shape[2]): - min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j][:2], 10) + min_overlaps[:, i] = np.linspace(*overlap_ranges[:, i], 10) - mAP_bbox, mAP_3d = do_eval_cut_version(gt_annos, dt_annos, current_classes, min_overlaps, compute_aos) + map_3d = do_eval_cut_version(gt_annos, dt_annos, current_classes, min_overlaps) - return mAP_bbox.mean(-1), mAP_3d.mean(-1) + result_str = "" - iou_range = [0.5, 0.95, 10] + for i, lbl in enumerate(current_classes): + result_str += f"\nclass: {lbl}\n" + "-" * len(f"class: {lbl}") + "\n" + for j, overlap in enumerate(min_overlaps): + result_str += f"AP@IoU={np.round(overlap[i],2)}: {np.round(map_3d[i][j] * 100, 2)}\n" + result_str += "\n" + logging.log(msg=result_str, level=logging.INFO) + + return map_3d.mean(0) + + iou_range = [0.5, 0.95] if not isinstance(current_classes, (list, tuple)): current_classes = [current_classes] - overlap_ranges = np.zeros([3, 3, len(current_classes)]) - for i, curcls in enumerate(current_classes): - # IoU from 0.5 to 0.95 - overlap_ranges[:, :, i] = np.array(iou_range)[:, np.newaxis] - result = "" - # check whether alpha is valid - compute_aos = False - mAPbbox, mAP3d = do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos) - - for j, curcls in enumerate(current_classes): - # mAP threshold array: [num_minoverlap, metric, class] - # mAP result: [num_class, num_diff, num_minoverlap] - o_range = np.array(iou_range)[[0, 2, 1]] - o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1) - result += print_str(f"{curcls} " "coco AP@{:.2f}:{:.2f}:{:.2f}:".format(*o_range)) - result += print_str(f"bbox AP:{mAPbbox[j, 0]:.2f}, {mAPbbox[j, 1]:.2f}, {mAPbbox[j, 2]:.2f}") - result += print_str(f"3d AP:{mAP3d[j, 0]:.2f}, {mAP3d[j, 1]:.2f}, {mAP3d[j, 2]:.2f}") - - print("\n COCO style evaluation results: \n", result) - - return mAPbbox, mAP3d + overlap_ranges = np.zeros([2, len(current_classes)]) + for i in range(len(current_classes)): + # iou from 0.5 to 0.95 + overlap_ranges[:, i] = np.array(iou_range) + + return do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges) diff --git a/src/otx/core/model/base.py b/src/otx/core/model/base.py index 4ed1aa61ddc..a48325ca98c 100644 --- a/src/otx/core/model/base.py +++ b/src/otx/core/model/base.py @@ -124,7 +124,6 @@ def __init__( 
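Editorial aside on the simplified KITTI eval above: with the metric and difficulty axes removed, min_overlaps is indexed as [num_minoverlap, num_class] and the reported score is a plain mean over sampled PR points and IoU thresholds. A compact sketch of that flow with made-up class names and a stand-in precision array (not the library's actual output):

    import numpy as np

    classes = ["Car", "Pedestrian"]               # hypothetical label names
    overlap_ranges = np.zeros([2, len(classes)])
    for i in range(len(classes)):
        overlap_ranges[:, i] = np.array([0.5, 0.95])

    # ten IoU thresholds per class: 0.50, 0.55, ..., 0.95
    min_overlaps = np.zeros([10, len(classes)])
    for i in range(overlap_ranges.shape[1]):
        min_overlaps[:, i] = np.linspace(*overlap_ranges[:, i], 10)

    # eval_class returns precision shaped [num_class, num_minoverlap, num_samples_pts];
    # averaging over the PR samples gives per-class, per-threshold AP, and averaging
    # over the class axis yields the per-IoU AP that do_coco_style_eval returns.
    precision = np.random.rand(len(classes), 10, 41)   # stand-in for eval_class output
    ap_per_class_per_iou = precision.mean(axis=2)
    ap_per_iou = ap_per_class_per_iou.mean(axis=0)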
self.input_size = input_size self.classification_layers: dict[str, dict[str, Any]] = {} self.model = self._create_model() - self._explain_mode = False self.optimizer_callable = ensure_callable(optimizer) self.scheduler_callable = ensure_callable(scheduler) self.metric_callable = ensure_callable(metric) @@ -1097,11 +1096,6 @@ def model_adapter_parameters(self) -> dict: def _set_label_info(self, label_info: LabelInfoTypes) -> None: """Set this model label information.""" new_label_info = self._dispatch_label_info(label_info) - - if self._label_info != new_label_info: - msg = "OVModel strictly does not allow overwrite label_info if they are different each other." - raise ValueError(msg) - self._label_info = new_label_info def _create_label_info_from_ov_ir(self) -> LabelInfo: diff --git a/src/otx/core/model/detection.py b/src/otx/core/model/detection.py index fb6acccf12d..023b5268388 100644 --- a/src/otx/core/model/detection.py +++ b/src/otx/core/model/detection.py @@ -287,7 +287,7 @@ def _export_parameters(self) -> TaskLevelExportParameters: return super()._export_parameters.wrap( model_type="ssd", task_type="detection", - confidence_threshold=self.hparams.get("best_confidence_threshold", None), + confidence_threshold=max(0.35, self.hparams.get("best_confidence_threshold", 0.35)), iou_threshold=0.5, tile_config=self.tile_config if self.tile_config.enable_tiler else None, ) diff --git a/src/otx/core/model/detection_3d.py b/src/otx/core/model/detection_3d.py index caa0d14090f..0e73c73bcfa 100644 --- a/src/otx/core/model/detection_3d.py +++ b/src/otx/core/model/detection_3d.py @@ -5,23 +5,25 @@ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, NamedTuple import numpy as np import torch +from model_api.models import ImageModel from torchvision.ops import box_convert +from otx.algo.object_detection_3d.utils.utils import box_cxcylrtb_to_xyxy from otx.algo.utils.mmengine_utils import load_checkpoint -from otx.core.data.dataset.utils.kitti_utils import class2angle -from otx.core.data.entity.base import ImageInfo +from otx.core.data.entity.base import ImageInfo, OTXBatchLossEntity from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DBatchPredEntity from otx.core.metrics import MetricInput from otx.core.metrics.average_precision_3d import KittiMetric -from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable, OTXModel +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable, OTXModel, OVModel from otx.core.types.export import TaskLevelExportParameters if TYPE_CHECKING: from lightning.pytorch.cli import LRSchedulerCallable, OptimizerCallable + from model_api.adapters.inference_adapter import InferenceAdapter from torch import nn from otx.core.metrics import MetricCallable @@ -73,77 +75,86 @@ def _create_model(self) -> nn.Module: def _export_parameters(self) -> TaskLevelExportParameters: """Defines parameters required to export a particular model implementation.""" return super()._export_parameters.wrap( - model_type="ssd", - task_type="detection", + model_type="mono_3d_det", + task_type="3d_detection", ) - def _convert_pred_entity_to_compute_metric( + def _customize_inputs( self, - preds: Det3DBatchPredEntity, - inputs: Det3DBatchDataEntity, - ) -> MetricInput: - """Converts the prediction entity to the format required for computing metrics. - - Args: - preds (Det3DBatchPredEntity): Prediction entity. - inputs (Det3DBatchDataEntity): Input data entity. 
- """ - boxes = preds.boxes_3d - # bbox 2d decoding - xywh_2d = box_convert(preds.boxes, "xyxy", "cxcywh") - - xs3d = boxes[:, :, 0:1] - ys3d = boxes[:, :, 1:2] - xs2d = xywh_2d[:, :, 0:1] - ys2d = xywh_2d[:, :, 1:2] - - batch = len(boxes) - labels = preds.labels.view(batch, -1, 1) - scores = preds.scores.view(batch, -1, 1) - xs2d = xs2d.view(batch, -1, 1) - ys2d = ys2d.view(batch, -1, 1) - xs3d = xs3d.view(batch, -1, 1) - ys3d = ys3d.view(batch, -1, 1) - - detections = ( - torch.cat( - [ - labels, - scores, - xs2d, - ys2d, - preds.size_2d, - preds.depth[:, :, 0:1], - preds.heading_angle, - preds.size_3d, - xs3d, - ys3d, - torch.exp(-preds.depth[:, :, 1:2]), - ], - dim=2, - ) - .detach() - .cpu() - .numpy() - ) - - img_sizes = np.array([img_info.ori_shape for img_info in inputs.imgs_info]) - calib_matrix = [p2.detach().cpu().numpy() for p2 in inputs.calib_matrix] - result_list = self._decode_detections_for_kitti_format( - detections, - img_sizes, - calib_matrix, - class_names=self.label_info.label_names, - threshold=self.score_threshold, + entity: Det3DBatchDataEntity, + ) -> dict[str, Any]: + # prepare bboxes for the model + targets_list = [] + img_sizes = torch.from_numpy(np.array([img_info.ori_shape for img_info in entity.imgs_info])).to( + device=entity.images.device, ) + key_list = ["labels", "boxes", "depth", "size_3d", "heading_angle", "boxes_3d"] + for bz in range(len(entity.imgs_info)): + target_dict = {} + for key in key_list: + target_dict[key] = getattr(entity, key)[bz] + targets_list.append(target_dict) return { - "preds": result_list, - "target": inputs.original_kitti_format, # type: ignore[dict-item] + "images": entity.images, + "calibs": torch.cat([p2.unsqueeze(0) for p2 in entity.calib_matrix], dim=0), + "targets": targets_list, + "img_sizes": img_sizes, + "mode": "loss" if self.training else "predict", } + def _customize_outputs( + self, + outputs: dict[str, torch.Tensor], + inputs: Det3DBatchDataEntity, + ) -> Det3DBatchPredEntity | OTXBatchLossEntity: + if self.training: + if not isinstance(outputs, dict): + raise TypeError(outputs) + + losses = OTXBatchLossEntity() + for k, v in outputs.items(): + if isinstance(v, list): + losses[k] = sum(v) + elif isinstance(v, torch.Tensor): + losses[k] = v + else: + msg = "Loss output should be list or torch.tensor but got {type(v)}" + raise TypeError(msg) + return losses + + labels, scores, size_3d, heading_angle, boxes_3d, depth = self.extract_dets_from_outputs(outputs) + # bbox 2d decoding + boxes_2d = box_cxcylrtb_to_xyxy(boxes_3d) + xywh_2d = box_convert(boxes_2d, "xyxy", "cxcywh") + # size 2d decoding + size_2d = xywh_2d[:, :, 2:4] + + return Det3DBatchPredEntity( + batch_size=inputs.batch_size, + images=inputs.images, + imgs_info=inputs.imgs_info, + calib_matrix=inputs.calib_matrix, + boxes=boxes_2d, + labels=labels, + boxes_3d=boxes_3d, + size_2d=size_2d, + size_3d=size_3d, + depth=depth, + heading_angle=heading_angle, + scores=scores, + original_kitti_format=[None], + ) + + def _convert_pred_entity_to_compute_metric( + self, + preds: Det3DBatchPredEntity, + inputs: Det3DBatchDataEntity, + ) -> MetricInput: + return _convert_pred_entity_to_compute_metric(preds, inputs, self.label_info.label_names, self.score_threshold) + @staticmethod - def _decode_detections_for_kitti_format( + def decode_detections_for_kitti_format( dets: np.ndarray, img_size: np.ndarray, calib_matrix: list[np.ndarray], @@ -153,7 +164,34 @@ def _decode_detections_for_kitti_format( """Decode the detection results for KITTI format.""" def 
_get_heading_angle(heading: np.ndarray) -> np.ndarray: - """Get heading angle from the prediction.""" + """Get heading angle from the prediction. + + Args: + heading (np.ndarray): The heading prediction. + + Returns: + np.ndarray: The heading angle in label format. + """ + + def class2angle(cls: int, residual: float, to_label_format: bool = False) -> float: + """Inverse function to angle2class. + + Args: + cls (int): The class index. + residual (float): The residual angle. + to_label_format (bool): Whether to return the angle in label format. + + Returns: + float: The angle in label format. + """ + num_heading_bin = 12 + angle_per_class = 2 * np.pi / float(num_heading_bin) + angle_center = cls * angle_per_class + angle = angle_center + residual + if to_label_format and angle > np.pi: + angle = angle - 2 * np.pi + return angle + heading_bin, heading_res = heading[0:12], heading[12:24] cls = np.argmax(heading_bin) res = heading_res[cls] @@ -203,10 +241,10 @@ def _img_to_rect(calib_matrix: np.ndarray, u: np.ndarray, v: np.ndarray, depth_r continue # 2d bboxs decoding - x = dets[i, j, 2] * img_size[i][0] - y = dets[i, j, 3] * img_size[i][1] - w = dets[i, j, 4] * img_size[i][0] - h = dets[i, j, 5] * img_size[i][1] + x = dets[i, j, 2] * img_size[i][1] + y = dets[i, j, 3] * img_size[i][0] + w = dets[i, j, 4] * img_size[i][1] + h = dets[i, j, 5] * img_size[i][0] bbox = [x - w / 2, y - h / 2, x + w / 2, y + h / 2] # 3d bboxs decoding @@ -217,8 +255,8 @@ def _img_to_rect(calib_matrix: np.ndarray, u: np.ndarray, v: np.ndarray, depth_r dimension = dets[i, j, 31:34] # positions decoding - x3d = dets[i, j, 34] * img_size[i][0] - y3d = dets[i, j, 35] * img_size[i][1] + x3d = dets[i, j, 34] * img_size[i][1] + y3d = dets[i, j, 35] * img_size[i][0] location = _img_to_rect(calib_matrix[i], x3d, y3d, depth).reshape(-1) location[1] += dimension[0] / 2 @@ -255,31 +293,7 @@ def get_dummy_input(self, batch_size: int = 1) -> Det3DBatchDataEntity: msg = f"Input size attribute is not set for {self.__class__}" raise ValueError(msg) - images = [torch.rand(3, *self.input_size) for _ in range(batch_size)] - calib_matrix = [torch.rand(3, 4) for _ in range(batch_size)] - infos = [] - for i, img in enumerate(images): - infos.append( - ImageInfo( - img_idx=i, - img_shape=img.shape, - ori_shape=img.shape, - ), - ) - return Det3DBatchDataEntity( - batch_size, - images, - infos, - boxes=[], - labels=[], - calib_matrix=calib_matrix, - boxes_3d=[], - size_2d=[], - size_3d=[], - depth=[], - heading_angle=[], - original_kitti_format=[], - ) + return _generate_dummy_input(self.input_size, batch_size) def get_classification_layers(self, prefix: str = "model.") -> dict[str, dict[str, int]]: """Get final classification layer information for incremental learning case.""" @@ -295,3 +309,347 @@ def get_classification_layers(self, prefix: str = "model.") -> dict[str, dict[st num_extra_classes = 6 * sample_model_dim - 5 * incremental_model_dim classification_layers[prefix + key] = {"stride": stride, "num_extra_classes": num_extra_classes} return classification_layers + + +class MonoDETRModel(ImageModel): + """A wrapper for MonoDETR 3d object detection model.""" + + __model__ = "mono_3d_det" + + def __init__(self, inference_adapter: InferenceAdapter, configuration: dict[str, Any], preload: bool = False): + """Initializes a 3d detection model. + + Args: + inference_adapter (InferenceAdapter): inference adapter containing the underlying model. 
+ configuration (dict, optional): configuration overrides the model parameters (see parameters() method). + preload (bool, optional): forces inference adapter to load the model. Defaults to False. + """ + super().__init__(inference_adapter, configuration, preload) + self._check_io_number(3, 5) + + def preprocess(self, inputs: dict[str, np.ndarray]) -> tuple[dict[str, Any], ...]: + """Preprocesses the input data for the model. + + Args: + inputs (dict[str, np.ndarray]): a dict with image, calibration matrix, and image size + + Returns: + tuple[dict[str, Any], ...]: a tuple with the preprocessed inputs and meta information + """ + return { + self.image_blob_name: inputs["image"][None], + "calib_matrix": inputs["calib"], + "img_sizes": inputs["img_size"][None], + }, { + "original_shape": inputs["image"].shape, + "resized_shape": (self.h, self.w, self.c), + } + + def _get_inputs(self) -> tuple[list[Any], list[Any]]: + """Defines the model inputs for images and additional info. + + Raises: + WrapperError: if the wrapper failed to define appropriate inputs for images + + Returns: + - list of inputs names for images + - list of inputs names for additional info + """ + image_blob_names, image_info_blob_names = [], [] + for name, metadata in self.inputs.items(): + if len(metadata.shape) == 4: + image_blob_names.append(name) + elif len(metadata.shape) == 2: + image_info_blob_names.append(name) + + if not image_blob_names: + self.raise_error( + "Failed to identify the input for the image: no 4D input layer found", + ) + return image_blob_names, image_info_blob_names + + def postprocess( + self, + outputs: dict[str, np.ndarray], + meta: dict[str, Any], + ) -> dict[str, Any]: + """Applies SCC decoded to the model outputs. + + Args: + outputs (dict[str, np.ndarray]): raw outputs of the model + meta (dict[str, Any]): meta information about the input data + + Returns: + dict[str, Any]: postprocessed model outputs + """ + result = {} + for k in outputs: + result[k] = np.copy(outputs[k]) + + return result + + +class OV3DDetectionModel(OVModel[Det3DBatchDataEntity, Det3DBatchPredEntity]): + """3d detection model compatible for OpenVINO IR inference. + + It can consume OpenVINO IR model path or model name from Intel OMZ repository + and create the OTX 3d detection model compatible for OTX testing pipeline. 
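For reference, a minimal numeric sketch of the 12-bin heading decode used by decode_detections_for_kitti_format above; the prediction vector here is synthetic, only the bin arithmetic is taken from the code:

    import numpy as np

    num_heading_bin = 12
    angle_per_class = 2 * np.pi / num_heading_bin

    heading = np.random.rand(24)        # synthetic prediction: 12 bin scores + 12 residuals
    cls = int(np.argmax(heading[:12]))  # winning bin
    res = float(heading[12 + cls])      # residual of that bin
    angle = cls * angle_per_class + res
    if angle > np.pi:                   # wrap into (-pi, pi] to match the KITTI label format
        angle -= 2 * np.pi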
+ """ + + def __init__( + self, + model_name: str, + model_type: str = "mono_3d_det", + async_inference: bool = True, + max_num_requests: int | None = None, + use_throughput_mode: bool = True, + model_api_configuration: dict[str, Any] | None = None, + metric: MetricCallable = KittiMetric, + score_threshold: float = 0.2, + **kwargs, + ) -> None: + super().__init__( + model_name=model_name, + model_type=model_type, + async_inference=async_inference, + max_num_requests=max_num_requests, + use_throughput_mode=use_throughput_mode, + model_api_configuration=model_api_configuration, + metric=metric, + ) + self.score_threshold = score_threshold + + def _customize_inputs( + self, + entity: Det3DBatchDataEntity, + ) -> dict[str, Any]: + img_sizes = np.array([img_info.ori_shape for img_info in entity.imgs_info]) + images = [np.transpose(im.cpu().numpy(), (1, 2, 0)) for im in entity.images] + + return { + "images": images, + "calibs": [p2.unsqueeze(0).cpu().numpy() for p2 in entity.calib_matrix], + "targets": [], + "img_sizes": img_sizes, + "mode": "predict", + } + + def _customize_outputs( + self, + outputs: list[NamedTuple], + inputs: Det3DBatchDataEntity, + ) -> Det3DBatchPredEntity | OTXBatchLossEntity: + stacked_outputs: dict[str, Any] = {} + + for output in outputs: + for k in output: + if k in stacked_outputs: + stacked_outputs[k] = torch.cat((stacked_outputs[k], torch.tensor(output[k])), 0) + else: + stacked_outputs[k] = torch.tensor(output[k]) + + labels, scores, size_3d, heading_angle, boxes_3d, depth = self.extract_dets_from_outputs(stacked_outputs) + # bbox 2d decoding + boxes_2d = box_cxcylrtb_to_xyxy(boxes_3d) + xywh_2d = box_convert(boxes_2d, "xyxy", "cxcywh") + # size 2d decoding + size_2d = xywh_2d[:, :, 2:4] + + return Det3DBatchPredEntity( + batch_size=len(outputs), + images=inputs.images, + imgs_info=inputs.imgs_info, + calib_matrix=inputs.calib_matrix, + boxes=boxes_2d, + labels=labels, + boxes_3d=boxes_3d, + size_2d=size_2d, + size_3d=size_3d, + depth=depth, + heading_angle=heading_angle, + scores=scores, + original_kitti_format=[None], + ) + + def _forward(self, inputs: Det3DBatchDataEntity) -> Det3DBatchPredEntity: + """Model forward function.""" + all_inputs = self._customize_inputs(inputs) + + model_ready_inputs = [] + for image, calib, img_size in zip(all_inputs["images"], all_inputs["calibs"], all_inputs["img_sizes"]): + model_ready_inputs.append( + { + "image": image, + "calib": calib, + "img_size": img_size, + }, + ) + + if self.async_inference: + outputs = self.model.infer_batch(model_ready_inputs) + else: + outputs = [] + for model_input in model_ready_inputs: + outputs.append(self.model(model_input)) + + customized_outputs = self._customize_outputs(outputs, inputs) + + if isinstance(customized_outputs, OTXBatchLossEntity): + raise TypeError(customized_outputs) + + return customized_outputs + + def transform_fn(self, data_batch: Det3DBatchDataEntity) -> dict: + """Data transform function for PTQ.""" + all_inputs = self._customize_inputs(data_batch) + image = all_inputs["images"][0] + model = self.model + resized_image = model.resize(image, (model.w, model.h)) + resized_image = model.input_transform(resized_image) + + return { + "images": model._change_layout(resized_image), # noqa: SLF001, + "calib_matrix": all_inputs["calibs"][0], + "img_sizes": all_inputs["img_sizes"][0][None], + } + + @staticmethod + def extract_dets_from_outputs(outputs: dict[str, torch.Tensor], topk: int = 50) -> tuple[torch.Tensor, ...]: + """Extract detection results from model outputs.""" + # b, 
q, c + out_logits = outputs["scores"] + out_bbox = outputs["boxes_3d"] + + prob = out_logits + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), topk, dim=1) + + # final scores + scores = topk_values + # final indexes + topk_boxes = (topk_indexes // out_logits.shape[2]).unsqueeze(-1) + # final labels + labels = topk_indexes % out_logits.shape[2] + + heading = outputs["heading_angle"] + size_3d = outputs["size_3d"] + depth = outputs["depth"] + # decode boxes + boxes_3d = torch.gather(out_bbox, 1, topk_boxes.repeat(1, 1, 6)) # b, q', 4 + # heading angle decoding + heading = torch.gather(heading, 1, topk_boxes.repeat(1, 1, 24)) + # depth decoding + depth = torch.gather(depth, 1, topk_boxes.repeat(1, 1, 2)) + # 3d dims decoding + size_3d = torch.gather(size_3d, 1, topk_boxes.repeat(1, 1, 3)) + + return labels, scores, size_3d, heading, boxes_3d, depth + + def _convert_pred_entity_to_compute_metric( + self, + preds: Det3DBatchPredEntity, + inputs: Det3DBatchDataEntity, + ) -> MetricInput: + return _convert_pred_entity_to_compute_metric(preds, inputs, self.label_info.label_names, self.score_threshold) + + def get_dummy_input(self, batch_size: int = 1) -> Det3DBatchDataEntity: + """Returns a dummy input for 3d object detection model.""" + return _generate_dummy_input((224, 224), batch_size) + + +def _convert_pred_entity_to_compute_metric( + preds: Det3DBatchPredEntity, + inputs: Det3DBatchDataEntity, + label_names: list[str], + score_threshold: float, +) -> MetricInput: + """Converts the prediction entity to the format required for computing metrics. + + Args: + preds (Det3DBatchPredEntity): Prediction entity. + inputs (Det3DBatchDataEntity): Input data entity. + label_names (list[str]): List of label names. + score_threshold (float): Score threshold for filtering the predictions. 
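A small shape sketch of the top-k decode in extract_dets_from_outputs above; all sizes are illustrative, not the model's real dimensions:

    import torch

    b, q, c = 2, 100, 3           # batch, queries, classes (hypothetical sizes)
    scores = torch.rand(b, q, c)  # per-query class probabilities
    boxes_3d = torch.rand(b, q, 6)

    topk = 5
    topk_values, topk_indexes = torch.topk(scores.view(b, -1), topk, dim=1)
    topk_boxes = (topk_indexes // c).unsqueeze(-1)  # query index of each selected detection
    labels = topk_indexes % c                       # class index of each selected detection
    # gather the 6-dim 3D box of every selected query -> shape (b, topk, 6)
    selected = torch.gather(boxes_3d, 1, topk_boxes.repeat(1, 1, 6))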
+ """ + boxes = preds.boxes_3d + # bbox 2d decoding + xywh_2d = box_convert(preds.boxes, "xyxy", "cxcywh") + + xs3d = boxes[:, :, 0:1] + ys3d = boxes[:, :, 1:2] + xs2d = xywh_2d[:, :, 0:1] + ys2d = xywh_2d[:, :, 1:2] + + batch = len(boxes) + labels = preds.labels.view(batch, -1, 1) + scores = preds.scores.view(batch, -1, 1) + xs2d = xs2d.view(batch, -1, 1) + ys2d = ys2d.view(batch, -1, 1) + xs3d = xs3d.view(batch, -1, 1) + ys3d = ys3d.view(batch, -1, 1) + + detections = ( + torch.cat( + [ + labels, + scores, + xs2d, + ys2d, + preds.size_2d, + preds.depth[:, :, 0:1], + preds.heading_angle, + preds.size_3d, + xs3d, + ys3d, + torch.exp(-preds.depth[:, :, 1:2]), + ], + dim=2, + ) + .detach() + .cpu() + .numpy() + ) + + img_sizes = np.array([img_info.ori_shape for img_info in inputs.imgs_info]) + calib_matrix = [p2.detach().cpu().numpy() for p2 in inputs.calib_matrix] + result_list = OTX3DDetectionModel.decode_detections_for_kitti_format( + detections, + img_sizes, + calib_matrix, + class_names=label_names, + threshold=score_threshold, + ) + + return { + "preds": result_list, + "target": inputs.original_kitti_format, # type: ignore[dict-item] + } + + +def _generate_dummy_input(input_size: tuple[int, ...], batch_size: int = 1) -> Det3DBatchDataEntity: + """Returns a dummy input for 3d object detection model.""" + images = torch.rand(batch_size, 3, *input_size) + calib_matrix = [torch.rand(3, 4) for _ in range(batch_size)] + infos = [] + for i, img in enumerate(images): + infos.append( + ImageInfo( + img_idx=i, + img_shape=img.shape[1:], + ori_shape=img.shape[1:], + ), + ) + + return Det3DBatchDataEntity( + batch_size, + images, + infos, + boxes=[torch.Tensor(0)] * batch_size, + labels=[torch.LongTensor(0)] * batch_size, + calib_matrix=calib_matrix, + boxes_3d=[torch.LongTensor(0)] * batch_size, + size_2d=[], + size_3d=[torch.LongTensor(0)] * batch_size, + depth=[torch.LongTensor(0)] * batch_size, + heading_angle=[torch.LongTensor(0)] * batch_size, + original_kitti_format=[], + ) diff --git a/src/otx/core/model/segmentation.py b/src/otx/core/model/segmentation.py index 9595b218231..a22cc15fbc4 100644 --- a/src/otx/core/model/segmentation.py +++ b/src/otx/core/model/segmentation.py @@ -5,6 +5,7 @@ from __future__ import annotations +import copy import json import logging as log from abc import abstractmethod @@ -126,7 +127,12 @@ def _build_model(self) -> nn.Module: """ def _customize_inputs(self, entity: SegBatchDataEntity) -> dict[str, Any]: - mode = "loss" if self.training else "predict" + if self.training: + mode = "loss" + elif self.explain_mode: + mode = "explain" + else: + mode = "predict" if self.train_type == OTXTrainType.SEMI_SUPERVISED and mode == "loss": if not isinstance(entity, dict): @@ -162,6 +168,16 @@ def _customize_outputs( losses[k] = v return losses + if self.explain_mode: + return SegBatchPredEntity( + batch_size=len(outputs["preds"]), + images=inputs.images, + imgs_info=inputs.imgs_info, + scores=[], + masks=outputs["preds"], + feature_vector=outputs["feature_vector"], + ) + return SegBatchPredEntity( batch_size=len(outputs), images=inputs.images, @@ -173,12 +189,20 @@ def _customize_outputs( @property def _export_parameters(self) -> TaskLevelExportParameters: """Defines parameters required to export a particular model implementation.""" + if self.label_info.label_names[0] == "otx_background_lbl": + # remove otx background label for export + modified_label_info = copy.deepcopy(self.label_info) + modified_label_info.label_names.pop(0) + else: + modified_label_info = 
self.label_info + return super()._export_parameters.wrap( model_type="Segmentation", task_type="segmentation", return_soft_prediction=True, soft_threshold=0.5, blur_strength=-1, + label_info=modified_label_info, tile_config=self.tile_config if self.tile_config.enable_tiler else None, ) @@ -199,7 +223,7 @@ def _exporter(self) -> OTXModelExporter: swap_rgb=False, via_onnx=False, onnx_export_configuration=None, - output_names=None, + output_names=["preds", "feature_vector"] if self.explain_mode else None, ) def _convert_pred_entity_to_compute_metric( @@ -207,6 +231,16 @@ def _convert_pred_entity_to_compute_metric( preds: SegBatchPredEntity, inputs: SegBatchDataEntity, ) -> MetricInput: + """Convert prediction and input entities to a format suitable for metric computation. + + Args: + preds (SegBatchPredEntity): The predicted segmentation batch entity containing predicted masks. + inputs (SegBatchDataEntity): The input segmentation batch entity containing ground truth masks. + + Returns: + MetricInput: A list of dictionaries where each dictionary contains 'preds' and 'target' keys + corresponding to the predicted and target masks for metric evaluation. + """ return [ { "preds": pred_mask, @@ -280,8 +314,26 @@ def forward_tiles(self, inputs: OTXTileBatchDataEntity[SegBatchDataEntity]) -> S def forward_for_tracing(self, image: Tensor) -> Tensor | dict[str, Tensor]: """Model forward function used for the model tracing during model exportation.""" - raw_outputs = self.model(inputs=image, mode="tensor") - return torch.softmax(raw_outputs, dim=1) + if self.explain_mode: + outputs = self.model(inputs=image, mode="explain") + outputs["preds"] = torch.softmax(outputs["preds"], dim=1) + return outputs + + outputs = self.model(inputs=image, mode="tensor") + return torch.softmax(outputs, dim=1) + + def forward_explain(self, inputs: SegBatchDataEntity) -> SegBatchPredEntity: + """Model forward explain function.""" + outputs = self.model(inputs=inputs.images, mode="explain") + + return SegBatchPredEntity( + batch_size=len(outputs["preds"]), + images=inputs.images, + imgs_info=inputs.imgs_info, + scores=[], + masks=outputs["preds"], + feature_vector=outputs["feature_vector"], + ) def get_dummy_input(self, batch_size: int = 1) -> SegBatchDataEntity: """Returns a dummy input for semantic segmentation model.""" @@ -371,25 +423,17 @@ def _customize_outputs( outputs: list[ImageResultWithSoftPrediction], inputs: SegBatchDataEntity, ) -> SegBatchPredEntity | OTXBatchLossEntity: - if outputs and outputs[0].saliency_map.size != 1: - predicted_s_maps = [out.saliency_map for out in outputs] - predicted_f_vectors = [out.feature_vector for out in outputs] - return SegBatchPredEntity( - batch_size=len(outputs), - images=inputs.images, - imgs_info=inputs.imgs_info, - scores=[], - masks=[tv_tensors.Mask(mask.resultImage, device=self.device) for mask in outputs], - saliency_map=predicted_s_maps, - feature_vector=predicted_f_vectors, - ) - + masks = [tv_tensors.Mask(mask.resultImage, device=self.device) for mask in outputs] + predicted_f_vectors = ( + [out.feature_vector for out in outputs] if outputs and outputs[0].feature_vector.size != 1 else [] + ) return SegBatchPredEntity( batch_size=len(outputs), images=inputs.images, imgs_info=inputs.imgs_info, scores=[], - masks=[tv_tensors.Mask(mask.resultImage, device=self.device) for mask in outputs], + masks=masks, + feature_vector=predicted_f_vectors, ) def _convert_pred_entity_to_compute_metric( @@ -397,6 +441,16 @@ def _convert_pred_entity_to_compute_metric( preds: 
SegBatchPredEntity, inputs: SegBatchDataEntity, ) -> MetricInput: + """Convert prediction and input entities to a format suitable for metric computation. + + Args: + preds (SegBatchPredEntity): The predicted segmentation batch entity containing predicted masks. + inputs (SegBatchDataEntity): The input segmentation batch entity containing ground truth masks. + + Returns: + MetricInput: A list of dictionaries where each dictionary contains 'preds' and 'target' keys + corresponding to the predicted and target masks for metric evaluation. + """ return [ { "preds": pred_mask, diff --git a/src/otx/core/types/label.py b/src/otx/core/types/label.py index 7f00aa0b496..c89f67d7fd6 100644 --- a/src/otx/core/types/label.py +++ b/src/otx/core/types/label.py @@ -169,10 +169,8 @@ def from_dm_label_groups(cls, dm_label_categories: LabelCategories) -> HLabelInf dm_label_categories (LabelCategories): the label categories of datumaro. """ - def get_exclusive_group_info(all_groups: list[Label | list[Label]]) -> dict[str, Any]: + def get_exclusive_group_info(exclusive_groups: list[Label | list[Label]]) -> dict[str, Any]: """Get exclusive group information.""" - exclusive_groups = [g for g in all_groups if len(g) > 1] - last_logits_pos = 0 num_single_label_classes = 0 head_idx_to_logits_range = {} @@ -193,12 +191,10 @@ def get_exclusive_group_info(all_groups: list[Label | list[Label]]) -> dict[str, } def get_single_label_group_info( - all_groups: list[Label | list[Label]], + single_label_groups: list[Label | list[Label]], num_exclusive_groups: int, ) -> dict[str, Any]: """Get single label group information.""" - single_label_groups = [g for g in all_groups if len(g) == 1] - class_to_idx = {} for i, group in enumerate(single_label_groups): @@ -256,24 +252,31 @@ def convert_labels_if_needed( label_names = [item.name for item in dm_label_categories.items] all_groups = convert_labels_if_needed(dm_label_categories, label_names) - exclusive_group_info = get_exclusive_group_info(all_groups) - single_label_group_info = get_single_label_group_info(all_groups, exclusive_group_info["num_multiclass_heads"]) + exclusive_groups = [g for g in all_groups if len(g) > 1] + exclusive_group_info = get_exclusive_group_info(exclusive_groups) + single_label_groups = [g for g in all_groups if len(g) == 1] + single_label_group_info = get_single_label_group_info( + single_label_groups, + exclusive_group_info["num_multiclass_heads"], + ) merged_class_to_idx = merge_class_to_idx( exclusive_group_info["class_to_idx"], single_label_group_info["class_to_idx"], ) + label_to_idx = {lbl: i for i, lbl in enumerate(merged_class_to_idx.keys())} + return HLabelInfo( label_names=label_names, - label_groups=all_groups, + label_groups=exclusive_groups + single_label_groups, num_multiclass_heads=exclusive_group_info["num_multiclass_heads"], num_multilabel_classes=single_label_group_info["num_multilabel_classes"], head_idx_to_logits_range=exclusive_group_info["head_idx_to_logits_range"], num_single_label_classes=exclusive_group_info["num_single_label_classes"], class_to_group_idx=merged_class_to_idx, - all_groups=all_groups, - label_to_idx=dm_label_categories._indices, # noqa: SLF001 + all_groups=exclusive_groups + single_label_groups, + label_to_idx=label_to_idx, label_tree_edges=get_label_tree_edges(dm_label_categories.items), empty_multiclass_head_indices=[], # consider the label removing case ) diff --git a/src/otx/engine/adaptive_bs/bs_search_algo.py b/src/otx/engine/adaptive_bs/bs_search_algo.py index a029d10aa6d..c0400eff284 100644 --- 
a/src/otx/engine/adaptive_bs/bs_search_algo.py +++ b/src/otx/engine/adaptive_bs/bs_search_algo.py @@ -112,8 +112,14 @@ def auto_decrease_batch_size(self) -> int: break if available_bs == 0: - msg = "Current device can't train model even with 2." - raise RuntimeError(msg) + if oom: + msg = "Current device can't train model even with 2." + raise RuntimeError(msg) + logger.warning( + "Even with a batch size of 2, most of the memory is used, " + "which could cause the training to fail midway.", + ) + available_bs = 2 return available_bs @@ -141,8 +147,14 @@ def find_big_enough_batch_size(self, drop_last: bool = False) -> int: if oom or bs_mem_usage > self._mem_upper_bound: self._default_bs -= 2 if self._default_bs <= 0: - msg = "Current device can't train model even with 2." - raise RuntimeError(msg) + if oom: + msg = "Current device can't train model even with 2." + raise RuntimeError(msg) + logger.warning( + "Even with a batch size of 2, most of the memory is used, " + "which could cause the training to fail midway.", + ) + return 2 return self.auto_decrease_batch_size() diff --git a/src/otx/engine/engine.py b/src/otx/engine/engine.py index 47647caf7d6..8803870f3a4 100644 --- a/src/otx/engine/engine.py +++ b/src/otx/engine/engine.py @@ -5,6 +5,7 @@ from __future__ import annotations +import copy import csv import inspect import logging @@ -366,18 +367,32 @@ def test( # NOTE, trainer.test takes only lightning based checkpoint. # So, it can't take the OTX1.x checkpoint. if checkpoint is not None and not is_ir_ckpt: + kwargs_user_input: dict[str, Any] = {} + if self.task == OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: + # to update user's custom infer_reference_info_root through cli for zero-shot learning + # TODO (sungchul): revisit for better solution + kwargs_user_input.update(infer_reference_info_root=self.model.infer_reference_info_root) + model_cls = model.__class__ - model = model_cls.load_from_checkpoint(checkpoint_path=checkpoint, **model.hparams) + model = model_cls.load_from_checkpoint(checkpoint_path=checkpoint, **kwargs_user_input) if model.label_info != self.datamodule.label_info: - msg = ( - "To launch a test pipeline, the label information should be same " - "between the training and testing datasets. " - "Please check whether you use the same dataset: " - f"model.label_info={model.label_info}, " - f"datamodule.label_info={self.datamodule.label_info}" - ) - raise ValueError(msg) + if ( + self.task == "SEMANTIC_SEGMENTATION" + and "otx_background_lbl" in self.datamodule.label_info.label_names + and (len(self.datamodule.label_info.label_names) - len(model.label_info.label_names) == 1) + ): + # workaround for background label + model.label_info = copy.deepcopy(self.datamodule.label_info) + else: + msg = ( + "To launch a test pipeline, the label information should be same " + "between the training and testing datasets. 
" + "Please check whether you use the same dataset: " + f"model.label_info={model.label_info}, " + f"datamodule.label_info={self.datamodule.label_info}" + ) + raise ValueError(msg) self._build_trainer(**kwargs) @@ -453,8 +468,14 @@ def predict( datamodule = self._auto_configurator.update_ov_subset_pipeline(datamodule=datamodule, subset="test") if checkpoint is not None and not is_ir_ckpt: + kwargs_user_input: dict[str, Any] = {} + if self.task == OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: + # to update user's custom infer_reference_info_root through cli for zero-shot learning + # TODO (sungchul): revisit for better solution + kwargs_user_input.update(infer_reference_info_root=self.model.infer_reference_info_root) + model_cls = model.__class__ - model = model_cls.load_from_checkpoint(checkpoint_path=checkpoint, **model.hparams) + model = model_cls.load_from_checkpoint(checkpoint_path=checkpoint, **kwargs_user_input) if model.label_info != self.datamodule.label_info: msg = ( @@ -565,11 +586,17 @@ def export( ) if not is_ir_ckpt: + kwargs_user_input: dict[str, Any] = {} + if self.task == OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: + # to update user's custom infer_reference_info_root through cli for zero-shot learning + # TODO (sungchul): revisit for better solution + kwargs_user_input.update(infer_reference_info_root=self.model.infer_reference_info_root) + model_cls = self.model.__class__ self.model = model_cls.load_from_checkpoint( checkpoint_path=checkpoint, map_location="cpu", - **self.model.hparams, + **kwargs_user_input, ) self.model.eval() @@ -733,8 +760,14 @@ def explain( model = self._auto_configurator.get_ov_model(model_name=str(checkpoint), label_info=datamodule.label_info) if checkpoint is not None and not is_ir_ckpt: + kwargs_user_input: dict[str, Any] = {} + if self.task == OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: + # to update user's custom infer_reference_info_root through cli for zero-shot learning + # TODO (sungchul): revisit for better solution + kwargs_user_input.update(infer_reference_info_root=self.model.infer_reference_info_root) + model_cls = model.__class__ - model = model_cls.load_from_checkpoint(checkpoint_path=checkpoint, **model.hparams) + model = model_cls.load_from_checkpoint(checkpoint_path=checkpoint, **kwargs_user_input) if model.label_info != self.datamodule.label_info: msg = ( @@ -836,11 +869,17 @@ def benchmark( ) if not is_ir_ckpt: + kwargs_user_input: dict[str, Any] = {} + if self.task == OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: + # to update user's custom infer_reference_info_root through cli for zero-shot learning + # TODO (sungchul): revisit for better solution + kwargs_user_input.update(infer_reference_info_root=self.model.infer_reference_info_root) + model_cls = self.model.__class__ self.model = model_cls.load_from_checkpoint( checkpoint_path=checkpoint, map_location="cpu", - **self.model.hparams, + **kwargs_user_input, ) elif isinstance(self.model, OVModel): msg = "To run benchmark on OV model, checkpoint must be specified." 
@@ -874,7 +913,7 @@ def dummy_infer(model: OTXModel, batch_size: int = 1) -> float: input_batch = self.model.get_dummy_input(1) model_fwd = lambda: self.model.forward(input_batch) depth = 3 if extended_stats else 0 - fwd_flops = measure_flops(self.model.model, model_fwd, print_stats_depth=depth) + fwd_flops = measure_flops(model_fwd, print_stats_depth=depth) flops_str = convert_num_with_suffix(fwd_flops, get_suffix_str(fwd_flops * 10**3)) final_stats["complexity"] = flops_str + " MACs" except Exception as e: diff --git a/src/otx/engine/hpo/hpo_api.py b/src/otx/engine/hpo/hpo_api.py index 9f470944266..9025cfd0fdc 100644 --- a/src/otx/engine/hpo/hpo_api.py +++ b/src/otx/engine/hpo/hpo_api.py @@ -9,6 +9,7 @@ import json import logging import time +from copy import copy from functools import partial from pathlib import Path from threading import Thread @@ -16,6 +17,7 @@ import torch import yaml +from lightning import Callback from otx.core.config.hpo import HpoConfig from otx.core.optimizer.callable import OptimizerCallableSupportHPO @@ -35,7 +37,6 @@ from .utils import find_trial_file, get_best_hpo_weight, get_callable_args_name, get_hpo_weight_dir, get_metric if TYPE_CHECKING: - from lightning import Callback from lightning.pytorch.cli import OptimizerCallable from otx.engine.engine import Engine @@ -48,7 +49,6 @@ def execute_hpo( engine: Engine, max_epochs: int, hpo_config: HpoConfig, - progress_update_callback: Callable[[int | float], None] | None = None, callbacks: list[Callback] | Callback | None = None, **train_args, ) -> tuple[dict[str, Any] | None, Path | None]: @@ -58,8 +58,6 @@ def execute_hpo( engine (Engine): engine instnace. max_epochs (int): max epochs to train. hpo_config (HpoConfig): Configuration for HPO. - progress_update_callback (Callable[[int | float], None] | None, optional): - callback to update progress. If it's given, it's called with progress every second. Defaults to None. callbacks (list[Callback] | Callback | None, optional): callbacks used during training. Defaults to None. 
Returns: @@ -97,8 +95,23 @@ def execute_hpo( logger.warning("HPO is skipped.") return None, None - if progress_update_callback is not None: - Thread(target=_update_hpo_progress, args=[progress_update_callback, hpo_algo], daemon=True).start() + if hpo_config.progress_update_callback is not None: + Thread(target=_update_hpo_progress, args=[hpo_config.progress_update_callback, hpo_algo], daemon=True).start() + + if hpo_config.callbacks_to_exclude is not None and callbacks is not None: + if isinstance(hpo_config.callbacks_to_exclude, str): + hpo_config.callbacks_to_exclude = [hpo_config.callbacks_to_exclude] + if isinstance(callbacks, Callback): + callbacks = [callbacks] + + callbacks = copy(callbacks) + callback_names = [callback.__class__.__name__ for callback in callbacks] + callback_idx_to_exclude = [ + callback_names.index(cb_name) for cb_name in hpo_config.callbacks_to_exclude if cb_name in callback_names + ] + sorted(callback_idx_to_exclude, reverse=True) + for idx in callback_idx_to_exclude: + callbacks.pop(idx) run_hpo_loop( hpo_algo, diff --git a/src/otx/engine/utils/auto_configurator.py b/src/otx/engine/utils/auto_configurator.py index 6207d33f342..f875fb6b9b7 100644 --- a/src/otx/engine/utils/auto_configurator.py +++ b/src/otx/engine/utils/auto_configurator.py @@ -94,6 +94,7 @@ OTXTaskType.ANOMALY_DETECTION: "otx.algo.anomaly.openvino_model.AnomalyOpenVINO", OTXTaskType.ANOMALY_SEGMENTATION: "otx.algo.anomaly.openvino_model.AnomalyOpenVINO", OTXTaskType.KEYPOINT_DETECTION: "otx.core.model.keypoint_detection.OVKeypointDetectionModel", + OTXTaskType.OBJECT_DETECTION_3D: "otx.core.model.detection_3d.OV3DDetectionModel", } diff --git a/src/otx/recipe/_base_/data/anomaly.yaml b/src/otx/recipe/_base_/data/anomaly.yaml index 2f74b987915..dd3a4f244c6 100644 --- a/src/otx/recipe/_base_/data/anomaly.yaml +++ b/src/otx/recipe/_base_/data/anomaly.yaml @@ -1,5 +1,5 @@ task: ANOMALY_CLASSIFICATION -input_size: 256 +input_size: [256, 256] data_format: mvtec mem_cache_size: 1GB mem_cache_img_max_size: null @@ -13,11 +13,10 @@ train_subset: batch_size: 32 num_workers: 4 transforms: - - class_path: otx.core.data.transform_libs.torchvision.ResizetoLongestEdge + - class_path: torchvision.transforms.v2.Resize init_args: - size: $(input_size) + size: [256, 256] antialias: true - - class_path: otx.core.data.transform_libs.torchvision.PadtoSquare - class_path: torchvision.transforms.v2.ToDtype init_args: dtype: ${as_torch_dtype:torch.float32} @@ -36,11 +35,10 @@ val_subset: batch_size: 32 num_workers: 4 transforms: - - class_path: otx.core.data.transform_libs.torchvision.ResizetoLongestEdge + - class_path: torchvision.transforms.v2.Resize init_args: - size: $(input_size) + size: [256, 256] antialias: true - - class_path: otx.core.data.transform_libs.torchvision.PadtoSquare - class_path: torchvision.transforms.v2.ToDtype init_args: dtype: ${as_torch_dtype:torch.float32} @@ -59,11 +57,10 @@ test_subset: batch_size: 32 num_workers: 4 transforms: - - class_path: otx.core.data.transform_libs.torchvision.ResizetoLongestEdge + - class_path: torchvision.transforms.v2.Resize init_args: - size: $(input_size) + size: [256, 256] antialias: true - - class_path: otx.core.data.transform_libs.torchvision.PadtoSquare - class_path: torchvision.transforms.v2.ToDtype init_args: dtype: ${as_torch_dtype:torch.float32} diff --git a/src/otx/recipe/_base_/data/object_detection_3d.yaml b/src/otx/recipe/_base_/data/object_detection_3d.yaml index a7c773f1bcf..90b0527ada5 100644 --- 
a/src/otx/recipe/_base_/data/object_detection_3d.yaml +++ b/src/otx/recipe/_base_/data/object_detection_3d.yaml @@ -12,9 +12,20 @@ train_subset: subset_name: train transform_lib_type: TORCHVISION batch_size: 8 - num_workers: 4 + num_workers: 2 to_tv_image: false transforms: + - class_path: otx.core.data.transform_libs.torchvision.Decode3DInputsAffineTransforms + init_args: + input_size: $(input_size) + random_horizontal_flip: true + random_crop: true + p_crop: 0.5 + random_scale: 0.05 + random_shift: 0.05 + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [123.675, 116.28, 103.53] @@ -27,9 +38,16 @@ val_subset: subset_name: val transform_lib_type: TORCHVISION batch_size: 16 - num_workers: 4 + num_workers: 2 to_tv_image: false transforms: + - class_path: otx.core.data.transform_libs.torchvision.Decode3DInputsAffineTransforms + init_args: + input_size: $(input_size) + decode_annotations: false + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [123.675, 116.28, 103.53] @@ -41,9 +59,16 @@ test_subset: subset_name: test transform_lib_type: TORCHVISION batch_size: 16 - num_workers: 4 + num_workers: 2 to_tv_image: false transforms: + - class_path: otx.core.data.transform_libs.torchvision.Decode3DInputsAffineTransforms + init_args: + input_size: $(input_size) + decode_annotations: false + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} - class_path: torchvision.transforms.v2.Normalize init_args: mean: [123.675, 116.28, 103.53] diff --git a/src/otx/recipe/object_detection_3d/monodetr3d.yaml b/src/otx/recipe/object_detection_3d/monodetr3d.yaml index 032c71ffbf8..ec5aaa005eb 100644 --- a/src/otx/recipe/object_detection_3d/monodetr3d.yaml +++ b/src/otx/recipe/object_detection_3d/monodetr3d.yaml @@ -20,13 +20,13 @@ model: mode: max factor: 0.1 patience: 13 - monitor: val/mAP_bbox_2d + monitor: val/AP_2d@0.5 engine: task: OBJECT_DETECTION_3D device: auto -callback_monitor: val/mAP_bbox_3d +callback_monitor: val/AP_3d@0.5 data: ../_base_/data/object_detection_3d.yaml diff --git a/src/otx/recipe/object_detection_3d/openvino_model.yaml b/src/otx/recipe/object_detection_3d/openvino_model.yaml new file mode 100644 index 00000000000..62265f06f6e --- /dev/null +++ b/src/otx/recipe/object_detection_3d/openvino_model.yaml @@ -0,0 +1,43 @@ +model: + class_path: otx.core.model.detection_3d.OV3DDetectionModel + init_args: + label_info: 3 + model_name: monodetr-001 + model_type: "mono_3d_det" + async_inference: true + use_throughput_mode: true + +engine: + task: OBJECT_DETECTION_3D + device: cpu + +callback_monitor: val/mAP_bbox_2d + +data: ../_base_/data/object_detection_3d.yaml +overrides: + reset: + - data.train_subset.transforms + - data.val_subset.transforms + - data.test_subset.transforms + + data: + stack_images: false + train_subset: + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Decode3DInputsAffineTransforms + init_args: + decode_annotations: false + + val_subset: + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Decode3DInputsAffineTransforms + init_args: + decode_annotations: false + + test_subset: + to_tv_image: false + batch_size: 64 + transforms: + - class_path: otx.core.data.transform_libs.torchvision.Decode3DInputsAffineTransforms + init_args: + decode_annotations: 
false diff --git a/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b.yaml b/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b.yaml index 1909a5434a6..81cd040685a 100644 --- a/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b.yaml +++ b/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b.yaml @@ -1,7 +1,7 @@ model: class_path: otx.core.model.rotated_detection.RotatedMaskRCNNModel init_args: - model_name: efficientnet_b2b + model_name: maskrcnn_efficientnet_b2b label_info: 80 optimizer: diff --git a/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b_tile.yaml b/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b_tile.yaml index 90b5914df79..9a950613d6b 100644 --- a/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b_tile.yaml +++ b/src/otx/recipe/rotated_detection/maskrcnn_efficientnetb2b_tile.yaml @@ -1,7 +1,7 @@ model: class_path: otx.core.model.rotated_detection.RotatedMaskRCNNModel init_args: - model_name: efficientnet_b2b + model_name: maskrcnn_efficientnet_b2b label_info: 80 optimizer: diff --git a/src/otx/tools/converter.py b/src/otx/tools/converter.py index cb74298e910..e476a35e1e4 100644 --- a/src/otx/tools/converter.py +++ b/src/otx/tools/converter.py @@ -156,7 +156,6 @@ "task": OTXTaskType.SEMANTIC_SEGMENTATION, "model_name": "dino_v2", }, - # ANOMALY_CLASSIFICATION # ANOMALY "ote_anomaly_padim": { "task": OTXTaskType.ANOMALY, @@ -193,6 +192,15 @@ "task": OTXTaskType.ANOMALY_SEGMENTATION, "model_name": "stfpm", }, + # KEYPOINT_DETECTION + "Custom_Keypoint_Detection_Rtmpose_T": { + "task": OTXTaskType.KEYPOINT_DETECTION, + "model_name": "rtmpose_tiny", + }, + "Custom_Keypoint_Detection_Rtmpose_T_Single_Obj": { + "task": OTXTaskType.KEYPOINT_DETECTION, + "model_name": "rtmpose_tiny_single_obj", + }, } diff --git a/src/otx/tools/templates/segmentation/ocr_lite_hrnet_18/template.yaml b/src/otx/tools/templates/segmentation/ocr_lite_hrnet_18/template.yaml new file mode 100644 index 00000000000..f0e296c6ce0 --- /dev/null +++ b/src/otx/tools/templates/segmentation/ocr_lite_hrnet_18/template.yaml @@ -0,0 +1,46 @@ +# Description. +model_template_id: Custom_Semantic_Segmentation_Lite-HRNet-18_OCR +name: Lite-HRNet-18 +task_type: SEGMENTATION +task_family: VISION +instantiation: "CLASS" +summary: Class-Incremental Semantic Segmentation with middle-sized architecture which based on the Lite-HRNet backbone for the balance between the fast inference and long training. (OBSOLETE, please use Lite-HRNet-18-mod2 instead) +application: ~ + +# Algo backend. +framework: OTXSegmentation v0.14.0 + +# Capabilities. +capabilities: + - compute_representations + +# Hyperparameters. +hyper_parameters: + base_path: ../configuration.yaml + parameter_overrides: + learning_parameters: + batch_size: + default_value: 8 + learning_rate: + default_value: 0.001 + auto_hpo_state: POSSIBLE + learning_rate_warmup_iters: + default_value: 100 + num_iters: + default_value: 300 + algo_backend: + train_type: + default_value: Incremental + +# Training resources. +max_nodes: 1 +training_targets: + - GPU + - CPU + +# Stats. 
+gigaflops: 3.45 +size: 4.5 + +# Model spec +model_status: OBSOLETE diff --git a/src/otx/utils/utils.py b/src/otx/utils/utils.py index e084eb7bd95..c1e735201e7 100644 --- a/src/otx/utils/utils.py +++ b/src/otx/utils/utils.py @@ -263,7 +263,6 @@ def check_pickleable(obj: Any) -> bool: # noqa: ANN401 def measure_flops( - model: torch.nn.Module, forward_fn: Callable[[], torch.Tensor], loss_fn: Callable[[torch.Tensor], torch.Tensor] | None = None, print_stats_depth: int = 0, @@ -271,7 +270,7 @@ def measure_flops( """Utility to compute the total number of FLOPs used by a module during training or during inference.""" from torch.utils.flop_counter import FlopCounterMode - flop_counter = FlopCounterMode(model, display=print_stats_depth > 0, depth=print_stats_depth) + flop_counter = FlopCounterMode(display=print_stats_depth > 0, depth=print_stats_depth) with flop_counter: if loss_fn is None: forward_fn() diff --git a/tests/e2e/cli/test_cli.py b/tests/e2e/cli/test_cli.py index 19424f722de..3078784a8fd 100644 --- a/tests/e2e/cli/test_cli.py +++ b/tests/e2e/cli/test_cli.py @@ -52,6 +52,8 @@ def test_otx_e2e_cli( if task == OTXTaskType.INSTANCE_SEGMENTATION: is_tiling = "tile" in recipe dataset_path = fxt_target_dataset_per_task[task]["tiling" if is_tiling else "non_tiling"] + elif task == OTXTaskType.KEYPOINT_DETECTION: + dataset_path = fxt_target_dataset_per_task[task][model_name] else: dataset_path = fxt_target_dataset_per_task[task] @@ -138,7 +140,7 @@ def test_otx_e2e_cli( ExportCase2Test("ONNX", False, "exported_model_decoder.onnx"), ExportCase2Test("OPENVINO", False, "exported_model_decoder.xml"), ] - elif "ANOMALY" in task or OTXTaskType.KEYPOINT_DETECTION in task: + elif task in ("ANOMALY", OTXTaskType.KEYPOINT_DETECTION, OTXTaskType.OBJECT_DETECTION_3D): fxt_export_list = [ ExportCase2Test("ONNX", False, "exported_model.onnx"), ExportCase2Test("OPENVINO", False, "exported_model.xml"), @@ -178,6 +180,9 @@ def test_otx_e2e_cli( assert latest_dir.exists() assert (latest_dir / export_case.expected_output).exists() + if task == OTXTaskType.OBJECT_DETECTION_3D: + return # "3D Object Detection is not supported for OV IR inference. + # 4) infer of the exported models ov_output_dir = tmp_path_test / "outputs" / "OPENVINO" ov_files = list(ov_output_dir.rglob("exported*.xml")) @@ -220,8 +225,8 @@ def test_otx_e2e_cli( # 5) otx export with XAI if "instance_segmentation/rtmdet_inst_tiny" in recipe: return - if ("_cls" not in task) and (task not in ["detection", "instance_segmentation"]): - return # Supported only for classification, detection and instance segmentation task. + if ("_cls" not in task) and (task not in ["detection", "instance_segmentation", "semantic_segmentation"]): + return # Supported only for classification, detection and segmentation tasks. unsupported_models = ["dino", "rtdetr"] if any(model in model_name for model in unsupported_models): @@ -302,6 +307,24 @@ def test_otx_explain_e2e_cli( ]: pytest.skip("Supported only for classification, detection and instance segmentation task.") + models_not_supported = [ + "dino", + "yolov9_s", + "yolov9_c", + "rtdetr_18", + "rtdetr_18_tile", + "rtdetr_50_tile", + "yolov9_m", + "rtdetr_101_tile", + "rtdetr_50", + "rtdetr_101", + "maskrcnn_r50_tv", + "maskrcnn_r50_tv_tile", + ] + + if any(model in model_name for model in models_not_supported): + pytest.skip(f"{model_name} is not supported.") + deterministic = "True" if task == OTXTaskType.INSTANCE_SEGMENTATION: # Determinism is not required for this test for instance_segmentation models. 
@@ -314,9 +337,6 @@ def test_otx_explain_e2e_cli( if isinstance(dataset_path, dict) and "supervised" in dataset_path: dataset_path = dataset_path["supervised"] - if "dino" in model_name: - pytest.skip("DINO is not supported.") - # otx explain tmp_path_explain = tmp_path / f"otx_explain_{model_name}" command_cfg = [ @@ -431,6 +451,8 @@ def test_otx_hpo_e2e_cli( if task == OTXTaskType.INSTANCE_SEGMENTATION: dataset_path = fxt_target_dataset_per_task[task]["non_tiling"] + elif task == OTXTaskType.KEYPOINT_DETECTION: + dataset_path = fxt_target_dataset_per_task[task]["rtmpose_tiny"] else: dataset_path = fxt_target_dataset_per_task[task] diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 0dced369ee4..e3778734388 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -99,7 +99,7 @@ def fxt_target_dataset_per_task(fxt_ci_data_root) -> dict: OTXTaskType.MULTI_LABEL_CLS: Path(fxt_ci_data_root / "v2/multilabel_classification/multilabel_CUB_small/1"), OTXTaskType.H_LABEL_CLS: Path(fxt_ci_data_root / "v2/hlabel_classification/hlabel_CUB_small/1"), OTXTaskType.DETECTION: Path(fxt_ci_data_root / "v2/detection/bdd_small/1"), - OTXTaskType.ROTATED_DETECTION: Path(fxt_ci_data_root / "v2/rotated_detection/subway"), + OTXTaskType.ROTATED_DETECTION: Path(fxt_ci_data_root / "v2/rotated_detection/sample"), OTXTaskType.INSTANCE_SEGMENTATION: { "non_tiling": Path(fxt_ci_data_root / "v2/instance_seg/wgisd_small/1"), "tiling": Path(fxt_ci_data_root / "v2/tiling_instance_seg/vitens_aeromonas_small/1"), @@ -109,7 +109,7 @@ def fxt_target_dataset_per_task(fxt_ci_data_root) -> dict: "unlabeled": Path(fxt_ci_data_root / "v2/semantic_seg/semi-sl/unlabeled_images/kvasir"), }, OTXTaskType.ACTION_CLASSIFICATION: Path( - fxt_ci_data_root / "v2/action/action_classification/ucf_kinetics_5percent_small", + fxt_ci_data_root / "v2/action/action_classification/ucf_kinetics_30percent_medium", ), OTXTaskType.VISUAL_PROMPTING: Path(fxt_ci_data_root / "v2/visual_prompting/coco_car_person_medium"), OTXTaskType.ZERO_SHOT_VISUAL_PROMPTING: Path( @@ -119,7 +119,11 @@ def fxt_target_dataset_per_task(fxt_ci_data_root) -> dict: OTXTaskType.ANOMALY_CLASSIFICATION: Path(fxt_ci_data_root / "v2/anomaly/mvtec/hazelnut_large"), OTXTaskType.ANOMALY_DETECTION: Path(fxt_ci_data_root / "v2/anomaly/mvtec/hazelnut_large"), OTXTaskType.ANOMALY_SEGMENTATION: Path(fxt_ci_data_root / "v2/anomaly/mvtec/hazelnut_large"), - OTXTaskType.KEYPOINT_DETECTION: Path(fxt_ci_data_root / "v2/keypoint_detection/coco_keypoint_medium"), + OTXTaskType.KEYPOINT_DETECTION: { + "rtmpose_tiny": Path(fxt_ci_data_root / "v2/keypoint_detection/coco_keypoint/medium"), + "rtmpose_tiny_single_obj": Path(fxt_ci_data_root / "v2/keypoint_detection/coco_keypoint_single_obj/medium"), + }, + OTXTaskType.OBJECT_DETECTION_3D: Path(fxt_ci_data_root / "v2/object_detection_3d/medium_pedestrian_cyclist"), } @@ -141,4 +145,5 @@ def fxt_cli_override_command_per_task() -> dict: OTXTaskType.ANOMALY_DETECTION: [], OTXTaskType.ANOMALY_SEGMENTATION: [], OTXTaskType.KEYPOINT_DETECTION: [], + OTXTaskType.OBJECT_DETECTION_3D: [], } diff --git a/tests/integration/cli/test_cli.py b/tests/integration/cli/test_cli.py index f571dc2ed2c..fd7a9438aea 100644 --- a/tests/integration/cli/test_cli.py +++ b/tests/integration/cli/test_cli.py @@ -159,6 +159,10 @@ def test_otx_e2e( ExportCase2Test("OPENVINO", False, "exported_model_decoder.xml"), ] # TODO (sungchul): EXPORTABLE_CODE will be supported + if task == "object_detection_3d": + # exportable code and demo package are not supported for 
OD 3D + fxt_export_list.pop(-1) + overrides = fxt_cli_override_command_per_task[task] tmp_path_test = tmp_path / f"otx_test_{model_name}" @@ -191,14 +195,6 @@ def test_otx_e2e( assert latest_dir.exists() assert (latest_dir / export_case.expected_output).exists() - if "keypoint" in recipe: - print("Inference and explain are not supported for keypoint detection") - return - - if "monodetr3d" in recipe: - print("Inference and explain are not supported for object detection 3d") - return - # 4) infer of the exported models ov_output_dir = tmp_path_test / "outputs" / "OPENVINO" ov_files = list(ov_output_dir.rglob("exported*.xml")) @@ -250,8 +246,8 @@ def test_otx_e2e( # 5) otx export with XAI if "instance_segmentation/rtmdet_inst_tiny" in recipe: return - if ("_cls" not in task) and (task not in ["detection", "instance_segmentation"]): - return # Supported only for classification, detection and instance segmentation task. + if ("_cls" not in task) and (task not in ["detection", "instance_segmentation", "semantic_segmentation"]): + return # Supported only for classification, detection and segmentation tasks. if "dino" in model_name: return # DINO is not supported. @@ -261,10 +257,15 @@ def test_otx_e2e( if "yolov9" in model_name: return # RT-DETR currently is not supported. + if "keypoint" in recipe: print("Explain is not supported for keypoint detection") return + if "monodetr3d" in recipe: + print("Explain is not supported for object detection 3d") + return + tmp_path_test = tmp_path / f"otx_export_xai_{model_name}" for export_case in fxt_export_list: command_cfg = [ diff --git a/tests/integration/cli/test_export_inference.py b/tests/integration/cli/test_export_inference.py index f39547ca81a..1d455616c4f 100644 --- a/tests/integration/cli/test_export_inference.py +++ b/tests/integration/cli/test_export_inference.py @@ -49,6 +49,7 @@ def fxt_local_seed() -> int: "zero_shot_visual_prompting": "test/f1-score", "action_classification": "test/accuracy", "keypoint_detection": "test/PCK", + "object_detection_3d": "test/AP_3d@0.5", } diff --git a/tests/perf/test_object_detection_3d.py b/tests/perf/test_object_detection_3d.py index 74a5bb43ca9..2fae45c8221 100644 --- a/tests/perf/test_object_detection_3d.py +++ b/tests/perf/test_object_detection_3d.py @@ -40,10 +40,14 @@ class TestPerfObjectDetection3D(PerfTestBase): BENCHMARK_CRITERIA = [ # noqa: RUF012 Benchmark.Criterion(name="train/epoch", summary="max", compare="<", margin=0.1), Benchmark.Criterion(name="train/e2e_time", summary="max", compare="<", margin=0.1), - Benchmark.Criterion(name="val/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="test/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="export/accuracy", summary="max", compare=">", margin=0.1), - Benchmark.Criterion(name="optimize/accuracy", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="val/AP_3d@0.5", summary="max", compare=">", margin=0.05), + Benchmark.Criterion(name="val/AP_2d@0.5", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="test/AP_3d@0.5", summary="max", compare=">", margin=0.05), + Benchmark.Criterion(name="test/AP_2d@0.5", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="export/AP_3d@0.5", summary="max", compare=">", margin=0.05), + Benchmark.Criterion(name="export/AP_2d@0.5", summary="max", compare=">", margin=0.1), + Benchmark.Criterion(name="optimize/AP_3d@0.5", summary="max", compare=">", margin=0.05), + Benchmark.Criterion(name="optimize/AP_2d@0.5", 
summary="max", compare=">", margin=0.1), Benchmark.Criterion(name="train/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="test/iter_time", summary="mean", compare="<", margin=0.1), Benchmark.Criterion(name="export/iter_time", summary="mean", compare="<", margin=0.1), diff --git a/src/otx/core/data/dataset/utils/__init__.py b/tests/unit/algo/object_detection_3d/__init__.py similarity index 64% rename from src/otx/core/data/dataset/utils/__init__.py rename to tests/unit/algo/object_detection_3d/__init__.py index 0c75fd7a904..189d63933b4 100644 --- a/src/otx/core/data/dataset/utils/__init__.py +++ b/tests/unit/algo/object_detection_3d/__init__.py @@ -1,4 +1,4 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # -"""Module defines utils for OTXDatasets.""" +"""Test of OTX Object Detection 3D task.""" diff --git a/tests/unit/algo/object_detection_3d/backbones/__init__.py b/tests/unit/algo/object_detection_3d/backbones/__init__.py new file mode 100644 index 00000000000..a9de1fff0dc --- /dev/null +++ b/tests/unit/algo/object_detection_3d/backbones/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test of OTX Object Detection 3D backbones.""" diff --git a/tests/unit/algo/object_detection_3d/backbones/test_monodetr_resnet.py b/tests/unit/algo/object_detection_3d/backbones/test_monodetr_resnet.py new file mode 100644 index 00000000000..39975fe3bf9 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/backbones/test_monodetr_resnet.py @@ -0,0 +1,78 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Tests for MonoDetr backbone.""" +import pytest +import torch +from otx.algo.object_detection_3d.backbones.monodetr_resnet import BackboneBase, Joiner, PositionEmbeddingSine +from otx.algo.object_detection_3d.utils.utils import NestedTensor + + +class TestBackbone: + @pytest.fixture() + def backbone(self, mocker): + mocker.patch("otx.algo.object_detection_3d.backbones.monodetr_resnet.IntermediateLayerGetter") + model = BackboneBase(backbone=mocker.MagicMock(torch.nn.Module), train_backbone=True, return_interm_layers=True) + model.body = mocker.MagicMock(return_value={"layer_0": torch.rand(1, 3, 256, 224)}) + return model + + def test_backbone_forward(self, backbone): + images = torch.randn(1, 3, 224, 224) + output = backbone(images) + assert isinstance(output, dict) + assert len(output) == 1 + assert all(isinstance(value, NestedTensor) for value in output.values()) + + def test_position_embedding_sine(self): + # Create a PositionEmbeddingSine instance + position_embedding = PositionEmbeddingSine(num_pos_feats=128, temperature=10000, normalize=False, scale=None) + + # Create a dummy input tensor + tensor_list = torch.randn(1, 512, 48, 160) + nested_tensor = NestedTensor(tensor_list, mask=torch.ones(1, 48, 160).bool()) + + # Forward pass + output = position_embedding(nested_tensor) + + # Check output shape + assert output.shape == (1, 256, 48, 160) + # Check output type + assert output.dtype == torch.float32 + # Check sine and cosine properties + assert torch.allclose( + output[:, :, :, :80].sin().pow(2) + output[:, :, :, 80:].cos().pow(2), + torch.ones(1, 256, 48, 80), + ) + + +class TestJoiner: + @pytest.fixture() + def joiner(self, mocker): + mocker.patch("otx.algo.object_detection_3d.backbones.monodetr_resnet.Backbone") + mocker.patch("otx.algo.object_detection_3d.backbones.monodetr_resnet.PositionEmbeddingSine") + backbone = 
mocker.MagicMock(torch.nn.Module) + backbone.strides = [4, 8, 16] + backbone.num_channels = [32, 64, 128] + position_embedding = mocker.MagicMock(torch.nn.Module) + return Joiner(backbone=backbone, position_embedding=position_embedding) + + def test_joiner_forward(self, joiner): + images = torch.randn(1, 3, 224, 224) + nested_tensors = [NestedTensor(torch.randn(1, 256, 56, 56), torch.ones(1, 56, 56).bool())] + position_embeddings = [torch.randn(1, 256, 56, 56)] + joiner[0].return_value = {0: nested_tensors[0]} + joiner[1].return_value = position_embeddings[0] + + output_tensors, output_position_embeddings = joiner(images) + + assert isinstance(output_tensors, list) + assert isinstance(output_position_embeddings, list) + assert len(output_tensors) == 1 + assert len(output_position_embeddings) == 1 + assert isinstance(output_tensors[0], NestedTensor) + assert isinstance(output_position_embeddings[0], torch.Tensor) + assert output_tensors[0].tensors.shape == (1, 256, 56, 56) + assert output_tensors[0].mask.shape == (1, 56, 56) + assert output_position_embeddings[0].shape == (1, 256, 56, 56) + assert output_tensors[0].tensors.dtype == images.dtype + assert output_position_embeddings[0].dtype == images.dtype diff --git a/tests/unit/algo/object_detection_3d/conftest.py b/tests/unit/algo/object_detection_3d/conftest.py new file mode 100644 index 00000000000..81e3326bfae --- /dev/null +++ b/tests/unit/algo/object_detection_3d/conftest.py @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test of custom algo modules of OTX Object Detection 3D task.""" + +import pytest +import torch +from otx.core.config.data import SubsetConfig +from otx.core.data.module import OTXDataModule +from otx.core.data.transform_libs.torchvision import Decode3DInputsAffineTransforms +from otx.core.types.task import OTXTaskType +from torchvision.transforms.v2 import Normalize, ToDtype + + +@pytest.fixture() +def fxt_data_module_3d(): + return OTXDataModule( + task=OTXTaskType.OBJECT_DETECTION_3D, + data_format="kitti3d", + data_root="tests/assets/kitti3d", + train_subset=SubsetConfig( + batch_size=2, + subset_name="train", + transforms=[ + Decode3DInputsAffineTransforms((380, 1280), True), + ToDtype(torch.float), + Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), + ], + to_tv_image=False, + ), + val_subset=SubsetConfig( + batch_size=2, + subset_name="val", + transforms=[ + Decode3DInputsAffineTransforms((380, 1280), decode_annotations=False), + ToDtype(torch.float), + Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), + ], + to_tv_image=False, + ), + test_subset=SubsetConfig( + batch_size=2, + subset_name="test", + transforms=[ + Decode3DInputsAffineTransforms((380, 1280), decode_annotations=False), + ToDtype(torch.float), + Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), + ], + to_tv_image=False, + ), + ) diff --git a/tests/unit/algo/object_detection_3d/detectors/__init__.py b/tests/unit/algo/object_detection_3d/detectors/__init__.py new file mode 100644 index 00000000000..fe121ff5cc0 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/detectors/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test of OTX Object Detection 3D detectors.""" diff --git a/tests/unit/algo/object_detection_3d/detectors/test_monodetr.py b/tests/unit/algo/object_detection_3d/detectors/test_monodetr.py new file mode 100644 index 00000000000..4dd27c1bca3 --- 
/dev/null +++ b/tests/unit/algo/object_detection_3d/detectors/test_monodetr.py @@ -0,0 +1,71 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test MonoDetr.""" + +import pytest +import torch +from otx.algo.object_detection_3d.backbones.monodetr_resnet import BackboneBuilder +from otx.algo.object_detection_3d.detectors.monodetr import MonoDETR +from otx.algo.object_detection_3d.heads.depth_predictor import DepthPredictor +from otx.algo.object_detection_3d.heads.depthaware_transformer import DepthAwareTransformerBuilder + + +class TestMonoDETR: + @pytest.fixture() + def model(self): + backbone = BackboneBuilder("monodetr_50") + # transformer + depthaware_transformer = DepthAwareTransformerBuilder("monodetr_50") + # depth prediction module + depth_predictor = DepthPredictor(depth_num_bins=80, depth_min=1e-3, depth_max=60.0, hidden_dim=256) + + num_classes = 2 + num_queries = 50 + num_feature_levels = 4 + return MonoDETR( + backbone, + depthaware_transformer, + depth_predictor, + num_classes=num_classes, + num_queries=num_queries, + num_feature_levels=num_feature_levels, + with_box_refine=True, + ) + + def test_monodetr_forward(self, model): + # Create a sample input + images = torch.randn(2, 3, 224, 224) + calibs = torch.randn(2, 3, 4) + img_sizes = torch.tensor([[224, 224], [224, 224]]) + # Perform forward pass + output = model(images, calibs, img_sizes, mode="predict") + + # Check the output + assert "scores" in output + assert "boxes_3d" in output + assert "size_3d" in output + assert "depth" in output + assert "heading_angle" in output + assert "pred_depth_map_logits" in output + assert "aux_outputs" in output + + # Check the shape of the output tensors + assert output["scores"].shape == (2, 550, 2) + assert output["boxes_3d"].shape == (2, 550, 6) + assert output["size_3d"].shape == (2, 550, 3) + assert output["depth"].shape == (2, 550, 2) + assert output["heading_angle"].shape == (2, 550, 24) + assert output["pred_depth_map_logits"].shape == (2, 81, 14, 14) + + # Check error handling when loss is None + with pytest.raises(ValueError): # noqa: PT011 + output = model(images, calibs, img_sizes, mode="loss") + + # Check the export mode + export_output = model(images, calibs, img_sizes, mode="export") + assert "scores" in export_output + assert "boxes_3d" in export_output + assert export_output["scores"].shape == (2, 550, 2) + assert export_output["scores"].min() >= 0 + assert export_output["scores"].max() <= 1 diff --git a/tests/unit/algo/object_detection_3d/heads/__init__.py b/tests/unit/algo/object_detection_3d/heads/__init__.py new file mode 100644 index 00000000000..fa4f5a8e834 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/heads/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test of OTX Object Detection 3D heads.""" diff --git a/tests/unit/algo/object_detection_3d/heads/test_depth_predictor.py b/tests/unit/algo/object_detection_3d/heads/test_depth_predictor.py new file mode 100644 index 00000000000..5341e705b96 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/heads/test_depth_predictor.py @@ -0,0 +1,43 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test DepthPredictor.""" + +import pytest +import torch +from otx.algo.object_detection_3d.heads.depth_predictor import DepthPredictor + + +class TestDepthPredictor: + @pytest.fixture() + def depth_predictor(self): + return DepthPredictor(depth_num_bins=10, depth_min=0.0, 
depth_max=1.0, hidden_dim=256) + + def test_depth_predictor_forward(self, depth_predictor): + feature = [ + torch.randn(1, 256, 48, 160), + torch.randn(1, 256, 24, 80), + torch.randn(1, 256, 12, 40), + torch.randn(1, 256, 6, 20), + ] + mask = torch.randn(1, 24, 80) + pos = torch.randn(1, 256, 24, 80) + + depth_logits, depth_embed, weighted_depth, depth_pos_embed_ip = depth_predictor(feature, mask, pos) + + assert depth_logits.shape == (1, 11, 24, 80) + assert depth_embed.shape == (1, 256, 24, 80) + assert weighted_depth.shape == (1, 24, 80) + assert depth_pos_embed_ip.shape == (1, 256, 24, 80) + + def test_depth_predictor_interpolate_depth_embed(self, depth_predictor): + depth = torch.randn(1, 8, 8) + interpolated_depth_embed = depth_predictor.interpolate_depth_embed(depth) + + assert interpolated_depth_embed.shape == (1, 256, 8, 8) + + def test_depth_predictor_interpolate_1d(self, depth_predictor): + coord = torch.randn(1, 8, 8).clamp(min=0, max=1) + interpolated_embeddings = depth_predictor.interpolate_1d(coord, depth_predictor.depth_pos_embed) + + assert interpolated_embeddings.shape == (1, 8, 8, 256) diff --git a/tests/unit/algo/object_detection_3d/heads/test_depthaware_transformer.py b/tests/unit/algo/object_detection_3d/heads/test_depthaware_transformer.py new file mode 100644 index 00000000000..ddc6c234fac --- /dev/null +++ b/tests/unit/algo/object_detection_3d/heads/test_depthaware_transformer.py @@ -0,0 +1,67 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""test depth aware transformer head for 3d object detection.""" + +import pytest +import torch +from otx.algo.object_detection_3d.heads.depthaware_transformer import ( + DepthAwareTransformerBuilder, +) + + +class TestDepthAwareTransformer: + @pytest.fixture() + def depth_aware_transformer(self): + return DepthAwareTransformerBuilder("monodetr_50") + + def test_depth_aware_transformer_forward(self, depth_aware_transformer): + # Create dummy input tensors + srcs = [ + torch.randn(1, 256, 48, 160), + torch.randn(1, 256, 24, 80), + torch.randn(1, 256, 12, 40), + torch.randn(1, 256, 6, 20), + ] + masks = [ + torch.randn(1, 48, 160) < 0, + torch.randn(1, 24, 80) < 0, + torch.randn(1, 12, 40) < 0, + torch.randn(1, 6, 20) < 0, + ] + pos_embeds = [ + torch.randn(1, 256, 48, 160), + torch.randn(1, 256, 24, 80), + torch.randn(1, 256, 12, 40), + torch.randn(1, 256, 6, 20), + ] + query_embed = torch.randn(550, 512) + depth_pos_embed = torch.randn(1, 256, 24, 80) + depth_pos_embed_ip = torch.randn(1, 256, 24, 80) + attn_mask = None + depth_aware_transformer.decoder.return_intermediate = False + output = depth_aware_transformer.forward( + srcs, + masks, + pos_embeds, + query_embed, + depth_pos_embed, + depth_pos_embed_ip, + attn_mask, + ) + + # Check output shape + assert len(output) == 6 + assert output[0].shape == (1, 550, 256) + assert output[2].shape == (1, 550, 2) + assert output[4] is None + + def test_depth_aware_transformer_get_valid_ratio(self, depth_aware_transformer): + # Create dummy input tensor + mask = torch.randn(2, 32, 32) > 0 + + # Get valid ratio + valid_ratio = depth_aware_transformer.get_valid_ratio(mask) + + # Check output shape + assert valid_ratio.shape == (2, 2) diff --git a/tests/unit/algo/object_detection_3d/losses/__init__.py b/tests/unit/algo/object_detection_3d/losses/__init__.py new file mode 100644 index 00000000000..317723ad150 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/losses/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# 
SPDX-License-Identifier: Apache-2.0 +# +"""Test of OTX Object Detection 3D losses.""" diff --git a/tests/unit/algo/object_detection_3d/losses/test_monodetr_loss.py b/tests/unit/algo/object_detection_3d/losses/test_monodetr_loss.py new file mode 100644 index 00000000000..f041445619a --- /dev/null +++ b/tests/unit/algo/object_detection_3d/losses/test_monodetr_loss.py @@ -0,0 +1,101 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Unit test for MonoDETR loss.""" +import torch +from otx.algo.object_detection_3d.losses.monodetr_loss import MonoDETRCriterion + + +class TestMonoDETRCriterion: + def test_loss_labels(self): + criterion = MonoDETRCriterion(num_classes=10, weight_dict={}, focal_alpha=0.5) + outputs = { + "scores": torch.randn(2, 10, 10), + } + targets = [ + {"labels": torch.tensor([1, 2, 0, 0, 0, 0, 1, 2, 0, 0])}, + {"labels": torch.tensor([3, 4, 0, 0, 0, 0, 3, 4, 0, 0])}, + ] + indices = [ + (torch.tensor([0, 1]), torch.tensor([0, 1])), + (torch.tensor([0, 1]), torch.tensor([0, 1])), + ] + num_boxes = 4 + + loss = criterion.loss_labels(outputs, targets, indices, num_boxes) + assert "loss_ce" in loss + assert isinstance(loss["loss_ce"], torch.Tensor) + + def test_loss_3dcenter(self): + criterion = MonoDETRCriterion(num_classes=10, weight_dict={}, focal_alpha=0.5) + outputs = { + "boxes_3d": torch.randn(2, 10, 4), + } + targets = [ + {"boxes_3d": torch.tensor([[1, 2], [3, 4]])}, + {"boxes_3d": torch.tensor([[5, 6], [7, 8]])}, + ] + indices = [ + (torch.tensor([0, 1]), torch.tensor([0, 1])), + (torch.tensor([0, 1]), torch.tensor([0, 1])), + ] + num_boxes = 4 + + loss = criterion.loss_3dcenter(outputs, targets, indices, num_boxes) + assert "loss_center" in loss + assert isinstance(loss["loss_center"], torch.Tensor) + + def test_forward(self): + criterion = MonoDETRCriterion(num_classes=10, weight_dict={}, focal_alpha=0.5) + outputs = { + "scores": torch.randn(1, 100, 10), + "boxes_3d": torch.randn(1, 100, 6), + "depth": torch.randn(1, 100, 2), + "size_3d": torch.randn(1, 100, 3), + "heading_angle": torch.randn(1, 100, 24), + "pred_depth_map_logits": torch.randn(1, 100, 80, 80), + } + targets = [ + { + "labels": torch.tensor([0, 0, 0, 0]), + "boxes": torch.tensor( + [ + [0.7697, 0.4923, 0.0398, 0.0663], + [0.7371, 0.4857, 0.0339, 0.0620], + [0.7126, 0.4850, 0.0246, 0.0501], + [0.5077, 0.5280, 0.0444, 0.1475], + ], + ), + "depth": torch.tensor([[47.5800], [55.2600], [62.3900], [23.7700]]), + "size_3d": torch.tensor( + [ + [1.5500, 1.3700, 3.9700], + [1.6900, 1.7400, 3.7600], + [1.5500, 1.3900, 3.5500], + [1.6200, 1.6300, 4.5000], + ], + ), + "heading_angle": torch.tensor( + [ + [2.0000e00, 4.6737e-02], + [8.0000e00, 1.2180e-01], + [8.0000e00, 1.5801e-01], + [9.0000e00, 1.8260e-04], + ], + ), + "boxes_3d": torch.tensor( + [ + [0.7689, 0.4918, 0.0191, 0.0208, 0.0327, 0.0336], + [0.7365, 0.4858, 0.0163, 0.0175, 0.0310, 0.0310], + [0.7122, 0.4848, 0.0118, 0.0127, 0.0248, 0.0252], + [0.5089, 0.5234, 0.0235, 0.0209, 0.0693, 0.0783], + ], + ), + }, + ] + + losses = criterion.forward(outputs, targets) + assert isinstance(losses, dict) + assert len(losses) == 8 + for loss in losses.values(): + assert isinstance(loss, torch.Tensor) diff --git a/tests/unit/algo/object_detection_3d/matchers/__init__.py b/tests/unit/algo/object_detection_3d/matchers/__init__.py new file mode 100644 index 00000000000..9df782e1015 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/matchers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# 
SPDX-License-Identifier: Apache-2.0 +# +"""Test of OTX Object Detection 3D matchers.""" diff --git a/tests/unit/algo/object_detection_3d/matchers/test_matcher_3d.py b/tests/unit/algo/object_detection_3d/matchers/test_matcher_3d.py new file mode 100644 index 00000000000..c2173bd411a --- /dev/null +++ b/tests/unit/algo/object_detection_3d/matchers/test_matcher_3d.py @@ -0,0 +1,51 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Test for HungarianMatcher3D module.""" + +import pytest +import torch +from otx.algo.object_detection_3d.matchers.matcher_3d import HungarianMatcher3D + + +class TestHungarianMatcher3D: + @pytest.fixture() + def matcher(self): + return HungarianMatcher3D() + + def test_hungarian_matcher_3d(self, matcher): + outputs = { + "scores": torch.randn(1, 100, 10), + "boxes_3d": torch.randn(1, 100, 6), + } + targets = [ + { + "labels": torch.tensor([0, 0, 0, 0]), + "boxes": torch.tensor( + [ + [0.7697, 0.4923, 0.0398, 0.0663], + [0.7371, 0.4857, 0.0339, 0.0620], + [0.7126, 0.4850, 0.0246, 0.0501], + [0.5077, 0.5280, 0.0444, 0.1475], + ], + ), + "boxes_3d": torch.tensor( + [ + [0.7689, 0.4918, 0.0191, 0.0208, 0.0327, 0.0336], + [0.7365, 0.4858, 0.0163, 0.0175, 0.0310, 0.0310], + [0.7122, 0.4848, 0.0118, 0.0127, 0.0248, 0.0252], + [0.5089, 0.5234, 0.0235, 0.0209, 0.0693, 0.0783], + ], + ), + }, + ] + group_num = 11 + + result = matcher(outputs, targets, group_num) + + assert len(result) == 1 + assert isinstance(result[0][0], torch.Tensor) + assert isinstance(result[0][1], torch.Tensor) + assert len(result[0][0].tolist()) == 44 + assert len(result[0][1].tolist()) == 44 + assert torch.max(torch.stack(result[0])) <= 100 diff --git a/tests/unit/algo/object_detection_3d/test_monodetr3d.py b/tests/unit/algo/object_detection_3d/test_monodetr3d.py new file mode 100644 index 00000000000..6e9ce559895 --- /dev/null +++ b/tests/unit/algo/object_detection_3d/test_monodetr3d.py @@ -0,0 +1,48 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""Test of OTX MonoDETR3D architecture.""" + +import pytest +import torch +from otx.algo.object_detection_3d.monodetr3d import MonoDETR3D +from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity +from otx.core.exporter.detection_3d import OTXObjectDetection3DExporter +from otx.core.types.export import TaskLevelExportParameters + + +class TestMonoDETR3D: + @pytest.fixture() + def model(self): + return MonoDETR3D(model_name="monodetr_50", label_info=2, input_size=(1280, 384)) + + def test_init(self, model) -> None: + assert isinstance(model._export_parameters, TaskLevelExportParameters) + assert isinstance(model._exporter, OTXObjectDetection3DExporter) + + def test_loss(self, model, fxt_data_module_3d): + data = next(iter(fxt_data_module_3d.train_dataloader())) + output = model(data) + assert "loss_ce" in output + assert "loss_bbox" in output + assert "loss_center" in output + assert "loss_center_aux_1" in output + for loss in output.values(): + assert loss is not None + assert isinstance(loss, torch.Tensor) + + def test_predict(self, model, fxt_data_module_3d): + data = next(iter(fxt_data_module_3d.train_dataloader())) + model.eval() + output = model(data) + assert isinstance(output, Det3DBatchDataEntity) + + def test_export(self, model): + model.eval() + output = model.forward_for_tracing( + torch.randn(1, 3, 384, 1280), + torch.randn(1, 3, 4), + torch.tensor([[1280, 384]]), + ) + assert isinstance(output, dict) + assert len(output) == 5 + assert list(output.keys()) 
== ["scores", "boxes_3d", "size_3d", "depth", "heading_angle"] diff --git a/tests/unit/algo/segmentation/segmentors/test_base_model.py b/tests/unit/algo/segmentation/segmentors/test_base_model.py index d970ead0c32..30893e5182e 100644 --- a/tests/unit/algo/segmentation/segmentors/test_base_model.py +++ b/tests/unit/algo/segmentation/segmentors/test_base_model.py @@ -43,8 +43,10 @@ def test_forward_returns_prediction(self, model, inputs): def test_extract_features(self, model, inputs): images = inputs[0] features = model.extract_features(images) - assert isinstance(features, torch.Tensor) - assert features.shape == (1, 2, 256, 256) + assert isinstance(features, tuple) + assert isinstance(features[0], torch.Tensor) + assert isinstance(features[1], torch.Tensor) + assert features[1].shape == (1, 2, 256, 256) def test_calculate_loss(self, model, inputs): model.criterion.name = "CrossEntropyLoss" diff --git a/tests/unit/core/data/dataset/test_keypoint_detection.py b/tests/unit/core/data/dataset/test_keypoint_detection.py index 6bc19469d37..87fe62b27d4 100644 --- a/tests/unit/core/data/dataset/test_keypoint_detection.py +++ b/tests/unit/core/data/dataset/test_keypoint_detection.py @@ -19,7 +19,7 @@ def fxt_dm_dataset(self) -> DmDataset: return DmDataset.import_from("tests/assets/car_tree_bug_keypoint", format="coco_person_keypoints") @pytest.fixture() - def fxt_tvt_transforms(self, mocker) -> Identity: + def fxt_tvt_transforms(self) -> Identity: return Identity() @pytest.mark.parametrize("subset", ["train", "val"]) diff --git a/tests/unit/core/data/dataset/test_object_detection_3d.py b/tests/unit/core/data/dataset/test_object_detection_3d.py new file mode 100644 index 00000000000..f75004a45d4 --- /dev/null +++ b/tests/unit/core/data/dataset/test_object_detection_3d.py @@ -0,0 +1,58 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Unit tests of Object Detection 3D datasets.""" + +from __future__ import annotations + +import numpy as np +import pytest +from datumaro import Dataset as DmDataset +from otx.core.data.dataset.object_detection_3d import OTX3DObjectDetectionDataset +from otx.core.data.entity.base import ImageInfo +from torchvision.transforms.v2 import Identity, Transform + + +class TestOTXObjectDetection3DDataset: + @pytest.fixture() + def fxt_dm_dataset(self) -> DmDataset: + return DmDataset.import_from("tests/assets/kitti3d", format="kitti3d") + + @pytest.fixture() + def fxt_tvt_transforms(self) -> Identity: + return Identity() + + @pytest.mark.parametrize("subset", ["train", "val"]) + def test_get_item_impl_subset( + self, + fxt_dm_dataset, + fxt_tvt_transforms: Transform, + subset: str, + ) -> None: + dataset = OTX3DObjectDetectionDataset( + fxt_dm_dataset.get_subset(subset).as_dataset(), + fxt_tvt_transforms, + ) + + entity = dataset._get_item_impl(0) + + assert hasattr(entity, "image") + assert isinstance(entity.image, np.ndarray) + assert hasattr(entity, "img_info") + assert isinstance(entity.img_info, ImageInfo) + assert hasattr(entity, "calib_matrix") + assert isinstance(entity.calib_matrix, np.ndarray) + assert hasattr(entity, "boxes_3d") + assert isinstance(entity.boxes_3d, np.ndarray) + assert hasattr(entity, "boxes") + assert isinstance(entity.boxes, np.ndarray) + assert hasattr(entity, "size_2d") + assert isinstance(entity.boxes_3d, np.ndarray) + assert hasattr(entity, "size_3d") + assert isinstance(entity.boxes_3d, np.ndarray) + assert hasattr(entity, "heading_angle") + assert isinstance(entity.boxes_3d, np.ndarray) + assert 
hasattr(entity, "depth") + assert isinstance(entity.boxes_3d, np.ndarray) + assert hasattr(entity, "original_kitti_format") + assert isinstance(entity.original_kitti_format, dict) diff --git a/tests/unit/core/data/dataset/test_segmentation.py b/tests/unit/core/data/dataset/test_segmentation.py index 141dc4bf74b..c7e35d0a924 100644 --- a/tests/unit/core/data/dataset/test_segmentation.py +++ b/tests/unit/core/data/dataset/test_segmentation.py @@ -19,7 +19,7 @@ def test_get_item( max_refetch=3, ) assert isinstance(dataset[0], SegDataEntity) - assert "background" in [label_name.lower() for label_name in dataset.label_info.label_names] + assert "otx_background_lbl" in [label_name.lower() for label_name in dataset.label_info.label_names] def test_get_item_from_bbox_dataset( self, @@ -33,4 +33,4 @@ def test_get_item_from_bbox_dataset( ) assert isinstance(dataset[0], SegDataEntity) # OTXSegmentationDataset should add background when getting a dataset which includes only bbox annotations - assert "background" in [label_name.lower() for label_name in dataset.label_info.label_names] + assert "otx_background_lbl" in [label_name.lower() for label_name in dataset.label_info.label_names] diff --git a/tests/unit/core/data/transform_libs/test_torchvision.py b/tests/unit/core/data/transform_libs/test_torchvision.py index 1a1363d6821..3aad061118e 100644 --- a/tests/unit/core/data/transform_libs/test_torchvision.py +++ b/tests/unit/core/data/transform_libs/test_torchvision.py @@ -17,10 +17,12 @@ from otx.core.data.entity.detection import DetBatchDataEntity, DetDataEntity from otx.core.data.entity.instance_segmentation import InstanceSegBatchDataEntity, InstanceSegDataEntity from otx.core.data.entity.keypoint_detection import KeypointDetDataEntity +from otx.core.data.entity.object_detection_3d import Det3DDataEntity from otx.core.data.transform_libs.torchvision import ( CachedMixUp, CachedMosaic, Compose, + Decode3DInputsAffineTransforms, DecodeVideo, FilterAnnotations, GetBBoxCenterScale, @@ -918,3 +920,149 @@ def test_forward(self, keypoint_det_entity) -> None: assert np.array_equal(results.bbox_info.center, np.array([3.5, 3.5])) assert np.array_equal(results.bbox_info.scale, np.array([8.75, 8.75])) assert results.keypoints.shape == (4, 2) + + +class TestDecode3DInputsAffineTransforms: + @pytest.fixture() + def decode_transform(self) -> Decode3DInputsAffineTransforms: + return Decode3DInputsAffineTransforms(input_size=(380, 1280), decode_annotations=True) + + @pytest.fixture() + def original_kitti_format(self) -> dict[str, np.array]: + return { + "name": np.array([0]), + "alpha": np.array([1.55]), + "bbox": np.array([[614.23999023, 181.77999878, 727.30999756, 284.76998901]]), + "dimensions": np.array([[4.15, 1.57, 1.73]]), + "location": np.array([[1.0, 1.75, 13.22]]), + "rotation_y": np.array([1.62]), + "occluded": np.array([0]), + "truncated": np.array([0.0]), + } + + @pytest.fixture() + def det_3d_data_entity(self, original_kitti_format) -> Det3DDataEntity: + return Det3DDataEntity( + image=np.random.rand(725, 1920, 3), + img_info=ImageInfo( + img_idx=0, + img_shape=(380, 1280), + ori_shape=(725, 1920), + image_color_channel=True, + ignored_labels=[], + ), + boxes=np.zeros((50, 4), dtype=np.float32), + labels=np.zeros((50), dtype=np.int8), + calib_matrix=np.array( + [ + [721.5377, 0.0, 609.5593, 44.85728], + [0.0, 721.5377, 172.854, 0.2163791], + [0.0, 0.0, 1.0, 0.002745884], + ], + ), + boxes_3d=np.zeros((50, 6), dtype=np.float32), + size_2d=np.zeros((50, 2), dtype=np.float32), + size_3d=np.zeros((50, 3), 
dtype=np.float32), + depth=np.zeros((50, 1), dtype=np.float32), + heading_angle=np.zeros((50, 2), dtype=np.float32), + original_kitti_format=deepcopy(original_kitti_format), + ) + + def test_general_call( + self, + decode_transform: Decode3DInputsAffineTransforms, + det_3d_data_entity: Det3DDataEntity, + original_kitti_format: dict[str, np.array], + ) -> None: + """Test __call__.""" + results = decode_transform(det_3d_data_entity) + + assert results.image.shape == (3, 380, 1280) + assert results.labels.dtype == torch.long + for key in ["boxes", "boxes_3d", "size_2d", "size_3d", "depth", "heading_angle"]: + assert hasattr(results, key) + assert getattr(results, key).size()[0] == 1 # only one object + if key != "boxes": + assert isinstance(getattr(results, key), torch.Tensor) + assert getattr(results, key).dtype == torch.float32 + else: + assert isinstance(getattr(results, key), tv_tensors.BoundingBoxes) + + assert results.boxes.format == tv_tensors.BoundingBoxFormat.XYXY + assert results.boxes_3d.shape == (1, 6) + assert results.calib_matrix.shape == (3, 4) + # dimensions are in the right position and differ from original_kitti_format + assert original_kitti_format["dimensions"][0, 0] == results.size_3d[0, 2] + + def test_no_decode_annotations( + self, + decode_transform: Decode3DInputsAffineTransforms, + det_3d_data_entity: Det3DDataEntity, + mocker, + ) -> None: + """Test __call__.""" + decode_transform.decode_annotations = False + results = decode_transform(det_3d_data_entity) + + assert results.image.shape == (3, 380, 1280) + assert isinstance(results.image, torch.Tensor) + for key in ["boxes", "boxes_3d", "size_2d", "size_3d", "depth", "heading_angle"]: + assert hasattr(results, key) + assert getattr(results, key).size()[0] == 0 # all annotations filtered + if key != "boxes": + assert isinstance(getattr(results, key), torch.Tensor) + else: + assert isinstance(getattr(results, key), tv_tensors.BoundingBoxes) + assert results.calib_matrix.shape == (3, 4) + assert isinstance(results.calib_matrix, torch.Tensor) + + def test_no_input_size( + self, + decode_transform: Decode3DInputsAffineTransforms, + det_3d_data_entity: Det3DDataEntity, + mocker, + ) -> None: + # no resize and affine transforms + decode_transform.input_size = None + decode_transform._affine_transforms = mocker.MagicMock() + results = decode_transform(det_3d_data_entity) + assert results.image.shape == (3, 725, 1920) # no resize + assert isinstance(results.image, torch.Tensor) + assert decode_transform._affine_transforms.call_count == 0 + + def test_affine_transforms(self, decode_transform): + inputs = { + "image": np.random.rand(480, 640, 3), + "ori_shape": np.array([480, 640]), + } + transformed_inputs_0 = decode_transform._affine_transforms(inputs["image"], inputs["ori_shape"], (256, 256)) + + assert transformed_inputs_0[0].shape == (3, 256, 256) + assert transformed_inputs_0[0].dtype == torch.float32 + assert transformed_inputs_0[1] == 1 # no crop + assert transformed_inputs_0[2].shape == (2, 3) + assert isinstance(transformed_inputs_0[3], bool) + assert not transformed_inputs_0[3] + + # test crop + decode_transform.random_crop = True + decode_transform.p_crop = 1.0 + transformed_inputs_1 = decode_transform._affine_transforms(inputs["image"], inputs["ori_shape"], (256, 256)) + + assert transformed_inputs_1[0].shape == (3, 256, 256) + assert transformed_inputs_1[2].shape == (2, 3) + assert np.any(transformed_inputs_1[2] != transformed_inputs_0[2]) + assert transformed_inputs_1[1] != 1 + assert not transformed_inputs_1[3] 
+ + # test flip + decode_transform.random_crop = False + decode_transform.random_horizontal_flip = True + decode_transform.p_flip = 1.0 + transformed_inputs_2 = decode_transform._affine_transforms(inputs["image"], inputs["ori_shape"], (256, 256)) + + assert transformed_inputs_2[0].shape == (3, 256, 256) + assert transformed_inputs_2[2].shape == (2, 3) + assert np.all(transformed_inputs_2[2] == transformed_inputs_0[2]) + assert transformed_inputs_2[1] == 1 # no crop + assert transformed_inputs_2[3] # flip is True diff --git a/tests/unit/core/exporter/test_detection_3d.py b/tests/unit/core/exporter/test_detection_3d.py new file mode 100644 index 00000000000..98564615f61 --- /dev/null +++ b/tests/unit/core/exporter/test_detection_3d.py @@ -0,0 +1,70 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""Unit tests of 3D object detection exporter.""" + +from unittest.mock import MagicMock + +import pytest +import torch +from otx.core.exporter.detection_3d import OTXObjectDetection3DExporter +from otx.core.types.export import OTXExportFormatType + + +class TestOTXObjectDetection3DExporter: + @pytest.fixture() + def otx_detection_3d_exporter(self) -> OTXObjectDetection3DExporter: + return OTXObjectDetection3DExporter( + task_level_export_parameters=MagicMock(), + input_size=(10, 10), + ) + + def test_export_openvino(self, mocker, tmpdir, otx_detection_3d_exporter) -> None: + """Test export for OPENVINO.""" + mocker_openvino_convert_model = mocker.patch("openvino.convert_model") + mocker_postprocess_openvino_model = mocker.patch.object( + otx_detection_3d_exporter, + "_postprocess_openvino_model", + ) + mocker_openvino_save_model = mocker.patch("openvino.save_model") + mock_model = mocker.MagicMock() + mock_model.parameters.return_value = iter([torch.rand(1, 3)]) + + otx_detection_3d_exporter.export( + model=mock_model, + output_dir=tmpdir, + export_format=OTXExportFormatType.OPENVINO, + ) + + mocker_openvino_convert_model.assert_called() + mocker_postprocess_openvino_model.assert_called() + mocker_openvino_save_model.assert_called() + + with pytest.raises(NotImplementedError): + otx_detection_3d_exporter.export( + model=mock_model, + output_dir=tmpdir, + export_format=OTXExportFormatType.OPENVINO, + to_exportable_code=True, + ) + + def test_export_onnx(self, mocker, tmpdir, otx_detection_3d_exporter) -> None: + """Test export for ONNX.""" + mocker_torch_onnx_export = mocker.patch("torch.onnx.export") + mocker_onnx_load = mocker.patch("onnx.load") + mocker_onnx_save = mocker.patch("onnx.save") + mocker_postprocess_onnx_model = mocker.patch.object( + otx_detection_3d_exporter, + "_postprocess_onnx_model", + ) + mock_model = mocker.MagicMock() + + otx_detection_3d_exporter.export( + model=mock_model, + output_dir=tmpdir, + export_format=OTXExportFormatType.ONNX, + ) + + mocker_torch_onnx_export.assert_called() + mocker_onnx_load.assert_called() + mocker_onnx_save.assert_called() + mocker_postprocess_onnx_model.assert_called() diff --git a/tests/unit/core/metrics/test_accuracy.py b/tests/unit/core/metrics/test_accuracy.py index 8370fee09f6..d3c43a8a087 100644 --- a/tests/unit/core/metrics/test_accuracy.py +++ b/tests/unit/core/metrics/test_accuracy.py @@ -13,7 +13,7 @@ MultilabelAccuracywithLabelGroup, ) from otx.core.types.label import HLabelInfo, LabelInfo -from torchmetrics.classification.accuracy import BinaryAccuracy, MulticlassAccuracy +from torchmetrics.classification.accuracy import BinaryAccuracy, MulticlassAccuracy, MultilabelAccuracy class TestAccuracy: @@ 
-120,3 +120,28 @@ def test_multilabel_only(self) -> None: head_logits_info={"head1": (0, 5), "head2": (5, 10)}, threshold_multilabel=0.5, ) + + def test_multilabel_accuracy(self, hlabel_accuracy) -> None: + # Normal Case: num_multilabel_classes > 1 -> MultilabelAccuracy + assert hlabel_accuracy.num_multilabel_classes == 3 + assert isinstance(hlabel_accuracy.multilabel_accuracy, MultilabelAccuracy) + + # Edge Case: num_multilabel_classes = 1 -> BinaryAccuracy + acc = MixedHLabelAccuracy( + num_multiclass_heads=2, + num_multilabel_classes=1, + head_logits_info={"head1": (0, 5), "head2": (5, 10)}, + threshold_multilabel=0.5, + ) + assert acc.num_multilabel_classes == 1 + assert isinstance(acc.multilabel_accuracy, BinaryAccuracy) + + # None Case: num_multilabel_classes = 0 -> None + acc = MixedHLabelAccuracy( + num_multiclass_heads=2, + num_multilabel_classes=0, + head_logits_info={"head1": (0, 5), "head2": (5, 10)}, + threshold_multilabel=0.5, + ) + assert acc.num_multilabel_classes == 0 + assert acc.multilabel_accuracy is None diff --git a/tests/unit/core/model/test_detection_3d.py b/tests/unit/core/model/test_detection_3d.py new file mode 100644 index 00000000000..f46dc212b8d --- /dev/null +++ b/tests/unit/core/model/test_detection_3d.py @@ -0,0 +1,127 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +"""Unit tests for 3D object detection model entity.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +import torch +from otx.algo.object_detection_3d.monodetr3d import MonoDETR3D +from otx.core.data.entity.base import OTXBatchLossEntity +from otx.core.data.entity.object_detection_3d import Det3DBatchDataEntity, Det3DBatchPredEntity +from otx.core.metrics.average_precision_3d import KittiMetric +from otx.core.model.base import DefaultOptimizerCallable, DefaultSchedulerCallable +from otx.core.types.label import LabelInfo + +if TYPE_CHECKING: + from otx.core.model.detection_3d import OTX3DDetectionModel + + +class TestOTX3DDetectionModel: + @pytest.fixture() + def model(self, label_info, optimizer, scheduler, metric, torch_compile) -> OTX3DDetectionModel: + return MonoDETR3D(label_info, "monodetr_50", (1280, 384), optimizer, scheduler, metric, torch_compile) + + @pytest.fixture() + def batch_data_entity(self, model) -> Det3DBatchDataEntity: + return model.get_dummy_input(2) + + @pytest.fixture() + def label_info(self) -> LabelInfo: + return LabelInfo( + label_names=["label_0", "label_1"], + label_groups=[["label_0", "label_1"]], + ) + + @pytest.fixture() + def optimizer(self): + return DefaultOptimizerCallable + + @pytest.fixture() + def scheduler(self): + return DefaultSchedulerCallable + + @pytest.fixture() + def metric(self): + return KittiMetric + + @pytest.fixture() + def torch_compile(self): + return False + + def test_export_parameters(self, model): + params = model._export_parameters + assert params.model_type == "mono_3d_det" + assert params.task_type == "3d_detection" + + @pytest.mark.parametrize( + ("label_info", "expected_label_info"), + [ + ( + LabelInfo(label_names=["label1", "label2", "label3"], label_groups=[["label1", "label2", "label3"]]), + LabelInfo(label_names=["label1", "label2", "label3"], label_groups=[["label1", "label2", "label3"]]), + ), + (LabelInfo.from_num_classes(num_classes=5), LabelInfo.from_num_classes(num_classes=5)), + ], + ) + def test_dispatch_label_info(self, model, label_info, expected_label_info): + result = model._dispatch_label_info(label_info) + assert result == 
expected_label_info + + def test_init(self, model): + assert model.num_classes == 2 + + def test_customize_inputs(self, model, batch_data_entity): + customized_inputs = model._customize_inputs(batch_data_entity) + assert customized_inputs["images"].shape == (2, 3, model.input_size[0], model.input_size[1]) + assert "mode" in customized_inputs + assert "calibs" in customized_inputs + assert customized_inputs["calibs"].shape == (2, 3, 4) + + def test_customize_outputs_training(self, model, batch_data_entity): + outputs = {"loss": torch.tensor(0.5)} + customized_outputs = model._customize_outputs(outputs, batch_data_entity) + assert isinstance(customized_outputs, OTXBatchLossEntity) + assert customized_outputs["loss"] == torch.tensor(0.5) + + def test_customize_outputs_predict(self, model, batch_data_entity): + model.training = False + outputs = { + "scores": torch.randn(2, 50, 2), + "boxes_3d": torch.randn(2, 50, 6), + "boxes": torch.randn(2, 50, 4), + "size_3d": torch.randn(2, 50, 3), + "depth": torch.randn(2, 50, 2), + "heading_angle": torch.randn(2, 50, 24), + } + customized_outputs = model._customize_outputs(outputs, batch_data_entity) + assert isinstance(customized_outputs, Det3DBatchPredEntity) + assert hasattr(customized_outputs, "scores") + assert hasattr(customized_outputs, "heading_angle") + assert hasattr(customized_outputs, "boxes") + assert hasattr(customized_outputs, "size_2d") + assert len(customized_outputs.boxes_3d) == len(customized_outputs.scores) + + def test_dummy_input(self, model: OTX3DDetectionModel): + batch_size = 2 + batch = model.get_dummy_input(batch_size) + assert batch.batch_size == batch_size + + def test_convert_pred_entity_to_compute_metric(self, model: OTX3DDetectionModel, batch_data_entity): + model.training = False + outputs = { + "scores": torch.randn(2, 50, 2), + "boxes_3d": torch.randn(2, 50, 6), + "boxes": torch.randn(2, 50, 4), + "size_3d": torch.randn(2, 50, 3), + "depth": torch.randn(2, 50, 2), + "heading_angle": torch.randn(2, 50, 24), + } + customized_outputs = model._customize_outputs(outputs, batch_data_entity) + converted_pred = model._convert_pred_entity_to_compute_metric(customized_outputs, batch_data_entity) + + assert "preds" in converted_pred + assert "target" in converted_pred diff --git a/tests/unit/core/types/test_label.py b/tests/unit/core/types/test_label.py index 78daec6982e..3ae1ae1f463 100644 --- a/tests/unit/core/types/test_label.py +++ b/tests/unit/core/types/test_label.py @@ -1,7 +1,10 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations -from otx.core.types.label import NullLabelInfo, SegLabelInfo +from datumaro import LabelCategories +from datumaro.components.annotation import GroupType +from otx.core.types.label import HLabelInfo, NullLabelInfo, SegLabelInfo def test_as_json(fxt_label_info): @@ -18,3 +21,34 @@ def test_seg_label_info(): ) assert SegLabelInfo.from_num_classes(1) == SegLabelInfo(["background", "label_0"], [["background", "label_0"]]) assert SegLabelInfo.from_num_classes(0) == NullLabelInfo() + + +# Unit test +def test_hlabel_info(): + labels = [ + LabelCategories.Category(name="car", parent="vehicle"), + LabelCategories.Category(name="truck", parent="vehicle"), + LabelCategories.Category(name="plush toy", parent="plush toy"), + LabelCategories.Category(name="No class"), + ] + label_groups = [ + LabelCategories.LabelGroup( + name="Detection labels___vehicle", + labels=["car", "truck"], + group_type=GroupType.EXCLUSIVE, + ), + 
LabelCategories.LabelGroup( + name="Detection labels___plush toy", + labels=["plush toy"], + group_type=GroupType.EXCLUSIVE, + ), + LabelCategories.LabelGroup(name="No class", labels=["No class"], group_type=GroupType.RESTRICTED), + ] + dm_label_categories = LabelCategories(items=labels, label_groups=label_groups) + + hlabel_info = HLabelInfo.from_dm_label_groups(dm_label_categories) + + # Check if class_to_group_idx and label_to_idx have the same keys + assert list(hlabel_info.class_to_group_idx.keys()) == list( + hlabel_info.label_to_idx.keys(), + ), "class_to_group_idx and label_to_idx keys do not match" diff --git a/tests/unit/engine/adaptive_bs/test_bs_search_algo.py b/tests/unit/engine/adaptive_bs/test_bs_search_algo.py index fde7ceacda2..f59225e3b8a 100644 --- a/tests/unit/engine/adaptive_bs/test_bs_search_algo.py +++ b/tests/unit/engine/adaptive_bs/test_bs_search_algo.py @@ -99,12 +99,19 @@ def test_auto_decrease_batch_size(self): assert adapted_bs == 80 def test_find_max_usable_bs_gpu_memory_too_small(self): - mock_train_func = self.get_mock_train_func(cuda_oom_bound=4, max_runnable_bs=1) + mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1) bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000) with pytest.raises(RuntimeError): bs_search_algo.auto_decrease_batch_size() + def test_auto_decrease_batch_size_bs2_not_oom_but_most_mem(self): + """Batch size 2 doesn't make oom but use most of memory.""" + mock_train_func = self.get_mock_train_func(cuda_oom_bound=2, max_runnable_bs=1) + + bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000) + assert bs_search_algo.auto_decrease_batch_size() == 2 + @pytest.mark.parametrize( ("max_runnable_bs", "max_bs", "expected_bs"), [ @@ -126,12 +133,19 @@ def test_find_big_enough_batch_size(self, max_runnable_bs, max_bs, expected_bs): assert adapted_bs == expected_bs def test_find_big_enough_batch_size_gpu_memory_too_small(self): - mock_train_func = self.get_mock_train_func(cuda_oom_bound=4, max_runnable_bs=1) + mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1) bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000) with pytest.raises(RuntimeError): bs_search_algo.find_big_enough_batch_size() + def test_find_big_enough_batch_size_bs2_not_oom_but_most_mem(self): + """Batch size 2 doesn't make oom but use most of memory.""" + mock_train_func = self.get_mock_train_func(cuda_oom_bound=2, max_runnable_bs=1) + + bs_search_algo = BsSearchAlgo(mock_train_func, 2, 1000) + assert bs_search_algo.find_big_enough_batch_size() == 2 + def test_find_big_enough_batch_size_gradient_zero(self): def mock_train_func(batch_size) -> int: if batch_size > 1000: diff --git a/tests/unit/engine/hpo/test_hpo_api.py b/tests/unit/engine/hpo/test_hpo_api.py index bcc71d8bc9a..8b24dffcf00 100644 --- a/tests/unit/engine/hpo/test_hpo_api.py +++ b/tests/unit/engine/hpo/test_hpo_api.py @@ -119,7 +119,7 @@ def mock_find_trial_file(mocker) -> MagicMock: @pytest.fixture() def hpo_config() -> HpoConfig: - return HpoConfig(metric_name="val/accuracy") + return HpoConfig(metric_name="val/accuracy", callbacks_to_exclude="UselessCallback") @pytest.fixture() @@ -127,6 +127,19 @@ def mock_progress_update_callback() -> MagicMock: return MagicMock() +class UsefullCallback: + pass + + +class UselessCallback: + pass + + +@pytest.fixture() +def mock_callback() -> list: + return [UsefullCallback(), UselessCallback()] + + def test_execute_hpo( mock_engine: MagicMock, hpo_config: HpoConfig, @@ -138,12 +151,14 @@ def test_execute_hpo( 
mock_get_best_hpo_weight: MagicMock, mock_find_trial_file: MagicMock, mock_progress_update_callback: MagicMock, + mock_callback: list, ): + hpo_config.progress_update_callback = mock_progress_update_callback best_config, best_hpo_weight = execute_hpo( engine=mock_engine, max_epochs=10, hpo_config=hpo_config, - progress_update_callback=mock_progress_update_callback, + callbacks=mock_callback, ) # check hpo workdir exists @@ -152,12 +167,16 @@ def test_execute_hpo( # check a case where progress_update_callback exists mock_thread.assert_called_once() assert mock_thread.call_args.kwargs["target"] == _update_hpo_progress - assert mock_thread.call_args.kwargs["args"][0] == mock_progress_update_callback assert mock_thread.call_args.kwargs["daemon"] is True mock_thread.return_value.start.assert_called_once() # check whether run_hpo_loop is called well mock_run_hpo_loop.assert_called_once() assert mock_run_hpo_loop.call_args.args[0] == mock_hpo_algo + # check UselessCallback is excluded + for callback in mock_run_hpo_loop.call_args.args[1].keywords["callbacks"]: + assert not isinstance(callback, UselessCallback) + # check the original callback list isn't changed. + assert len(mock_callback) == 2 # print_result is called after HPO is done mock_hpo_algo.print_result.assert_called_once() # best_config and best_hpo_weight are returned well diff --git a/tests/unit/engine/test_engine.py b/tests/unit/engine/test_engine.py index 879987f19cc..3adcc5678d7 100644 --- a/tests/unit/engine/test_engine.py +++ b/tests/unit/engine/test_engine.py @@ -223,11 +223,7 @@ def test_exporting(self, fxt_engine, mocker) -> None: checkpoint = "path/to/checkpoint.ckpt" fxt_engine.checkpoint = checkpoint fxt_engine.export() - mock_load_from_checkpoint.assert_called_once_with( - checkpoint_path=checkpoint, - map_location="cpu", - **fxt_engine.model.hparams, - ) + mock_load_from_checkpoint.assert_called_once_with(checkpoint_path=checkpoint, map_location="cpu") mock_export.assert_called_once_with( output_dir=Path(fxt_engine.work_dir), base_name="exported_model",