Recipe refinement (det, iseg, r-det, sseg, vpm, zsl) (#3712)
* Create base data recipe for each task
* Remove configs which have default value
* Performance check w/ medium dataset for reproducibility
sungchul2 authored Jul 11, 2024
1 parent 2adf5b1 commit 746d07e
Showing 52 changed files with 493 additions and 1,256 deletions.
2 changes: 1 addition & 1 deletion src/otx/algo/segmentation/dino_v2_seg.py
@@ -22,7 +22,7 @@ class DinoV2Seg(BaseSegmModel):
     """DinoV2Seg Model."""
 
     default_backbone_configuration: ClassVar[dict[str, Any]] = {
-        "name": "dinov2_vits14_reg",
+        "name": "dinov2_vits14",
         "freeze_backbone": True,
         "out_index": [8, 9, 10, 11],
     }
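For reference, the backbone `name` maps to an entry point in the DINOv2 torch.hub model zoo; `dinov2_vits14` is the ViT-S/14 variant without register tokens, while the removed `dinov2_vits14_reg` is the variant with them. A minimal sketch of how a frozen multi-level backbone could be built from a configuration like this, assuming the public `facebookresearch/dinov2` hub entry point (illustrative only, not the OTX implementation):

import torch

# Hypothetical stand-alone rebuild of default_backbone_configuration above.
cfg = {"name": "dinov2_vits14", "freeze_backbone": True, "out_index": [8, 9, 10, 11]}

backbone = torch.hub.load("facebookresearch/dinov2", cfg["name"])
if cfg["freeze_backbone"]:
    for p in backbone.parameters():
        p.requires_grad = False  # backbone stays fixed; only the seg head would train

# DINOv2 ViTs expose get_intermediate_layers(); blocks 8-11 become the
# multi-scale feature maps consumed by the segmentation decoder.
x = torch.randn(1, 3, 518, 518)  # H, W must be multiples of the 14-px patch size
feats = backbone.get_intermediate_layers(x, n=cfg["out_index"], reshape=True)
print([tuple(f.shape) for f in feats])  # 4 x (1, 384, 37, 37) for ViT-S/14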
39 changes: 39 additions & 0 deletions src/otx/recipe/_base_/data/detection.yaml
@@ -0,0 +1,39 @@
task: DETECTION
mem_cache_size: 1GB
mem_cache_img_max_size: null
image_color_channel: RGB
stack_images: true
data_format: coco_instances
unannotated_items_ratio: 0.0
train_subset:
  subset_name: train
  transform_lib_type: TORCHVISION
  batch_size: 1
  num_workers: 2
  to_tv_image: false
  transforms:
    - class_path: torchvision.transforms.v2.ToImage
  sampler:
    class_path: torch.utils.data.RandomSampler

val_subset:
  subset_name: val
  transform_lib_type: TORCHVISION
  batch_size: 1
  num_workers: 2
  to_tv_image: false
  transforms:
    - class_path: torchvision.transforms.v2.ToImage
  sampler:
    class_path: torch.utils.data.RandomSampler

test_subset:
  subset_name: test
  transform_lib_type: TORCHVISION
  batch_size: 1
  num_workers: 2
  to_tv_image: false
  transforms:
    - class_path: torchvision.transforms.v2.ToImage
  sampler:
    class_path: torch.utils.data.RandomSampler
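Every recipe in this PR declares objects with the same `class_path`/`init_args` convention, which a jsonargparse-style loader turns into live Python objects. A rough sketch of that resolution with just the standard library and PyYAML, assuming the file above sits at its repository path (the actual OTX config loader adds validation and resolver handling, e.g. for `${as_torch_dtype:...}`, on top):

import importlib
import yaml  # PyYAML

def instantiate(spec: dict):
    """Resolve one {class_path, init_args} mapping into an instance."""
    module_name, _, class_name = spec["class_path"].rpartition(".")
    cls = getattr(importlib.import_module(module_name), class_name)
    return cls(**spec.get("init_args", {}))

with open("src/otx/recipe/_base_/data/detection.yaml") as f:
    cfg = yaml.safe_load(f)

# Build the declared train-time transform pipeline: here a single ToImage().
transforms = [instantiate(t) for t in cfg["train_subset"]["transforms"]]
print(transforms)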
38 changes: 38 additions & 0 deletions src/otx/recipe/_base_/data/instance_segmentation.yaml
@@ -0,0 +1,38 @@
task: INSTANCE_SEGMENTATION
mem_cache_size: 1GB
mem_cache_img_max_size: null
image_color_channel: RGB
stack_images: true
data_format: coco_instances
include_polygons: true
unannotated_items_ratio: 0.0
train_subset:
  subset_name: train
  transform_lib_type: TORCHVISION
  to_tv_image: true
  transforms:
    - class_path: torchvision.transforms.v2.ToImage
  batch_size: 1
  num_workers: 2
  sampler:
    class_path: torch.utils.data.RandomSampler
val_subset:
  subset_name: val
  transform_lib_type: TORCHVISION
  to_tv_image: true
  transforms:
    - class_path: torchvision.transforms.v2.ToImage
  batch_size: 1
  num_workers: 2
  sampler:
    class_path: torch.utils.data.RandomSampler
test_subset:
  subset_name: test
  transform_lib_type: TORCHVISION
  to_tv_image: true
  transforms:
    - class_path: torchvision.transforms.v2.ToImage
  batch_size: 1
  num_workers: 2
  sampler:
    class_path: torch.utils.data.RandomSampler
38 changes: 38 additions & 0 deletions src/otx/recipe/_base_/data/rotated_detection.yaml
@@ -0,0 +1,38 @@
task: ROTATED_DETECTION
mem_cache_size: 1GB
mem_cache_img_max_size: null
image_color_channel: RGB
stack_images: true
data_format: coco_instances
include_polygons: true
unannotated_items_ratio: 0.0
train_subset:
  subset_name: train
  transform_lib_type: TORCHVISION
  to_tv_image: false
  transforms:
    - class_path: torchvision.transforms.v2.ToImage
  batch_size: 1
  num_workers: 2
  sampler:
    class_path: torch.utils.data.RandomSampler
val_subset:
  subset_name: val
  transform_lib_type: TORCHVISION
  to_tv_image: false
  transforms:
    - class_path: torchvision.transforms.v2.ToImage
  batch_size: 1
  num_workers: 2
  sampler:
    class_path: torch.utils.data.RandomSampler
test_subset:
  subset_name: test
  transform_lib_type: TORCHVISION
  to_tv_image: false
  transforms:
    - class_path: torchvision.transforms.v2.ToImage
  batch_size: 1
  num_workers: 2
  sampler:
    class_path: torch.utils.data.RandomSampler
82 changes: 82 additions & 0 deletions src/otx/recipe/_base_/data/semantic_segmentation.yaml
@@ -0,0 +1,82 @@
task: SEMANTIC_SEGMENTATION
mem_cache_size: 1GB
mem_cache_img_max_size: null
image_color_channel: RGB
data_format: common_semantic_segmentation_with_subset_dirs
include_polygons: true
unannotated_items_ratio: 0.0
ignore_index: 255
train_subset:
  subset_name: train
  batch_size: 8
  num_workers: 4
  transform_lib_type: TORCHVISION
  to_tv_image: true
  transforms:
    - class_path: torchvision.transforms.v2.RandomResizedCrop
      init_args:
        size:
          - 512
          - 512
        scale:
          - 0.2
          - 1.0
        ratio:
          - 0.5
          - 2.0
        antialias: true
    - class_path: otx.core.data.transform_libs.torchvision.PhotoMetricDistortion
      init_args:
        is_numpy_to_tvtensor: true
    - class_path: torchvision.transforms.v2.RandomHorizontalFlip
    - class_path: torchvision.transforms.v2.ToDtype
      init_args:
        dtype: ${as_torch_dtype:torch.float32}
    - class_path: torchvision.transforms.v2.Normalize
      init_args:
        mean: [123.675, 116.28, 103.53]
        std: [58.395, 57.12, 57.375]
  sampler:
    class_path: torch.utils.data.RandomSampler
val_subset:
  subset_name: val
  batch_size: 8
  num_workers: 4
  transform_lib_type: TORCHVISION
  to_tv_image: true
  transforms:
    - class_path: torchvision.transforms.v2.Resize
      init_args:
        size:
          - 512
          - 512
    - class_path: torchvision.transforms.v2.ToDtype
      init_args:
        dtype: ${as_torch_dtype:torch.float32}
    - class_path: torchvision.transforms.v2.Normalize
      init_args:
        mean: [123.675, 116.28, 103.53]
        std: [58.395, 57.12, 57.375]
  sampler:
    class_path: torch.utils.data.RandomSampler
test_subset:
  subset_name: test
  num_workers: 4
  batch_size: 8
  transform_lib_type: TORCHVISION
  to_tv_image: true
  transforms:
    - class_path: torchvision.transforms.v2.Resize
      init_args:
        size:
          - 512
          - 512
    - class_path: torchvision.transforms.v2.ToDtype
      init_args:
        dtype: ${as_torch_dtype:torch.float32}
    - class_path: torchvision.transforms.v2.Normalize
      init_args:
        mean: [123.675, 116.28, 103.53]
        std: [58.395, 57.12, 57.375]
  sampler:
    class_path: torch.utils.data.RandomSampler
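The train pipeline above is a conventional crop / photometric-distort / flip / normalize recipe; the mean and std are the standard ImageNet statistics on the 0-255 scale, which is why `ToDtype` casts to float32 without rescaling beforehand. A torchvision-only sketch of the same pipeline, with `ColorJitter` standing in for OTX's `PhotoMetricDistortion` (so the color augmentation is approximate, not numerically identical):

import torch
from torchvision.transforms import v2

train_tf = v2.Compose([
    v2.ToImage(),
    v2.RandomResizedCrop(size=(512, 512), scale=(0.2, 1.0), ratio=(0.5, 2.0), antialias=True),
    v2.ColorJitter(brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05),  # rough stand-in
    v2.RandomHorizontalFlip(),
    v2.ToDtype(torch.float32),  # cast only; pixel values remain in 0-255
    v2.Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]),
])

img = torch.randint(0, 256, (3, 600, 800), dtype=torch.uint8)
out = train_tf(img)
print(out.shape, out.dtype)  # torch.Size([3, 512, 512]) torch.float32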
8 changes: 4 additions & 4 deletions src/otx/recipe/_base_/data/torchvision_base.yaml
@@ -2,13 +2,13 @@ task: MULTI_CLASS_CLS
 mem_cache_size: 1GB
 mem_cache_img_max_size: null
 image_color_channel: RGB
-stack_images: False
+stack_images: false
 data_format: imagenet_with_subset_dirs
 unannotated_items_ratio: 0.0
 train_subset:
   subset_name: train
   transform_lib_type: TORCHVISION
-  to_tv_image: True
+  to_tv_image: true
   transforms:
     - class_path: torchvision.transforms.v2.ToImage
   batch_size: 1
@@ -18,7 +18,7 @@ train_subset:
 val_subset:
   subset_name: val
   transform_lib_type: TORCHVISION
-  to_tv_image: True
+  to_tv_image: true
   transforms:
     - class_path: torchvision.transforms.v2.ToImage
   batch_size: 1
@@ -28,7 +28,7 @@ val_subset:
 test_subset:
   subset_name: test
   transform_lib_type: TORCHVISION
-  to_tv_image: True
+  to_tv_image: true
   transforms:
     - class_path: torchvision.transforms.v2.ToImage
   batch_size: 1
76 changes: 76 additions & 0 deletions src/otx/recipe/_base_/data/visual_prompting.yaml
@@ -0,0 +1,76 @@
task: VISUAL_PROMPTING
mem_cache_size: 1GB
mem_cache_img_max_size: null
image_color_channel: RGB
stack_images: false
data_format: coco_instances
unannotated_items_ratio: 0.0
vpm_config:
  use_bbox: true
  use_point: false

train_subset:
  subset_name: train
  transform_lib_type: TORCHVISION
  to_tv_image: true
  transforms:
    - class_path: otx.core.data.transform_libs.torchvision.ResizetoLongestEdge
      init_args:
        size: 1024
        antialias: true
    - class_path: otx.core.data.transform_libs.torchvision.PadtoSquare
    - class_path: torchvision.transforms.v2.ToDtype
      init_args:
        dtype: ${as_torch_dtype:torch.float32}
    - class_path: torchvision.transforms.v2.Normalize
      init_args:
        mean: [123.675, 116.28, 103.53]
        std: [58.395, 57.12, 57.375]
  batch_size: 2
  num_workers: 4
  sampler:
    class_path: torch.utils.data.RandomSampler

val_subset:
  subset_name: val
  transform_lib_type: TORCHVISION
  to_tv_image: true
  transforms:
    - class_path: otx.core.data.transform_libs.torchvision.ResizetoLongestEdge
      init_args:
        size: 1024
        antialias: true
    - class_path: otx.core.data.transform_libs.torchvision.PadtoSquare
    - class_path: torchvision.transforms.v2.ToDtype
      init_args:
        dtype: ${as_torch_dtype:torch.float32}
    - class_path: torchvision.transforms.v2.Normalize
      init_args:
        mean: [123.675, 116.28, 103.53]
        std: [58.395, 57.12, 57.375]
  batch_size: 1
  num_workers: 4
  sampler:
    class_path: torch.utils.data.RandomSampler

test_subset:
  subset_name: test
  transform_lib_type: TORCHVISION
  to_tv_image: true
  transforms:
    - class_path: otx.core.data.transform_libs.torchvision.ResizetoLongestEdge
      init_args:
        size: 1024
        antialias: true
    - class_path: otx.core.data.transform_libs.torchvision.PadtoSquare
    - class_path: torchvision.transforms.v2.ToDtype
      init_args:
        dtype: ${as_torch_dtype:torch.float32}
    - class_path: torchvision.transforms.v2.Normalize
      init_args:
        mean: [123.675, 116.28, 103.53]
        std: [58.395, 57.12, 57.375]
  batch_size: 1
  num_workers: 4
  sampler:
    class_path: torch.utils.data.RandomSampler
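`ResizetoLongestEdge` followed by `PadtoSquare` is the SAM-style preprocessing: scale the image so its longer side equals 1024, then pad the bottom and right edges out to a square 1024x1024 input. A self-contained sketch of that geometry with plain torch ops, assuming a CHW float image (the real OTX transforms also remap boxes, points, and masks, which this omits):

import torch
import torch.nn.functional as F

def resize_longest_then_pad(img: torch.Tensor, target: int = 1024) -> torch.Tensor:
    """Resize a CHW image so max(H, W) == target, then zero-pad to target x target."""
    _, h, w = img.shape
    scale = target / max(h, w)
    new_h, new_w = round(h * scale), round(w * scale)
    img = F.interpolate(img[None], size=(new_h, new_w), mode="bilinear", antialias=True)[0]
    # Pad right/bottom only, so prompt coordinates need the same scale and no offset.
    return F.pad(img, (0, target - new_w, 0, target - new_h))

out = resize_longest_then_pad(torch.rand(3, 480, 640))
print(out.shape)  # torch.Size([3, 1024, 1024])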
44 changes: 44 additions & 0 deletions src/otx/recipe/_base_/data/zero_shot_visual_prompting.yaml
@@ -0,0 +1,44 @@
task: ZERO_SHOT_VISUAL_PROMPTING
mem_cache_size: 1GB
mem_cache_img_max_size: null
image_color_channel: RGB
stack_images: false
data_format: coco_instances
unannotated_items_ratio: 0.0

vpm_config:
  use_bbox: True
  use_point: False

train_subset:
  subset_name: train
  transform_lib_type: TORCHVISION
  to_tv_image: true
  transforms:
    - class_path: torchvision.transforms.v2.ToImage
  batch_size: 1
  num_workers: 4
  sampler:
    class_path: torch.utils.data.RandomSampler

val_subset:
  subset_name: val
  transform_lib_type: TORCHVISION
  to_tv_image: true
  transforms:
    - class_path: torchvision.transforms.v2.ToImage
  batch_size: 1
  num_workers: 4
  sampler:
    class_path: torch.utils.data.RandomSampler

test_subset:
  subset_name: test
  transform_lib_type: TORCHVISION
  to_tv_image: true
  transforms:
    - class_path: torchvision.transforms.v2.ToImage
  batch_size: 1
  num_workers: 4
  sampler:
    class_path: torch.utils.data.RandomSampler