masks_to_bounding_boxes op #4290

Changes from 7 commits
@@ -0,0 +1,73 @@

"""
=======================
Repurposing annotations
=======================

The following example illustrates the operations available in :ref:`the torchvision.ops module <ops>` for
repurposing object localization annotations for different tasks (e.g. transforming masks used by instance and
panoptic segmentation methods into bounding boxes used by object detection methods).
"""

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

import torch
import torchvision.transforms as T


plt.rcParams["savefig.bbox"] = 'tight'
orig_img = Image.open(Path('assets') / 'astronaut.jpg')
# If you change the seed, make sure that the randomly-applied transforms
# properly show that the image can be both transformed and *not* transformed!
torch.manual_seed(0)


def plot(imgs, with_orig=True, row_title=None, **imshow_kwargs):
    if not isinstance(imgs[0], list):
        # Make a 2D grid even if there's just one row
        imgs = [imgs]

    num_rows = len(imgs)
    num_cols = len(imgs[0]) + with_orig
    fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False)
    for row_idx, row in enumerate(imgs):
        row = [orig_img] + row if with_orig else row
        for col_idx, img in enumerate(row):
            ax = axs[row_idx, col_idx]
            ax.imshow(np.asarray(img), **imshow_kwargs)
            ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

    if with_orig:
        axs[0, 0].set(title='Original image')
        axs[0, 0].title.set_size(8)
    if row_title is not None:
        for row_idx in range(num_rows):
            axs[row_idx, 0].set(ylabel=row_title[row_idx])

    plt.tight_layout()

####################################
# Masks
# --------------------------------------
#
# In tasks like instance and panoptic segmentation, masks are commonly defined (and are defined by this package)
# as a multi-dimensional array (e.g. a NumPy array or a PyTorch tensor) with the following shape:
#
#     (objects, height, width)
#
# where ``objects`` is the number of annotated objects in the image. Each ``(height, width)`` slice corresponds
# to exactly one object. For example, if your input image has dimensions 224 x 224 and contains four annotated
# objects, your masks annotation has the following shape:
#
#     (4, 224, 224)
#
# A nice property of masks is that they can easily be repurposed for methods that solve a variety of object
# localization tasks.
#
# Masks to bounding boxes
# ~~~~~~~~~~~~~~~~~~~~~~~
#
# For example, the ``masks_to_bounding_boxes`` operation can be used to transform masks into bounding boxes
# that can be used in methods like Faster R-CNN and YOLO.
padded_imgs = [T.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)]
plot(padded_imgs)
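The gallery snippet above still ends with the padding example as a placeholder and never calls the new op on actual masks. As a self-contained sketch of what the masks-to-boxes conversion computes (the helper name ``boxes_from_masks`` and the synthetic masks below are my own illustration, not part of the PR):

```python
import torch


def boxes_from_masks(masks: torch.Tensor) -> torch.Tensor:
    # Naive per-mask reference: the xyxy box is the min/max of the
    # nonzero pixel coordinates of each (H, W) mask.
    boxes = torch.zeros((masks.shape[0], 4), dtype=torch.float)
    for i, mask in enumerate(masks):
        ys, xs = torch.where(mask != 0)
        boxes[i] = torch.tensor(
            [xs.min().item(), ys.min().item(), xs.max().item(), ys.max().item()],
            dtype=torch.float,
        )
    return boxes


# Two synthetic masks in a 6 x 6 image: a 2 x 2 square and a 1 x 3 strip.
masks = torch.zeros((2, 6, 6), dtype=torch.int64)
masks[0, 1:3, 1:3] = 1  # rows 1-2, cols 1-2
masks[1, 4, 2:5] = 1    # row 4, cols 2-4

print(boxes_from_masks(masks))
# tensor([[1., 1., 2., 2.],
#         [2., 4., 4., 4.]])
```

The loop form trades the vectorized tricks of the actual implementation for readability; both produce boxes in xyxy format.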
@@ -0,0 +1,43 @@

import os.path

import PIL.Image
import numpy
import pytest
import torch

import torchvision.ops

ASSETS_DIRECTORY = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")


@pytest.fixture
def labeled_image() -> torch.Tensor:
    with PIL.Image.open(os.path.join(ASSETS_DIRECTORY, "labeled_image.png")) as image:
        return torch.tensor(numpy.array(image, numpy.int64))


@pytest.fixture
def masks() -> torch.Tensor:
    with PIL.Image.open(os.path.join(ASSETS_DIRECTORY, "masks.tiff")) as image:
        frames = numpy.zeros((image.n_frames, image.height, image.width), numpy.int64)

        for index in range(image.n_frames):
            image.seek(index)

            frames[index] = numpy.array(image)

        return torch.tensor(frames)


def test_masks_to_bounding_boxes(masks):
    expected = torch.tensor(
        [[ 127.,    2.,  165.,   40.],  # noqa: E121, E201, E202, E241
         [   4.,  100.,   88.,  184.],  # noqa: E201, E202, E241
         [ 168.,  189.,  294.,  300.],  # noqa: E201, E202, E241
         [ 556.,  272.,  700.,  416.],  # noqa: E201, E202, E241
         [ 800.,  560.,  990.,  725.],  # noqa: E201, E202, E241
         [ 294.,  828.,  594., 1092.],  # noqa: E201, E202, E241
         [ 756., 1036., 1064., 1491.]]  # noqa: E201, E202, E241
    )

    torch.testing.assert_close(torchvision.ops.masks_to_bounding_boxes(masks), expected)

Review comment on the masks fixture: Do you think it would be possible to write a test without the need for new images and hard-coded coordinates? Ideally, we could generate random masks and have a super simple version of …

Reply: Yep. I wrote about this elsewhere in the thread. I'd love to add a generator for various outputs similar to the function @goldsborough and I wrote for scikit-image (…)

@NicolasHug a friendly bump
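Following the reviewer's suggestion of testing without shipped image assets, one approach is to generate masks whose boxes are known analytically. A hypothetical sketch (the generator name, shapes, and seed are my assumptions, not part of the PR): each mask is a filled random rectangle, so the expected xyxy box is known by construction.

```python
import torch


def random_rectangle_masks(num=5, size=64, seed=0):
    # Each mask is a filled axis-aligned rectangle, so the expected
    # xyxy box (x0, y0, x1, y1) is known without hard-coded assets.
    generator = torch.Generator().manual_seed(seed)
    masks = torch.zeros((num, size, size), dtype=torch.int64)
    expected = torch.zeros((num, 4), dtype=torch.float)
    for i in range(num):
        x0 = int(torch.randint(0, size - 1, (1,), generator=generator))
        y0 = int(torch.randint(0, size - 1, (1,), generator=generator))
        x1 = int(torch.randint(x0 + 1, size, (1,), generator=generator))
        y1 = int(torch.randint(y0 + 1, size, (1,), generator=generator))
        masks[i, y0:y1 + 1, x0:x1 + 1] = 1
        expected[i] = torch.tensor([x0, y0, x1, y1], dtype=torch.float)
    return masks, expected


masks, expected = random_rectangle_masks()
# A test would then assert:
# torch.testing.assert_close(torchvision.ops.masks_to_bounding_boxes(masks), expected)
```

Rectangles are the simplest case where the expected box is known by construction; irregular random blobs would instead need a naive reference implementation to compare against.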
@@ -0,0 +1,26 @@

import torch


def masks_to_bounding_boxes(masks: torch.Tensor) -> torch.Tensor:
    """Compute the bounding boxes around the provided masks.

    The masks should be in format [N, H, W] where N is the number of masks and (H, W) are the spatial dimensions.

    Returns a [N, 4] tensor, with the boxes in xyxy format.
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device)

    h, w = masks.shape[-2:]

    y = torch.arange(0, h, dtype=torch.float)
    x = torch.arange(0, w, dtype=torch.float)
    y, x = torch.meshgrid(y, x)

    x_mask = masks * x.unsqueeze(0)
    x_max = x_mask.flatten(1).max(-1)[0]
    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]

    y_mask = masks * y.unsqueeze(0)
    y_max = y_mask.flatten(1).max(-1)[0]
    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]

    return torch.stack([x_min, y_min, x_max, y_max], 1)
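One subtlety in the implementation above: multiplying by the coordinate grid leaves zeros on background pixels, which cannot corrupt the max (coordinates are non-negative) but would wrongly win the min. That is what the masked_fill with a large constant guards against. A minimal sketch of the failure mode, with toy values of my own:

```python
import torch

mask = torch.tensor([[0, 1, 1, 0]])     # one 1 x 4 "mask", object at cols 1-2
x = torch.arange(4, dtype=torch.float)  # column coordinate grid: 0, 1, 2, 3

x_mask = mask * x  # -> [[0., 1., 2., 0.]]; background collapses to 0

naive_min = x_mask.flatten(1).min(-1)[0]
guarded_min = x_mask.masked_fill(~mask.bool(), 1e8).flatten(1).min(-1)[0]

print(float(naive_min), float(guarded_min))  # 0.0 1.0 -- only the guarded min is correct
```

The max path needs no such guard because every coordinate is >= 0, so a background zero can never exceed a true object coordinate.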
Review comment: After some debugging I found out the reason for the build_docs CI failure. The problem is that torchvision.ops does not have a nice index on the right side (basically an HTML link to #ops, like transforms has). This causes the CI failure. We need to remove the ref, and it will work fine. This is a slightly hacky fix, but it works. I tried running it locally; I could build the gallery example, and it looks nice.

Reply: Nice! I appreciate the debugging.
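For reference, the fix described above amounts to dropping the cross-reference role from the gallery docstring. A hypothetical sketch of the kind of change (the exact line may differ):

```
- The following example illustrates the operations available in :ref:`the torchvision.ops module <ops>` for
+ The following example illustrates the operations available in the torchvision.ops module for
```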