From 26db16d6d5902c524afdfe08e62d77e73cffbb03 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Tue, 18 May 2021 09:58:52 +0100
Subject: [PATCH 1/3] Added cross-refs, etc.

---
 docs/source/models.rst                        |  6 ++++-
 gallery/plot_visualization_utils.py           | 22 +++++++++++++++----
 torchvision/models/detection/faster_rcnn.py   | 16 +++++++++-----
 torchvision/models/detection/keypoint_rcnn.py |  8 ++++---
 torchvision/models/detection/mask_rcnn.py     |  8 ++++---
 torchvision/models/detection/retinanet.py     |  8 ++++---
 6 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/docs/source/models.rst b/docs/source/models.rst
index c2b81e49735..56dce6a76dc 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -329,6 +329,8 @@ The images have to be loaded in to a range of ``[0, 1]`` and then normalized usi
 ``mean = [0.485, 0.456, 0.406]`` and ``std = [0.229, 0.224, 0.225]``.
 They have been trained on images resized such that their minimum size is 520.
 
+For details on how to plot the masks of such models, you may refer to :ref:`semantic_seg_output`.
+
 The pre-trained models have been trained on a subset of COCO train2017, on the 20 categories that are
 present in the Pascal VOC dataset. You can see more information on how the subset has been selected in
 ``references/segmentation/coco_utils.py``. The classes that the pre-trained model outputs are the following,
@@ -374,6 +376,7 @@ LR-ASPP
 
 .. autofunction:: torchvision.models.segmentation.lraspp_mobilenet_v3_large
 
+.. _object_det_inst_seg_pers_keypoint_det:
 
 Object Detection, Instance Segmentation and Person Keypoint Detection
 =====================================================================
@@ -392,7 +395,8 @@ in torchvision.
 
 The models expect a list of ``Tensor[C, H, W]``, in the range ``0-1``.
 The models internally resize the images but the behaviour varies depending
-on the model. Check the constructor of the models for more information.
+on the model. Check the constructor of the models for more information. The
+output format of such models is illustrated in :ref:`instance_seg_output`.
 
 
 For object detection and instance segmentation, the pre-trained
diff --git a/gallery/plot_visualization_utils.py b/gallery/plot_visualization_utils.py
index 04c5e3dcb53..b5b3d7e103e 100644
--- a/gallery/plot_visualization_utils.py
+++ b/gallery/plot_visualization_utils.py
@@ -68,7 +68,8 @@ def show(imgs):
 # models. Here is demo with a Faster R-CNN model loaded from
 # :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`
 # model. You can also try using a RetinaNet with
-# :func:`~torchvision.models.detection.retinanet_resnet50_fpn`.
+# :func:`~torchvision.models.detection.retinanet_resnet50_fpn`. For more details
+# on the output of such models, you may refer to :ref:`instance_seg_output`.
 
 from torchvision.models.detection import fasterrcnn_resnet50_fpn
 from torchvision.transforms.functional import convert_image_dtype
@@ -87,9 +88,9 @@ def show(imgs):
 # Let's plot the boxes detected by our model. We will only plot the boxes with a
 # score greater than a given threshold.
 
-threshold = .8
+score_threshold = .8
 dogs_with_boxes = [
-    draw_bounding_boxes(dog_int, boxes=output['boxes'][output['scores'] > threshold], width=4)
+    draw_bounding_boxes(dog_int, boxes=output['boxes'][output['scores'] > score_threshold], width=4)
     for dog_int, output in zip(batch_int, outputs)
 ]
 show(dogs_with_boxes)
@@ -102,6 +103,8 @@ def show(imgs):
 # segmentation models have different outputs, so we will treat each
 # independently.
 #
+# .. _semantic_seg_output:
+#
 # Semantic segmentation models
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 #
@@ -237,6 +240,8 @@ def show(imgs):
 
 
 #####################################
+# .. _instance_seg_output:
+#
 # Instance segmentation models
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 #
@@ -245,6 +250,15 @@ def show(imgs):
 # models. Let's start by analyzing the output of a Mask-RCNN model. Note that
 # these models don't require the images to be normalized, so we don't need to
 # use the normalized batch.
+#
+# .. note::
+#
+#     We will here describe the output of a Mask-RCNN model. The models in
+#     :ref:`object_det_inst_seg_pers_keypoint_det` all have a similar output
+#     format, but some of them may have extra info like keypoints for
+#     :func:`~torchvision.models.detection.keypointrcnn_resnet50_fpn`, and some
+#     of them may not have masks, like
+#     :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`.
 
 from torchvision.models.detection import maskrcnn_resnet50_fpn
 model = maskrcnn_resnet50_fpn(pretrained=True, progress=False)
@@ -255,7 +269,7 @@ def show(imgs):
 
 #####################################
 # Let's break this down. For each image in the batch, the model outputs some
-# detections (or instances). The number of detection varies for each input
+# detections (or instances). The number of detections varies for each input
 # image. Each instance is described by its bounding box, its label, its score
 # and its mask.
 #
diff --git a/torchvision/models/detection/faster_rcnn.py b/torchvision/models/detection/faster_rcnn.py
index 53100e54adb..7137ca3d987 100644
--- a/torchvision/models/detection/faster_rcnn.py
+++ b/torchvision/models/detection/faster_rcnn.py
@@ -317,12 +317,14 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
-    follows:
+    follows, where ``N`` is the number of detections:
 
         - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
           ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
-        - labels (``Int64Tensor[N]``): the predicted labels for each image
-        - scores (``Tensor[N]``): the scores or each prediction
+        - labels (``Int64Tensor[N]``): the predicted labels for each detection
+        - scores (``Tensor[N]``): the scores of each detection
+
+    For more details on the output, you may refer to :ref:`instance_seg_output`.
 
     Faster R-CNN is exportable to ONNX for a fixed batch size with inputs images of fixed size.
 
@@ -399,7 +401,9 @@ def fasterrcnn_mobilenet_v3_large_320_fpn(pretrained=False, progress=True, num_c
                                           trainable_backbone_layers=None, **kwargs):
     """
     Constructs a low resolution Faster R-CNN model with a MobileNetV3-Large FPN backbone tunned for mobile use-cases.
-    It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See `fasterrcnn_resnet50_fpn` for more details.
+    It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
+    :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
+    details.
 
     Example::
 
@@ -435,7 +439,9 @@ def fasterrcnn_mobilenet_v3_large_fpn(pretrained=False, progress=True, num_class
                                       trainable_backbone_layers=None, **kwargs):
     """
     Constructs a high resolution Faster R-CNN model with a MobileNetV3-Large FPN backbone.
-    It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See `fasterrcnn_resnet50_fpn` for more details.
+    It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
+    :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
+    details.
 
     Example::
 
diff --git a/torchvision/models/detection/keypoint_rcnn.py b/torchvision/models/detection/keypoint_rcnn.py
index 0d460ade27c..0d9a4de6dca 100644
--- a/torchvision/models/detection/keypoint_rcnn.py
+++ b/torchvision/models/detection/keypoint_rcnn.py
@@ -297,14 +297,16 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
-    follows:
+    follows, where ``N`` is the number of detected instances:
 
         - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
-        - labels (``Int64Tensor[N]``): the predicted labels for each image
-        - scores (``Tensor[N]``): the scores or each prediction
+        - labels (``Int64Tensor[N]``): the predicted labels for each instance
+        - scores (``Tensor[N]``): the scores of each instance
         - keypoints (``FloatTensor[N, K, 3]``): the locations of the predicted keypoints, in ``[x, y, v]`` format.
 
+    For more details on the output, you may refer to :ref:`instance_seg_output`.
+
     Keypoint R-CNN is exportable to ONNX for a fixed batch size with inputs images of fixed size.
 
     Example::
 
diff --git a/torchvision/models/detection/mask_rcnn.py b/torchvision/models/detection/mask_rcnn.py
index 1e6fb77f07a..589a42068bf 100644
--- a/torchvision/models/detection/mask_rcnn.py
+++ b/torchvision/models/detection/mask_rcnn.py
@@ -289,16 +289,18 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
-    follows:
+    follows, where ``N`` is the number of detected instances:
 
         - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
-        - labels (``Int64Tensor[N]``): the predicted labels for each image
-        - scores (``Tensor[N]``): the scores or each prediction
+        - labels (``Int64Tensor[N]``): the predicted labels for each instance
+        - scores (``Tensor[N]``): the scores of each instance
         - masks (``UInt8Tensor[N, 1, H, W]``): the predicted masks for each instance, in ``0-1`` range. In order to
          obtain the final segmentation masks, the soft masks can be thresholded, generally
          with a value of 0.5 (``mask >= 0.5``)
 
+    For more details on the output and on how to plot the masks, you may refer to :ref:`instance_seg_output`.
+
     Mask R-CNN is exportable to ONNX for a fixed batch size with inputs images of fixed size.
 
     Example::
 
diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py
index 43b0d14dd5e..af6943628a4 100644
--- a/torchvision/models/detection/retinanet.py
+++ b/torchvision/models/detection/retinanet.py
@@ -586,12 +586,14 @@ def retinanet_resnet50_fpn(pretrained=False, progress=True,
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
-    follows:
+    follows, where ``N`` is the number of detections:
 
         - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
-        - labels (``Int64Tensor[N]``): the predicted labels for each image
-        - scores (``Tensor[N]``): the scores or each prediction
+        - labels (``Int64Tensor[N]``): the predicted labels for each detection
+        - scores (``Tensor[N]``): the scores of each detection
+
+    For more details on the output, you may refer to :ref:`instance_seg_output`.
 
     Example::
 

From 9d99ea7154d5bc76edef1f9c7fcd9694eae345bf Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Tue, 18 May 2021 10:07:34 +0100
Subject: [PATCH 2/3] copy pasted SSD docs since the class isn't rendered in the html

---
 torchvision/models/detection/ssd.py     | 37 +++++++++++++++++++++----
 torchvision/models/detection/ssdlite.py |  5 ++--
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/torchvision/models/detection/ssd.py b/torchvision/models/detection/ssd.py
index f6150cf5cd5..6695167d462 100644
--- a/torchvision/models/detection/ssd.py
+++ b/torchvision/models/detection/ssd.py
@@ -126,11 +126,12 @@ class SSD(nn.Module):
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
-    follows:
+    follows, where ``N`` is the number of detections:
+
         - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
-        - labels (Int64Tensor[N]): the predicted labels for each image
-        - scores (Tensor[N]): the scores for each prediction
+        - labels (Int64Tensor[N]): the predicted labels for each detection
+        - scores (Tensor[N]): the scores for each detection
 
     Args:
         backbone (nn.Module): the network used to compute the features for the model.
@@ -520,8 +521,34 @@ def _vgg_extractor(backbone_name: str, highres: bool, progress: bool, pretrained
 
 
 def ssd300_vgg16(pretrained: bool = False, progress: bool = True, num_classes: int = 91,
                  pretrained_backbone: bool = True, trainable_backbone_layers: Optional[int] = None, **kwargs: Any):
-    """
-    Constructs an SSD model with input size 300x300 and a VGG16 backbone. See `SSD` for more details.
+    """Constructs an SSD model with input size 300x300 and a VGG16 backbone.
+
+    Reference: `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.
+
+    The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
+    image, and should be in 0-1 range. Different images can have different sizes but they will be resized
+    to a fixed size before passing it to the backbone.
+
+    The behavior of the model changes depending if it is in training or evaluation mode.
+
+    During training, the model expects both the input tensors, as well as a targets (list of dictionary),
+    containing:
+
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the class label for each ground-truth box
+
+    The model returns a Dict[Tensor] during training, containing the classification and regression
+    losses.
+
+    During inference, the model requires only the input tensors, and returns the post-processed
+    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
+    follows, where ``N`` is the number of detections:
+
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
+          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
+        - labels (Int64Tensor[N]): the predicted labels for each detection
+        - scores (Tensor[N]): the scores for each detection
 
     Example:
 
diff --git a/torchvision/models/detection/ssdlite.py b/torchvision/models/detection/ssdlite.py
index dcab863feb1..56e0d0e4958 100644
--- a/torchvision/models/detection/ssdlite.py
+++ b/torchvision/models/detection/ssdlite.py
@@ -158,8 +158,9 @@ def ssdlite320_mobilenet_v3_large(pretrained: bool = False, progress: bool = Tru
                                   pretrained_backbone: bool = False, trainable_backbone_layers: Optional[int] = None,
                                   norm_layer: Optional[Callable[..., nn.Module]] = None, **kwargs: Any):
-    """
-    Constructs an SSDlite model with input size 320x320 and a MobileNetV3 Large backbone. See `SSD` for more details.
+    """Constructs an SSDlite model with input size 320x320 and a MobileNetV3 Large backbone.
+
+    See :func:`~torchvision.models.detection.ssd300_vgg16` for more details.
 
     Example:
 

From 36672e0404c63d4e207f6047faf91bbb3a3dc30a Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Tue, 18 May 2021 10:10:59 +0100
Subject: [PATCH 3/3] flake8

---
 torchvision/models/detection/faster_rcnn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/models/detection/faster_rcnn.py b/torchvision/models/detection/faster_rcnn.py
index 7137ca3d987..bf8df3b9377 100644
--- a/torchvision/models/detection/faster_rcnn.py
+++ b/torchvision/models/detection/faster_rcnn.py
@@ -401,7 +401,7 @@ def fasterrcnn_mobilenet_v3_large_320_fpn(pretrained=False, progress=True, num_c
                                           trainable_backbone_layers=None, **kwargs):
     """
     Constructs a low resolution Faster R-CNN model with a MobileNetV3-Large FPN backbone tunned for mobile use-cases.
-    It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See 
+    It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
     :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
     details.
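
As a rough illustration of the inference output format these patches document, the sketch below shows how the prediction dict of a Mask R-CNN model might be consumed: detections are filtered with a score threshold, as in the gallery example, and the soft masks are binarized at 0.5 as suggested in the ``maskrcnn_resnet50_fpn`` docstring. The input file name, the 0.8 threshold, and the plotting via ``draw_segmentation_masks`` are illustrative assumptions, not part of the patches::

    import torch

    from torchvision.io import read_image
    from torchvision.models.detection import maskrcnn_resnet50_fpn
    from torchvision.transforms.functional import convert_image_dtype
    from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks

    # Hypothetical input: any uint8 image tensor of shape [C, H, W] works here.
    img = read_image('dog1.jpg')

    model = maskrcnn_resnet50_fpn(pretrained=True)
    model = model.eval()

    with torch.no_grad():
        # The model expects a list of float images in the 0-1 range and
        # returns one Dict[str, Tensor] per input image.
        output = model([convert_image_dtype(img, torch.float)])[0]

    # Keep only the detections above an (assumed) score threshold.
    keep = output['scores'] > .8

    # Boxes can be drawn directly; the soft masks of shape [N, 1, H, W] are
    # thresholded at 0.5 and squeezed to [N, H, W] before drawing.
    with_boxes = draw_bounding_boxes(img, boxes=output['boxes'][keep], width=4)
    bool_masks = output['masks'][keep] >= 0.5
    with_masks = draw_segmentation_masks(img, masks=bool_masks.squeeze(1), alpha=0.7)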