Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove the FPN from the RetinaNet with MobileNetV3 backbone #3244

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/source/models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ Network box AP mask AP keypoint AP
================================ ======= ======== ===========
Faster R-CNN ResNet-50 FPN 37.0 - -
RetinaNet ResNet-50 FPN 36.4 - -
RetinaNet MobileNetV3-Large FPN 25.6 - -
RetinaNet MobileNetV3-Large 22.1 - -
Mask R-CNN ResNet-50 FPN 37.9 34.6 -
================================ ======= ======== ===========

Expand Down Expand Up @@ -420,7 +420,7 @@ Network train time (s / it) test time (s / it) memory
============================== =================== ================== ===========
Faster R-CNN ResNet-50 FPN 0.2288 0.0590 5.2
RetinaNet ResNet-50 FPN 0.2514 0.0939 4.1
RetinaNet MobileNetV3-Large FPN 0.0928 0.0547 1.4
RetinaNet MobileNetV3-Large 0.0873 0.0408 0.9
Mask R-CNN ResNet-50 FPN 0.2728 0.0903 5.4
Keypoint R-CNN ResNet-50 FPN 0.3789 0.1242 6.8
============================== =================== ================== ===========
Expand All @@ -436,7 +436,7 @@ RetinaNet
------------

.. autofunction:: torchvision.models.detection.retinanet_resnet50_fpn
.. autofunction:: torchvision.models.detection.retinanet_mobilenet_v3_large_fpn
.. autofunction:: torchvision.models.detection.retinanet_mobilenet_v3_large


Mask R-CNN
Expand Down
4 changes: 2 additions & 2 deletions references/detection/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
```


### RetinaNet with MobileNetV3 Large FPN
### RetinaNet with MobileNetV3 Large
```
python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\
--dataset coco --model retinanet_mobilenet_v3_large_fpn --epochs 26 --lr-steps 16 22\
--dataset coco --model retinanet_mobilenet_v3_large --epochs 26 --lr-steps 16 22\
--aspect-ratio-group-factor 3 --lr 0.01
```

Expand Down
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion test/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def get_available_video_models():
"maskrcnn_resnet50_fpn": lambda x: x[1],
"keypointrcnn_resnet50_fpn": lambda x: x[1],
"retinanet_resnet50_fpn": lambda x: x[1],
"retinanet_mobilenet_v3_large_fpn": lambda x: x[1],
"retinanet_mobilenet_v3_large": lambda x: x[1],
}


Expand Down
2 changes: 1 addition & 1 deletion test/test_models_detection_negative_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def test_forward_negative_sample_krcnn(self):
self.assertEqual(loss_dict["loss_keypoint"], torch.tensor(0.))

def test_forward_negative_sample_retinanet(self):
for name in ["retinanet_resnet50_fpn", "retinanet_mobilenet_v3_large_fpn"]:
for name in ["retinanet_resnet50_fpn", "retinanet_mobilenet_v3_large"]:
model = torchvision.models.detection.__dict__[name](
num_classes=2, min_size=100, max_size=100, pretrained_backbone=False)

Expand Down
25 changes: 11 additions & 14 deletions torchvision/models/detection/backbone_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,11 @@ def _validate_trainable_layers(pretrained, trainable_backbone_layers, max_value,
return trainable_backbone_layers


def mobilenet_fpn_backbone(
def mobilenet_backbone(
backbone_name,
pretrained,
norm_layer=misc_nn_ops.FrozenBatchNorm2d,
trainable_layers=2,
returned_layers=None,
extra_blocks=None
trainable_layers=2
):
backbone = mobilenet.__dict__[backbone_name](pretrained=pretrained, norm_layer=norm_layer).features

Expand All @@ -149,14 +147,13 @@ def mobilenet_fpn_backbone(
for parameter in b.parameters():
parameter.requires_grad_(False)

if extra_blocks is None:
extra_blocks = LastLevelMaxPool()

if returned_layers is None:
returned_layers = [num_stages - 2, num_stages - 1]
assert min(returned_layers) >= 0 and max(returned_layers) < num_stages
return_layers = {f'{stage_indeces[k]}': str(v) for v, k in enumerate(returned_layers)}

in_channels_list = [backbone[stage_indeces[i]].out_channels for i in returned_layers]
backbone_channels = backbone[-1].out_channels
out_channels = 256
return BackboneWithFPN(backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks)

m = nn.Sequential(
backbone,
# pointwise (1x1) linear combination of channels to reduce their number
nn.Conv2d(backbone_channels, out_channels, 1),
)
m.out_channels = out_channels
return m
35 changes: 18 additions & 17 deletions torchvision/models/detection/retinanet.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@
from . import _utils as det_utils
from .anchor_utils import AnchorGenerator
from .transform import GeneralizedRCNNTransform
from .backbone_utils import resnet_fpn_backbone, _validate_trainable_layers, mobilenet_fpn_backbone
from .backbone_utils import resnet_fpn_backbone, _validate_trainable_layers, mobilenet_backbone
from ...ops.feature_pyramid_network import LastLevelP6P7
from ...ops import sigmoid_focal_loss
from ...ops import boxes as box_ops


__all__ = [
"RetinaNet", "retinanet_resnet50_fpn", "retinanet_mobilenet_v3_large_fpn"
"RetinaNet", "retinanet_resnet50_fpn", "retinanet_mobilenet_v3_large"
]


Expand Down Expand Up @@ -559,8 +559,7 @@ def forward(self, images, targets=None):

# TODO: replace with pytorch links
model_urls = {
'retinanet_mobilenet_v3_large_fpn_coco':
'https://download.pytorch.org/models/retinanet_mobilenet_v3_large_fpn-41c847a4.pth',
'retinanet_mobilenet_v3_large_coco': None,
'retinanet_resnet50_fpn_coco':
'https://download.pytorch.org/models/retinanet_resnet50_fpn_coco-eeacb38b.pth',
}
Expand Down Expand Up @@ -627,15 +626,15 @@ def retinanet_resnet50_fpn(pretrained=False, progress=True,
return model


def retinanet_mobilenet_v3_large_fpn(pretrained=False, progress=True, num_classes=91, pretrained_backbone=True,
trainable_backbone_layers=None, **kwargs):
def retinanet_mobilenet_v3_large(pretrained=False, progress=True, num_classes=91, pretrained_backbone=True,
trainable_backbone_layers=None, min_size=320, max_size=640, **kwargs):
"""
Constructs a RetinaNet model with a MobileNetV3-Large-FPN backbone. It works similarly
Constructs a RetinaNet model with a MobileNetV3-Large backbone. It works similarly
to RetinaNet with ResNet-50-FPN backbone. See `retinanet_resnet50_fpn` for more details.

Example::

>>> model = torchvision.models.detection.retinanet_mobilenet_v3_large_fpn(pretrained=True)
>>> model = torchvision.models.detection.retinanet_mobilenet_v3_large(pretrained=True)
>>> model.eval()
>>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
>>> predictions = model(x)
Expand All @@ -647,22 +646,24 @@ def retinanet_mobilenet_v3_large_fpn(pretrained=False, progress=True, num_classe
pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet
trainable_backbone_layers (int): number of trainable (not frozen) backbone layers starting from final block.
Valid values are between 0 and 6, with 6 meaning all backbone layers are trainable.
min_size (int): minimum size of the image to be rescaled before feeding it to the backbone
max_size (int): maximum size of the image to be rescaled before feeding it to the backbone
"""
# check default parameters and by default set it to 3 if possible
# check default parameters and by default set it to 6 if possible
trainable_backbone_layers = _validate_trainable_layers(
pretrained or pretrained_backbone, trainable_backbone_layers, 6, 3)
pretrained or pretrained_backbone, trainable_backbone_layers, 6, 6)

if pretrained:
pretrained_backbone = False
backbone = mobilenet_fpn_backbone("mobilenet_v3_large", pretrained_backbone, returned_layers=[4, 5],
trainable_layers=trainable_backbone_layers)
backbone = mobilenet_backbone("mobilenet_v3_large", pretrained_backbone,
trainable_layers=trainable_backbone_layers)

anchor_sizes = ((128,), (256,), (512,))
aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
anchor_sizes = ((16, 32, 64, 128, 256,), )
aspect_ratios = ((0.5, 1.0, 2.0), )

model = RetinaNet(backbone, num_classes, anchor_generator=AnchorGenerator(anchor_sizes, aspect_ratios), **kwargs)
model = RetinaNet(backbone, num_classes, anchor_generator=AnchorGenerator(anchor_sizes, aspect_ratios),
min_size=min_size, max_size=max_size, **kwargs)
if pretrained:
state_dict = load_state_dict_from_url(model_urls['retinanet_mobilenet_v3_large_fpn_coco'],
progress=progress)
state_dict = load_state_dict_from_url(model_urls['retinanet_mobilenet_v3_large_coco'], progress=progress)
model.load_state_dict(state_dict)
return model