class AscendAssignResult(util_mixins.NiceRepr):
    """Container for the batched assignment between anchors and truth boxes.

    Each constructor argument is stored unchanged on the instance under the
    same name, so one object describes the assignment of a whole batch.

    Args:
        batch_num_gts (list[int]): The number of truth boxes considered.
        batch_pos_mask (IntTensor): Positive samples mask in all images.
        batch_neg_mask (IntTensor): Negative samples mask in all images.
        batch_max_overlaps (FloatTensor): The max overlaps of all bboxes
            and ground truth boxes.
        batch_anchor_gt_indes (None | LongTensor): The assigned truth
            box index of all anchors.
        batch_anchor_gt_labels (None | LongTensor): The gt labels of all
            anchors.
    """

    def __init__(self,
                 batch_num_gts,
                 batch_pos_mask,
                 batch_neg_mask,
                 batch_max_overlaps,
                 batch_anchor_gt_indes=None,
                 batch_anchor_gt_labels=None):
        # Interface for possible user-defined properties; starts out empty.
        self._extra_properties = {}

        # Optional per-anchor lookups; both may be None.
        self.batch_anchor_gt_labels = batch_anchor_gt_labels
        self.batch_anchor_gt_indes = batch_anchor_gt_indes

        # Per-anchor overlaps and the positive / negative sample masks.
        self.batch_max_overlaps = batch_max_overlaps
        self.batch_neg_mask = batch_neg_mask
        self.batch_pos_mask = batch_pos_mask

        # Number of truth boxes that were considered.
        self.batch_num_gts = batch_num_gts
def assign(self,
           batch_bboxes,
           batch_gt_bboxes,
           batch_gt_bboxes_ignore=None,
           batch_gt_labels=None,
           batch_bboxes_ignore_mask=None,
           batch_num_gts=None):
    """Assign gt to bboxes for all images of a batch at once.

    The overlaps between gt boxes and candidate boxes are computed with
    ``self.iou_calculator``, optionally masked, and then handed to
    ``self.batch_assign_wrt_overlaps`` which builds the result.

    Args:
        batch_bboxes (Tensor): Bounding boxes to be assigned,
            shape (b, n, 4).
        batch_gt_bboxes (Tensor): Ground truth boxes, shape (b, k, 4).
        batch_gt_bboxes_ignore (Tensor, optional): Ground truth bboxes
            that are labelled as `ignored`, e.g., crowd boxes in COCO.
            Only used when ``self.ignore_iof_thr > 0``.
        batch_gt_labels (Tensor, optional): Label of gt_bboxes,
            shape (b, k, ).
        batch_bboxes_ignore_mask (Tensor, optional): Mask over the
            candidate boxes, shape (b, n). When ``None`` no box masking
            is applied at all.
        batch_num_gts (optional): Number of gts per image, shape (b, ).
            Forwarded unchanged to ``batch_assign_wrt_overlaps``.

    Returns:
        :obj:`AscendAssignResult`: The assign result.
    """
    # The mask is optional in the signature, so every use must be guarded;
    # calling ``.unsqueeze`` on the ``None`` default would raise.
    has_box_mask = batch_bboxes_ignore_mask is not None

    batch_overlaps = self.iou_calculator(batch_gt_bboxes, batch_bboxes)
    if has_box_mask:
        # NOTE(review): with ``neg=True`` the helper presumably fills the
        # positions where the mask is zero -- confirm against masked_fill.
        batch_overlaps = masked_fill(
            batch_overlaps,
            batch_bboxes_ignore_mask.unsqueeze(1).float(),
            -1,
            neg=True)

    if self.ignore_iof_thr > 0 and batch_gt_bboxes_ignore is not None:
        if self.ignore_wrt_candidates:
            batch_ignore_overlaps = self.iou_calculator(
                batch_bboxes, batch_gt_bboxes_ignore, mode='iof')
            if has_box_mask:
                batch_ignore_overlaps = masked_fill(
                    batch_ignore_overlaps, batch_bboxes_ignore_mask, -1)
            batch_ignore_max_overlaps, _ = batch_ignore_overlaps.max(dim=2)
        else:
            batch_ignore_overlaps = self.iou_calculator(
                batch_gt_bboxes_ignore, batch_bboxes, mode='iof')
            if has_box_mask:
                batch_ignore_overlaps = masked_fill(
                    batch_ignore_overlaps, batch_bboxes_ignore_mask, -1)
            batch_ignore_max_overlaps, _ = batch_ignore_overlaps.max(dim=1)
        # Boxes overlapping an ignored region too much get overlap -1.
        batch_ignore_mask = batch_ignore_max_overlaps > self.ignore_iof_thr
        batch_overlaps = masked_fill(batch_overlaps, batch_ignore_mask, -1)

    return self.batch_assign_wrt_overlaps(batch_overlaps, batch_gt_labels,
                                          batch_num_gts)
+ else: + batch_neg_mask = torch.zeros( + batch_max_overlaps.size(), + dtype=torch.int, + device=batch_max_overlaps.device) + batch_pos_mask = (batch_max_overlaps >= self.pos_iou_thr).int() + if self.match_low_quality: + batch_gt_max_overlaps, batch_gt_argmax_overlaps = \ + batch_overlaps.max(dim=2) + batch_index_bool = (batch_gt_max_overlaps >= self.min_pos_iou) & \ + (batch_gt_max_overlaps > 0) + if self.gt_max_assign_all: + pos_inds_low_quality = \ + (batch_overlaps == batch_gt_max_overlaps.unsqueeze(2)) & \ + batch_index_bool.unsqueeze(2) + for i in range(num_gts): + pos_inds_low_quality_gt = pos_inds_low_quality[:, i, :] + batch_argmax_overlaps[pos_inds_low_quality_gt] = i + batch_pos_mask[pos_inds_low_quality_gt] = 1 + else: + index_temp = torch.arange( + 0, num_gts, device=batch_max_overlaps.device) + for index_image in range(num_images): + gt_argmax_overlaps = batch_gt_argmax_overlaps[index_image] + index_bool = batch_index_bool[index_image] + pos_inds_low_quality = gt_argmax_overlaps[index_bool] + batch_argmax_overlaps[index_image][pos_inds_low_quality] \ + = index_temp[index_bool] + batch_pos_mask[index_image][pos_inds_low_quality] = 1 + batch_neg_mask = batch_neg_mask * (1 - batch_pos_mask) + if batch_gt_labels is not None: + batch_anchor_gt_labels = torch.zeros((num_images, num_bboxes), + dtype=batch_gt_labels.dtype, + device=batch_gt_labels.device) + for index_image in range(num_images): + batch_anchor_gt_labels[index_image] = torch.index_select( + batch_gt_labels[index_image], 0, + batch_argmax_overlaps[index_image]) + else: + batch_anchor_gt_labels = None + return AscendAssignResult(batch_num_gts, batch_pos_mask, + batch_neg_mask, batch_max_overlaps, + batch_argmax_overlaps, + batch_anchor_gt_labels) diff --git a/mmdet/models/dense_heads/__init__.py b/mmdet/models/dense_heads/__init__.py index 1c2286996e7..9c60ae14796 100644 --- a/mmdet/models/dense_heads/__init__.py +++ b/mmdet/models/dense_heads/__init__.py @@ -1,6 +1,9 @@ # Copyright (c) 
OpenMMLab. All rights reserved. from .anchor_free_head import AnchorFreeHead from .anchor_head import AnchorHead +from .ascend_anchor_head import AscendAnchorHead +from .ascend_retina_head import AscendRetinaHead +from .ascend_ssd_head import AscendSSDHead from .atss_head import ATSSHead from .autoassign_head import AutoAssignHead from .cascade_rpn_head import CascadeRPNHead, StageCascadeRPNHead @@ -54,5 +57,6 @@ 'DETRHead', 'YOLOFHead', 'DeformableDETRHead', 'SOLOHead', 'DecoupledSOLOHead', 'CenterNetHead', 'YOLOXHead', 'DecoupledSOLOLightHead', 'LADHead', 'TOODHead', 'MaskFormerHead', - 'Mask2FormerHead', 'SOLOV2Head', 'DDODHead' + 'Mask2FormerHead', 'SOLOV2Head', 'DDODHead', 'AscendAnchorHead', + 'AscendRetinaHead', 'AscendSSDHead' ] diff --git a/mmdet/models/dense_heads/ascend_anchor_head.py b/mmdet/models/dense_heads/ascend_anchor_head.py new file mode 100644 index 00000000000..7d100ba9218 --- /dev/null +++ b/mmdet/models/dense_heads/ascend_anchor_head.py @@ -0,0 +1,389 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from ...core.bbox.assigners import AscendMaxIoUAssigner +from ...core.bbox.samplers import PseudoSampler +from ...utils import (batch_images_to_levels, get_max_num_gt_division_factor, + masked_fill) +from ..builder import HEADS +from .anchor_head import AnchorHead + + +@HEADS.register_module() +class AscendAnchorHead(AnchorHead): + """Ascend Anchor-based head (RetinaNet, SSD, etc.). + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. Used in child classes. + anchor_generator (dict): Config dict for anchor generator + bbox_coder (dict): Config of bounding box coder. + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. 
def get_batch_gt_bboxes(self, gt_bboxes_list, num_images, gt_nums, device,
                        max_gt_labels):
    """Get the ground truth bboxes of all images as one padded tensor.

    A padding template is cached on the instance per ``max_gt_labels`` so
    that it is not rebuilt on every call. The cached template is only
    reused when its shape, dtype and device type still match the request;
    otherwise it is rebuilt, so a batch with a different number of images
    never receives a stale tensor.

    Args:
        gt_bboxes_list (list[Tensor] | None): Ground truth bboxes of each
            image.
        num_images (int): The num of images.
        gt_nums (list[int]): The ground truth bboxes num of each image.
        device (torch.device | str): Device for returned tensors.
        max_gt_labels (int): The max ground truth bboxes num of all image.

    Returns:
        Tensor | None: Ground truth bboxes of all image with shape
        (num_images, max_gt_labels, 4); unused slots hold the filler box
        built from ``self.min_anchor``. ``None`` if ``gt_bboxes_list`` is
        ``None``.
    """
    # Static padding templates, keyed by the padded gt count.
    if not hasattr(self, 'batch_gt_bboxes'):
        self.batch_gt_bboxes = {}
    # Filler coordinates written into the slots that hold no real gt box.
    if not hasattr(self, 'min_anchor'):
        self.min_anchor = (-1354, -1344)
    if gt_bboxes_list is None:
        return None

    dtype = gt_bboxes_list[0].dtype
    expected_shape = (num_images, max_gt_labels, 4)
    template = self.batch_gt_bboxes.get(max_gt_labels)
    # Only the device *type* is compared: a request for 'cuda' and a tensor
    # living on 'cuda:0' must not invalidate the cache on every call.
    stale = (
        template is None or tuple(template.shape) != expected_shape
        or template.dtype != dtype
        or template.device.type != torch.device(device).type)
    if stale:
        template = torch.zeros(expected_shape, dtype=dtype, device=device)
        template[:, :, :2] = self.min_anchor[0]
        template[:, :, 2:] = self.min_anchor[1]
        self.batch_gt_bboxes[max_gt_labels] = template

    # Work on a copy so that the cached template stays pristine.
    batch_gt_bboxes = template.clone()
    for index_imgs, gt_bboxes in enumerate(gt_bboxes_list):
        batch_gt_bboxes[index_imgs, :gt_nums[index_imgs]] = gt_bboxes
    return batch_gt_bboxes
def _get_targets_concat(self,
                        batch_anchors,
                        batch_valid_flags,
                        batch_gt_bboxes,
                        batch_gt_bboxes_ignore,
                        batch_gt_labels,
                        img_metas,
                        label_channels=1,
                        unmap_outputs=True):
    """Build classification and regression targets for a whole batch.

    The anchors of all images are assigned in one call to
    ``self.assigner.assign``; the resulting positive / negative masks are
    then used to fill the label and bbox target tensors.

    Args:
        batch_anchors (Tensor): Anchors of all images, concatenated into
            a single tensor of shape (num_imgs, num_anchors, 4).
        batch_valid_flags (Tensor): Valid flags of all images, shape
            (num_imgs, num_anchors,). Passed to the assigner as
            ``batch_bboxes_ignore_mask``.
        batch_gt_bboxes (Tensor): Ground truth bboxes of all images,
            shape (num_imgs, max_gt_nums, 4).
        batch_gt_bboxes_ignore (Tensor): Ground truth bboxes to be
            ignored, shape (num_imgs, num_ignored_gts, 4).
        batch_gt_labels (Tensor): Ground truth labels of each box,
            shape (num_imgs, max_gt_nums,).
        img_metas (list[dict]): Meta info of each image. Not read here.
        label_channels (int): Channel of label. Not read here.
        unmap_outputs (bool): Whether to map outputs back to the original
            set of anchors. Not read here.

    Returns:
        tuple:
            batch_labels (Tensor): Labels of all anchors.
            batch_label_weights (Tensor): Label weights of all anchors.
            batch_bbox_targets (Tensor): BBox targets of all anchors.
            batch_bbox_weights (Tensor): BBox weights of all anchors.
            batch_pos_mask (Tensor): Positive samples mask in all images.
            batch_neg_mask (Tensor): Negative samples mask in all images.
            sampling_result: Always ``None`` (sampling is not supported
                yet).
    """
    num_imgs, num_anchors, _ = batch_anchors.size()

    # Assign gts to every anchor of every image in a single call.
    labels_for_assigner = None if self.sampling else batch_gt_labels
    assign_result = self.assigner.assign(
        batch_anchors,
        batch_gt_bboxes,
        batch_gt_bboxes_ignore,
        labels_for_assigner,
        batch_bboxes_ignore_mask=batch_valid_flags)
    pos_mask = assign_result.batch_pos_mask
    neg_mask = assign_result.batch_neg_mask
    gt_inds = assign_result.batch_anchor_gt_indes
    anchor_gt_labels = assign_result.batch_anchor_gt_labels
    # TODO: support sampling_result
    sampling_result = None

    # Look up, image by image, the gt box every anchor was matched to.
    matched_gt_bboxes = torch.zeros(
        batch_anchors.size(),
        dtype=batch_anchors.dtype,
        device=batch_anchors.device)
    for img_idx in range(num_imgs):
        matched_gt_bboxes[img_idx] = torch.index_select(
            batch_gt_bboxes[img_idx], 0, gt_inds[img_idx])

    # Regression targets: either the decoded gt boxes or encoded deltas.
    if self.reg_decoded_bbox:
        pos_bbox_targets = matched_gt_bboxes
    else:
        pos_bbox_targets = self.bbox_coder.encode(batch_anchors,
                                                  matched_gt_bboxes)

    pos_mask_3d = pos_mask.unsqueeze(2)
    bbox_targets = masked_fill(
        torch.zeros_like(batch_anchors), pos_mask_3d, pos_bbox_targets)
    bbox_weights = masked_fill(
        torch.zeros_like(batch_anchors), pos_mask_3d, 1.0)

    # Labels start as ``self.num_classes`` everywhere; positives are then
    # overwritten with 0.0 (no gt labels given) or the matched gt label.
    labels = batch_anchors.new_full((num_imgs, num_anchors),
                                    self.num_classes,
                                    dtype=torch.int)
    label_fill = 0.0 if batch_gt_labels is None else anchor_gt_labels
    labels = masked_fill(labels, pos_mask, label_fill)

    # A non-positive configured weight means positives are weighted 1.0.
    pos_weight = self.train_cfg.pos_weight
    if pos_weight <= 0:
        pos_weight = 1.0
    label_weights = batch_anchors.new_zeros((num_imgs, num_anchors),
                                            dtype=torch.float)
    label_weights = masked_fill(label_weights, pos_mask, pos_weight)
    label_weights = masked_fill(label_weights, neg_mask, 1.0)

    return (labels, label_weights, bbox_targets, bbox_weights, pos_mask,
            neg_mask, sampling_result)
+ - num_total_neg (int): Number of negative samples in all + images. + + additional_returns: This function enables user-defined returns from + `self._get_targets_single`. These returns are currently refined + to properties at each feature map (i.e. having HxW dimension). + The results will be concatenated after the end + """ + assert gt_bboxes_ignore_list is None + assert unmap_outputs is True + assert return_sampling_results is False + assert self.train_cfg.allowed_border < 0 + assert isinstance(self.assigner, AscendMaxIoUAssigner) + assert isinstance(self.sampler, PseudoSampler) + num_imgs = len(img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + device = anchor_list[0][0].device + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + + batch_anchor_list = [] + batch_valid_flag_list = [] + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + batch_anchor_list.append(torch.cat(anchor_list[i])) + batch_valid_flag_list.append(torch.cat(valid_flag_list[i])) + batch_anchors = torch.cat( + [torch.unsqueeze(anchor, 0) for anchor in batch_anchor_list], 0) + batch_valid_flags = torch.cat([ + torch.unsqueeze(batch_valid_flag, 0) + for batch_valid_flag in batch_valid_flag_list + ], 0) + + gt_nums = [len(gt_bbox) for gt_bbox in gt_bboxes_list] + max_gt_nums = get_max_num_gt_division_factor(gt_nums) + batch_gt_bboxes = self.get_batch_gt_bboxes(gt_bboxes_list, num_imgs, + gt_nums, device, + max_gt_nums) + batch_gt_bboxes_ignore = self.get_batch_gt_bboxes_ignore( + gt_bboxes_ignore_list, num_imgs, gt_nums, device) + batch_gt_labels = self.get_batch_gt_labels(gt_labels_list, num_imgs, + gt_nums, device, + max_gt_nums) + + results = self._get_targets_concat( + batch_anchors, + batch_valid_flags, + batch_gt_bboxes, + batch_gt_bboxes_ignore, + batch_gt_labels, + img_metas, + label_channels=label_channels, + unmap_outputs=unmap_outputs) + + (batch_labels, batch_label_weights, batch_bbox_targets, + 
@HEADS.register_module()
class AscendRetinaHead(RetinaHead, AscendAnchorHead):
    r"""An anchor-based head used in `RetinaNet
    <https://arxiv.org/pdf/1708.02002.pdf>`_.

    The head contains two subnetworks. The first classifies anchor boxes and
    the second regresses deltas for the anchors. Construction is forwarded
    to the parent classes, while target computation is explicitly routed to
    :class:`AscendAnchorHead`.

    Example:
        >>> import torch
        >>> self = RetinaHead(11, 7)
        >>> x = torch.rand(1, 7, 32, 32)
        >>> cls_score, bbox_pred = self.forward_single(x)
        >>> # Each anchor predicts a score for each class except background
        >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors
        >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors
        >>> assert cls_per_anchor == (self.num_classes)
        >>> assert box_per_anchor == 4
    """

    def __init__(self,
                 num_classes,
                 in_channels,
                 stacked_convs=4,
                 conv_cfg=None,
                 norm_cfg=None,
                 anchor_generator=dict(
                     type='AnchorGenerator',
                     octave_base_scale=4,
                     scales_per_octave=3,
                     ratios=[0.5, 1.0, 2.0],
                     strides=[8, 16, 32, 64, 128]),
                 init_cfg=dict(
                     type='Normal',
                     layer='Conv2d',
                     std=0.01,
                     override=dict(
                         type='Normal',
                         name='retina_cls',
                         std=0.01,
                         bias_prob=0.01)),
                 **kwargs):
        # Nothing Ascend-specific happens at construction time; every
        # argument is handed on unchanged.
        super().__init__(
            num_classes=num_classes,
            in_channels=in_channels,
            stacked_convs=stacked_convs,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            anchor_generator=anchor_generator,
            init_cfg=init_cfg,
            **kwargs)

    def get_targets(self,
                    anchor_list,
                    valid_flag_list,
                    gt_bboxes_list,
                    img_metas,
                    gt_bboxes_ignore_list=None,
                    gt_labels_list=None,
                    label_channels=1,
                    unmap_outputs=True,
                    return_sampling_results=False,
                    return_level=True):
        """Compute regression and classification targets for anchors in
        multiple images.

        All arguments are passed on unchanged to
        ``AscendAnchorHead.get_targets`` and its result is returned as is.

        Args:
            anchor_list (list[list[Tensor]]): Multi level anchors of each
                image. The outer list indicates images, and the inner list
                corresponds to feature levels of the image. Each element of
                the inner list is a tensor of shape (num_anchors, 4).
            valid_flag_list (list[list[Tensor]]): Multi level valid flags
                of each image, laid out like ``anchor_list``; each element
                of the inner list has shape (num_anchors, ).
            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each
                image.
            img_metas (list[dict]): Meta info of each image.
            gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be
                ignored.
            gt_labels_list (list[Tensor]): Ground truth labels of each box.
            label_channels (int): Channel of label.
            unmap_outputs (bool): Whether to map outputs back to the
                original set of anchors.
            return_sampling_results (bool): Whether to return the result of
                sample.
            return_level (bool): Whether to map outputs back to the levels
                of feature map sizes.

        Returns:
            tuple: The learning targets produced by
            ``AscendAnchorHead.get_targets``.
        """
        return AscendAnchorHead.get_targets(
            self,
            anchor_list,
            valid_flag_list,
            gt_bboxes_list,
            img_metas,
            gt_bboxes_ignore_list=gt_bboxes_ignore_list,
            gt_labels_list=gt_labels_list,
            label_channels=label_channels,
            unmap_outputs=unmap_outputs,
            return_sampling_results=return_sampling_results,
            return_level=return_level)
+import torch +import torch.nn.functional as F +from mmcv.runner import force_fp32 + +from ..builder import HEADS +from ..losses import smooth_l1_loss +from .ascend_anchor_head import AscendAnchorHead +from .ssd_head import SSDHead + + +@HEADS.register_module() +class AscendSSDHead(SSDHead, AscendAnchorHead): + """Ascend SSD head used in https://arxiv.org/abs/1512.02325. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + stacked_convs (int): Number of conv layers in cls and reg tower. + Default: 0. + feat_channels (int): Number of hidden channels when stacked_convs + > 0. Default: 256. + use_depthwise (bool): Whether to use DepthwiseSeparableConv. + Default: False. + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: None. + act_cfg (dict): Dictionary to construct and config activation layer. + Default: None. + anchor_generator (dict): Config dict for anchor generator + bbox_coder (dict): Config of bounding box coder. + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Default False. It should be `True` when + using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head. + train_cfg (dict): Training config of anchor head. + test_cfg (dict): Testing config of anchor head. + init_cfg (dict or list[dict], optional): Initialization config dict. 
def get_static_anchors(self, featmap_sizes, img_metas, device='cuda'):
    """Return the anchors and valid flags, computing them only once.

    The first call delegates to ``self.get_anchors`` and stores both
    results on the instance. Later calls return the stored values and do
    not look at their arguments again.

    Args:
        featmap_sizes (list[tuple]): Multi-level feature map sizes.
        img_metas (list[dict]): Image meta info.
        device (torch.device | str): Device for returned tensors.

    Returns:
        tuple:
            anchor_list (list[Tensor]): Anchors of each image.
            valid_flag_list (list[Tensor]): Valid flags of each image.
    """
    already_cached = (
        hasattr(self, 'static_anchors')
        and hasattr(self, 'static_valid_flags'))
    if not already_cached:
        # Both values are (re)computed together, even if only one of the
        # two attributes is missing.
        anchors, valid_flags = self.get_anchors(featmap_sizes, img_metas,
                                                device)
        self.static_anchors = anchors
        self.static_valid_flags = valid_flags
    return self.static_anchors, self.static_valid_flags
+ - bbox_weights_list (list[Tensor]): BBox weights of each level. + - num_total_pos (int): Number of positive samples in all + images. + - num_total_neg (int): Number of negative samples in all + images. + + additional_returns: This function enables user-defined returns from + `self._get_targets_single`. These returns are currently refined + to properties at each feature map (i.e. having HxW dimension). + The results will be concatenated after the end + """ + return AscendAnchorHead.get_targets( + self, + anchor_list, + valid_flag_list, + gt_bboxes_list, + img_metas, + gt_bboxes_ignore_list, + gt_labels_list, + label_channels, + unmap_outputs, + return_sampling_results, + return_level, + ) + + def batch_loss(self, batch_cls_score, batch_bbox_pred, batch_anchor, + batch_labels, batch_label_weights, batch_bbox_targets, + batch_bbox_weights, batch_pos_mask, batch_neg_mask, + num_total_samples): + """Compute loss of all images. + + Args: + batch_cls_score (Tensor): Box scores for all image + Has shape (num_imgs, num_total_anchors, num_classes). + batch_bbox_pred (Tensor): Box energies / deltas for all image + level with shape (num_imgs, num_total_anchors, 4). + batch_anchor (Tensor): Box reference for all image with shape + (num_imgs, num_total_anchors, 4). + batch_labels (Tensor): Labels of all anchors with shape + (num_imgs, num_total_anchors,). + batch_label_weights (Tensor): Label weights of all anchor with + shape (num_imgs, num_total_anchors,) + batch_bbox_targets (Tensor): BBox regression targets of all anchor + weight shape (num_imgs, num_total_anchors, 4). + batch_bbox_weights (Tensor): BBox regression loss weights of + all anchor with shape (num_imgs, num_total_anchors, 4). + batch_pos_mask (Tensor): Positive samples mask in all images. + batch_neg_mask (Tensor): negative samples mask in all images. + num_total_samples (int): If sampling, num total samples equal to + the number of total anchors; Otherwise, it is the number of + positive anchors. 
+ + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + num_images, num_anchors, _ = batch_anchor.size() + + batch_loss_cls_all = F.cross_entropy( + batch_cls_score.view((-1, self.cls_out_channels)), + batch_labels.view(-1), + reduction='none').view( + batch_label_weights.size()) * batch_label_weights + # # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + batch_num_pos_samples = torch.sum(batch_pos_mask, dim=1) + batch_num_neg_samples = \ + self.train_cfg.neg_pos_ratio * batch_num_pos_samples + + batch_num_neg_samples_max = torch.sum(batch_neg_mask, dim=1) + batch_num_neg_samples = torch.min(batch_num_neg_samples, + batch_num_neg_samples_max) + + batch_topk_loss_cls_neg, _ = torch.topk( + batch_loss_cls_all * batch_neg_mask, k=num_anchors, dim=1) + batch_loss_cls_pos = torch.sum( + batch_loss_cls_all * batch_pos_mask, dim=1) + + anchor_index = torch.arange( + end=num_anchors, dtype=torch.float, + device=batch_anchor.device).view((1, -1)) + topk_loss_neg_mask = (anchor_index < batch_num_neg_samples.view( + -1, 1)).float() + + batch_loss_cls_neg = torch.sum( + batch_topk_loss_cls_neg * topk_loss_neg_mask, dim=1) + loss_cls = \ + (batch_loss_cls_pos + batch_loss_cls_neg) / num_total_samples + + if self.reg_decoded_bbox: + # TODO: support self.reg_decoded_bbox is True + raise RuntimeError + + loss_bbox_all = smooth_l1_loss( + batch_bbox_pred, + batch_bbox_targets, + batch_bbox_weights, + reduction='none', + beta=self.train_cfg.smoothl1_beta, + avg_factor=num_total_samples) + eps = torch.finfo(torch.float32).eps + + sum_dim = (i for i in range(1, len(loss_bbox_all.size()))) + loss_bbox = loss_bbox_all.sum(tuple(sum_dim)) / ( + num_total_samples + eps) + return loss_cls[None], loss_bbox + + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. 
+ + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + gt_bboxes (list[Tensor]): each item are the truth boxes for each + image in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=1, + unmap_outputs=True, + return_level=False) + if cls_reg_targets is None: + return None + + (batch_labels, batch_label_weights, batch_bbox_targets, + batch_bbox_weights, batch_pos_mask, batch_neg_mask, sampling_result, + num_total_pos, num_total_neg, batch_anchors) = cls_reg_targets + + num_imgs = len(img_metas) + batch_cls_score = torch.cat([ + s.permute(0, 2, 3, 1).reshape(num_imgs, -1, self.cls_out_channels) + for s in cls_scores + ], 1) + + batch_bbox_pred = torch.cat([ + b.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) for b in bbox_preds + ], -2) + + batch_losses_cls, batch_losses_bbox = self.batch_loss( + batch_cls_score, batch_bbox_pred, batch_anchors, batch_labels, + batch_label_weights, batch_bbox_targets, batch_bbox_weights, + batch_pos_mask, batch_neg_mask, num_total_pos) + losses_cls = [ + batch_losses_cls[:, index_imgs] for 
# Copyright (c) OpenMMLab. All rights reserved.
import torch


def masked_fill(ori_tensor, mask, new_value, neg=False):
    """Blend ``ori_tensor`` and ``new_value`` arithmetically by ``mask``.

    The selection is done with multiplications and additions only, i.e.
    ``ori * (1 - mask) + new * mask`` (or the opposite when ``neg`` is
    true), so ``mask`` is expected to hold 0/1 values.

    Args:
        ori_tensor (Tensor): Input tensor.
        mask (Tensor | None): 0/1 mask. Boolean masks are accepted and
            converted to integers first. If None, ``ori_tensor`` is
            returned unchanged.
        new_value (Tensor | scalar): Value written into the result.
        neg (bool): If False, ``new_value`` is used where ``mask`` is 1.
            If True, ``new_value`` is used where ``mask`` is 0.

    Returns:
        Tensor: The blended tensor (``ori_tensor`` itself when ``mask`` is
        None).
    """
    if mask is None:
        return ori_tensor
    if isinstance(mask, torch.Tensor) and mask.dtype == torch.bool:
        # ``1 - mask`` is not defined for bool tensors in PyTorch.
        mask = mask.int()
    if neg:
        return ori_tensor * mask + new_value * (1 - mask)
    return ori_tensor * (1 - mask) + new_value * mask


def batch_images_to_levels(target, num_levels):
    """Convert targets by image to targets by feature level.

    [target_img0, target_img1] -> [target_level0, target_level1, ...] or
    target_imgs -> [target_level0, target_level1, ...]

    Args:
        target (Tensor | List[Tensor]): Batched targets, or a list of
            per-image targets that is stacked along a new first dimension.
        num_levels (List[int]): Number of entries of each level along the
            second dimension.

    Returns:
        list[Tensor]: One slice ``target[:, start:end]`` per level; the
        leading (image) dimension is kept.
    """
    if not isinstance(target, torch.Tensor):
        target = torch.stack(target, 0)
    level_targets = []
    start = 0
    for n in num_levels:
        end = start + n
        level_targets.append(target[:, start:end])
        start = end
    return level_targets


def get_max_num_gt_division_factor(gt_nums,
                                   min_num_gt=32,
                                   max_num_gt=1024,
                                   division_factor=2):
    """Round the largest gt count up to an aligned size.

    Starting from ``min_num_gt``, the size is multiplied by
    ``division_factor`` until it is not smaller than ``max(gt_nums)``.

    Args:
        gt_nums (List[int]): Ground truth bboxes num of images. An empty
            list is treated as "no ground truth".
        min_num_gt (int): Min num of ground truth bboxes.
        max_num_gt (int): Max num of ground truth bboxes.
        division_factor (int): Growth factor between two aligned sizes.

    Returns:
        int: The aligned max num of ground truth bboxes.

    Raises:
        ValueError: If the size has to grow but multiplying it by
            ``division_factor`` does not increase it (this used to loop
            forever).
        RuntimeError: If the aligned size exceeds ``max_num_gt``.
    """
    max_gt_nums = max(gt_nums, default=0)
    max_gt_nums_align = min_num_gt
    while max_gt_nums_align < max_gt_nums:
        grown = max_gt_nums_align * division_factor
        if grown <= max_gt_nums_align:
            raise ValueError(
                f'Cannot grow aligned gt num {max_gt_nums_align} with '
                f'division_factor={division_factor}.')
        max_gt_nums_align = grown
    if max_gt_nums_align > max_num_gt:
        raise RuntimeError(
            f'Aligned gt num {max_gt_nums_align} (for max gt num '
            f'{max_gt_nums}) exceeds max_num_gt={max_num_gt}.')
    return max_gt_nums_align
import mmcv
import torch

from mmdet.models.dense_heads import (AscendAnchorHead, AscendRetinaHead,
                                      AscendSSDHead)


def _max_iou_train_cfg():
    """Build the train config shared by the anchor and retina head tests."""
    return mmcv.Config(
        dict(
            assigner=dict(
                type='AscendMaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.4,
                min_pos_iou=0,
                ignore_iof_thr=-1),
            allowed_border=-1,
            pos_weight=-1,
            debug=False))


def _check_losses(head, cls_scores, bbox_preds, img_metas,
                  strict_empty_cls=True):
    """Check ``head.loss`` with empty and with one-box ground truth."""
    num_imgs = len(img_metas)
    gt_bboxes_ignore = None

    # Empty ground truth: there must be no box loss.
    gt_bboxes = [torch.empty((0, 4)) for _ in range(num_imgs)]
    gt_labels = [torch.LongTensor([]) for _ in range(num_imgs)]
    empty_gt_losses = head.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
                                img_metas, gt_bboxes_ignore)
    empty_cls_loss = sum(empty_gt_losses['loss_cls'])
    empty_box_loss = sum(empty_gt_losses['loss_bbox'])
    if strict_empty_cls:
        assert empty_cls_loss.item() > 0, 'cls loss should be non-zero'
    else:
        assert empty_cls_loss.item() >= 0, 'cls loss should be non-negative'
    assert empty_box_loss.item() == 0, (
        'there should be no box loss when there are no true boxes')

    # When truth is non-empty then both cls and box loss should be nonzero
    # for random inputs
    gt_bboxes = [
        torch.Tensor([[23.6667, 23.8757, 238.6326, 151.8874]])
        for _ in range(num_imgs)
    ]
    gt_labels = [torch.LongTensor([2]) for _ in range(num_imgs)]
    one_gt_losses = head.loss(cls_scores, bbox_preds, gt_bboxes, gt_labels,
                              img_metas, gt_bboxes_ignore)
    onegt_cls_loss = sum(one_gt_losses['loss_cls'])
    onegt_box_loss = sum(one_gt_losses['loss_bbox'])
    assert onegt_cls_loss.item() > 0, 'cls loss should be non-zero'
    assert onegt_box_loss.item() > 0, 'box loss should be non-zero'


def test_ascend_anchor_head_loss():
    """Tests AscendAnchorHead loss when truth is empty and non-empty."""
    s = 256
    img_metas = [{
        'img_shape': (s, s, 3),
        'scale_factor': 1,
        'pad_shape': (s, s, 3)
    }]

    head = AscendAnchorHead(
        num_classes=4, in_channels=1, train_cfg=_max_iou_train_cfg())

    # Anchor head expects a multiple levels of features per image
    feat = [
        torch.rand(1, 1, s // (2**(i + 2)), s // (2**(i + 2)))
        for i in range(len(head.prior_generator.strides))
    ]
    cls_scores, bbox_preds = head.forward(feat)

    # With no truth the cls loss should be nonzero (the network is pushed
    # towards background) but there should be no box loss.
    _check_losses(head, cls_scores, bbox_preds, img_metas)


def test_ascend_retina_head_loss():
    """Tests AscendRetinaHead loss when truth is empty and non-empty."""
    img_shape = (800, 1067, 3)
    pad_shape = (800, 1088, 3)
    num_classes = 80
    in_channels = 256

    img_metas = [{
        'img_shape': img_shape,
        'scale_factor': 1,
        'pad_shape': pad_shape
    }]

    head = AscendRetinaHead(
        num_classes=num_classes,
        in_channels=in_channels,
        train_cfg=_max_iou_train_cfg())

    # Anchor head expects a multiple levels of features per image
    feat = [
        torch.rand(1, in_channels, pad_shape[0] // strides[0],
                   pad_shape[1] // strides[1])
        for strides in head.prior_generator.strides
    ]
    cls_scores, bbox_preds = head.forward(feat)

    # With no truth the cls loss should be nonzero (the network is pushed
    # towards background) but there should be no box loss.
    _check_losses(head, cls_scores, bbox_preds, img_metas)


def test_ascend_ssd_head_loss():
    """Tests AscendSSDHead loss when truth is empty and non-empty."""
    img_shape = (320, 320, 3)
    pad_shape = (320, 320, 3)
    in_channels = (96, 1280, 512, 256, 256, 128)
    img_metas = [{
        'img_shape': img_shape,
        'scale_factor': 1,
        'pad_shape': pad_shape
    }, {
        'img_shape': img_shape,
        'scale_factor': 1,
        'pad_shape': pad_shape
    }]

    head = AscendSSDHead(
        in_channels=in_channels,
        num_classes=80,
        use_depthwise=True,
        norm_cfg=dict(type='BN', eps=0.001, momentum=0.03),
        act_cfg=dict(type='ReLU6'),
        init_cfg=dict(type='Normal', layer='Conv2d', std=0.001),
        anchor_generator=dict(
            type='SSDAnchorGenerator',
            scale_major=False,
            strides=[16, 32, 64, 107, 160, 320],
            ratios=[[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]],
            min_sizes=[48, 100, 150, 202, 253, 304],
            max_sizes=[100, 150, 202, 253, 304, 320]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[0.1, 0.1, 0.2, 0.2]),
        train_cfg=mmcv.Config(
            dict(
                assigner=dict(
                    type='AscendMaxIoUAssigner',
                    pos_iou_thr=0.5,
                    neg_iou_thr=0.5,
                    min_pos_iou=0.,
                    ignore_iof_thr=-1,
                    gt_max_assign_all=False),
                smoothl1_beta=1.,
                allowed_border=-1,
                pos_weight=-1,
                neg_pos_ratio=3,
                debug=False)))

    # Anchor head expects a multiple levels of features per image
    feat = [
        torch.rand(2, channels, round(pad_shape[0] / strides[0]),
                   round(pad_shape[1] / strides[1]))
        for channels, strides in zip(in_channels,
                                     head.prior_generator.strides)
    ]
    cls_scores, bbox_preds = head.forward(feat)

    # The SSD head mines negatives relative to the number of positives, so
    # with no truth the cls loss is only required to be non-negative here.
    # NOTE(review): confirm whether it is expected to be exactly zero.
    _check_losses(
        head, cls_scores, bbox_preds, img_metas, strict_empty_cls=False)
import torch

from mmdet.core.bbox.assigners import AscendMaxIoUAssigner


def test_ascend_max_iou_assigner():
    """Check positive mask, gt indices and gt labels of a batched assign."""
    assigner = AscendMaxIoUAssigner(pos_iou_thr=0.5, neg_iou_thr=0.5)

    # One image with four candidate boxes and two ground truth boxes.
    boxes = torch.FloatTensor([[
        [0, 0, 10, 10],
        [10, 10, 20, 20],
        [5, 5, 15, 15],
        [32, 32, 38, 42],
    ]])
    gt_boxes = torch.FloatTensor([[
        [0, 0, 10, 9],
        [0, 10, 10, 19],
    ]])
    gt_labels = torch.LongTensor([[2, 3]])
    ignore_mask = torch.IntTensor([[1, 1, 1, 1]])

    result = assigner.assign(
        boxes,
        gt_boxes,
        batch_gt_labels=gt_labels,
        batch_bboxes_ignore_mask=ignore_mask)

    want_pos_mask = torch.IntTensor([1, 0, 1, 0])
    want_gt_indes = torch.IntTensor([0, 0, 1, 0])
    want_gt_labels = torch.IntTensor([2, 0, 3, 0])

    pos_mask = result.batch_pos_mask
    assert torch.all(pos_mask == want_pos_mask)
    # Indices and labels are only compared on positive anchors; entries of
    # the other anchors are zeroed by the mask before the comparison.
    masked_gt_indes = result.batch_anchor_gt_indes * pos_mask
    assert torch.all(masked_gt_indes == want_gt_indes)
    masked_gt_labels = result.batch_anchor_gt_labels * pos_mask
    assert torch.all(masked_gt_labels == want_gt_labels)