# Object detection

class SingleStageDetector(nn.Module):
  """
  Single-stage object detector.

  Combines a feature extractor with a prediction network and, at training
  time, turns the predictions into a single scalar loss made of a confidence
  term, a box-regression term and a classification term.
  """

  def __init__(self):
    super().__init__()

    self.feat_extractor = FeatureExtractor()
    self.num_classes = 20
    self.num_bboxes = 2
    # Pass self.num_bboxes rather than repeating the literal so the prediction
    # head and _extract_class_scores (which reads self.num_bboxes) cannot
    # drift apart.
    # NOTE(review): 1280 is presumably the channel count produced by
    # FeatureExtractor -- confirm against its definition.
    self.pred_network = PredictionNetwork(1280, num_bboxes=self.num_bboxes,
                                          num_classes=self.num_classes)

  def forward(self, images, bboxes):
    """
    Training-time forward pass for the single-stage detector.

    Inputs:
    - images: Input images, of shape (B, 3, 224, 224)
    - bboxes: GT bounding boxes of shape (B, N, 5) (padded)

    Outputs:
    - total_loss: Torch scalar giving the total loss for the batch.
    """
    # Weights multiplied into each loss term.
    w_conf = 1  # for conf_scores
    w_reg = 1   # for offsets
    w_cls = 1   # for class_prob

    # 1. Feature extraction
    features = self.feat_extractor(images)

    # 2. Grid generator
    grid_list = GenerateGrid(images.shape[0])

    # 3. Prediction network
    bbox_xywh, conf_scores, cls_scores = self.pred_network(features)
    # (B, A, 4, H, W), (B, A, H, W), (B, C, H, W)

    B, A, _, H, W = bbox_xywh.shape
    bbox_xywh = bbox_xywh.permute(0, 1, 3, 4, 2)  # (B, A, H, W, 4)

    assert bbox_xywh.max() <= 1 and bbox_xywh.min() >= -0.5, 'invalid offsets values'

    # 4. Calculate the proposals
    proposals = GenerateProposal(grid_list, bbox_xywh)

    # 5. Compute IoU between proposals and GT boxes
    iou_mat = IoU(proposals, bboxes)

    # 6. Determine the activated / negative bboxes and their GT targets
    activated_anc_ind, negative_anc_ind, GT_conf_scores, GT_offsets, GT_class, _, _ \
      = ReferenceOnActivatedBboxes(bbox_xywh, bboxes, grid_list, iou_mat, neg_thresh=0.3)

    # 7. Using the indices, select the matching conf_scores, offsets, cls_scores
    conf_scores = conf_scores.view(B, A, 1, H, W)
    pos = self._extract_bbox_data(conf_scores, activated_anc_ind)
    neg = self._extract_bbox_data(conf_scores, negative_anc_ind)
    conf_scores = torch.cat([pos, neg], dim=0)

    # The regression loss is taken on the square root of the last two box
    # components. Work on a clone instead of writing into bbox_xywh: that
    # tensor is a view of the prediction network's output, and modifying it
    # in place can invalidate values autograd saved for the backward pass.
    loss_xywh = bbox_xywh.clone()
    loss_xywh[..., 2:4] = torch.sqrt(bbox_xywh[..., 2:4])

    # _extract_bbox_data expects (B, A, D, H, W), so move D back to dim 2.
    offsets = self._extract_bbox_data(loss_xywh.permute(0, 1, 4, 2, 3), activated_anc_ind)
    cls_scores = self._extract_class_scores(cls_scores, activated_anc_ind)
    # A * H * W; used as an argument in ObjectClassification
    anc_per_img = torch.prod(torch.tensor(bbox_xywh.shape[1:-1]))

    # 8. Compute losses
    batch_size = images.shape[0]
    conf_loss = ConfScoreRegression(conf_scores, GT_conf_scores)
    reg_loss = BboxRegression(offsets, GT_offsets)
    cls_loss = ObjectClassification(cls_scores, GT_class, batch_size, anc_per_img, activated_anc_ind)

    weighted_conf = w_conf * conf_loss
    weighted_reg = w_reg * reg_loss
    weighted_cls = w_cls * cls_loss
    total_loss = weighted_conf + weighted_reg + weighted_cls

    # Report the weighted terms so the values match the "(weighted)" label.
    print('(weighted) conf loss: {:.4f}, reg loss: {:.4f}, cls loss: {:.4f}'.format(weighted_conf, weighted_reg, weighted_cls))

    return total_loss

  def inference(self):
    """Inference is not implemented for this detector."""
    raise NotImplementedError

  def _extract_bbox_data(self, bbox_data, bbox_idx):
    """
    Gather the per-bbox vectors selected by flat bbox indices.

    Inputs:
    - bbox_data: Tensor of shape (B, A, D, H, W) giving a vector of length
      D for each of A bboxes at each point in an H x W grid.
    - bbox_idx: int64 Tensor of shape (M,) giving bbox indices to extract;
      indices address the flattened (B, A, H, W) layout.

    Returns:
    - extracted_bboxes: Tensor of shape (M, D) giving bbox data for each
      of the bboxes specified by bbox_idx.
    """
    D = bbox_data.shape[2]
    # Move D last so that each row of the flattened tensor is one bbox.
    flat = bbox_data.permute(0, 1, 3, 4, 2).contiguous().view(-1, D)
    return flat[bbox_idx]

  def _extract_class_scores(self, all_scores, bbox_idx):
    """
    Gather classification scores for the bboxes selected by flat indices.

    Inputs:
    - all_scores: Tensor of shape (B, C, H, W) giving classification scores for
      C classes at each point in an H x W grid.
    - bbox_idx: int64 Tensor of shape (M,) giving the indices of bboxes at
      which to extract classification scores

    Returns:
    - extracted_scores: Tensor of shape (M, C) giving the classification scores
      for each of the bboxes specified by bbox_idx.
    """
    B, C, H, W = all_scores.shape
    A = self.num_bboxes
    # Class scores are per grid cell, so the same C-vector is shared by all A
    # bboxes of a cell: put C last, then broadcast across the bbox dimension.
    all_scores = all_scores.permute(0, 2, 3, 1).contiguous()
    all_scores = all_scores.view(B, 1, H, W, C).expand(B, A, H, W, C)
    all_scores = all_scores.reshape(B * A * H * W, C)
    return all_scores[bbox_idx]