-
Notifications
You must be signed in to change notification settings - Fork 0
/
object detection
122 lines (94 loc) · 4.59 KB
/
object detection
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
class SingleStageDetector(nn.Module):
    """Single-stage object detector built from a feature extractor and a
    prediction network.

    The training-time ``forward`` pass returns one scalar loss that sums a
    confidence-score term, a box-offset regression term and a classification
    term.
    """

    def __init__(self):
        super().__init__()
        self.feat_extractor = FeatureExtractor()
        self.num_classes = 20
        self.num_bboxes = 2
        # Feed the attributes (not repeated literals) to the prediction head so
        # that it always agrees with _extract_class_scores, which reads
        # self.num_bboxes.
        # NOTE(review): 1280 is presumably the channel count produced by
        # FeatureExtractor -- confirm against its definition.
        self.pred_network = PredictionNetwork(
            1280, num_bboxes=self.num_bboxes, num_classes=self.num_classes
        )

    def forward(self, images, bboxes):
        """
        Training-time forward pass for the single-stage detector.

        Inputs:
        - images: Input images, of shape (B, 3, 224, 224)
        - bboxes: GT bounding boxes of shape (B, N, 5) (padded)

        Outputs:
        - total_loss: Torch scalar giving the total loss for the batch.
        """
        # Weights multiplied into each loss term.
        w_conf = 1  # for conf_scores
        w_reg = 1  # for offsets
        w_cls = 1  # for class_prob

        # 1. Feature extraction
        features = self.feat_extractor(images)

        # 2. Grid generator
        batch_size = images.shape[0]
        grid_list = GenerateGrid(batch_size)

        # 3. Prediction network
        # Output shapes: (B, A, 4, H, W), (B, A, H, W), (B, C, H, W)
        bbox_xywh, conf_scores, cls_scores = self.pred_network(features)
        B, A, _, H, W = bbox_xywh.shape
        bbox_xywh = bbox_xywh.permute(0, 1, 3, 4, 2)  # (B, A, H, W, 4)
        assert bbox_xywh.max() <= 1 and bbox_xywh.min() >= -0.5, 'invalid offsets values'

        # 4. Calculate the proposals
        proposals = GenerateProposal(grid_list, bbox_xywh)

        # 5. Compute IoU between proposals and ground-truth boxes
        iou_mat = IoU(proposals, bboxes)

        # 6. Determine the activated / negative boxes and their GT targets
        activated_anc_ind, negative_anc_ind, GT_conf_scores, GT_offsets, GT_class, _, _ = \
            ReferenceOnActivatedBboxes(bbox_xywh, bboxes, grid_list, iou_mat, neg_thresh=0.3)

        # 7. Select the predictions that belong to those boxes
        conf_scores = conf_scores.view(B, A, 1, H, W)
        pos = self._extract_bbox_data(conf_scores, activated_anc_ind)
        neg = self._extract_bbox_data(conf_scores, negative_anc_ind)
        conf_scores = torch.cat([pos, neg], dim=0)

        # The regression term uses sqrt(w), sqrt(h). Build a new tensor instead
        # of assigning into bbox_xywh: bbox_xywh is a view of the network
        # output, and an in-place write would overwrite the predictions and can
        # invalidate values autograd saved for the backward pass.
        sqrt_wh = torch.sqrt(bbox_xywh[..., 2:4])
        bbox_for_loss = torch.cat([bbox_xywh[..., :2], sqrt_wh], dim=-1)  # (B, A, H, W, 4)
        offsets = self._extract_bbox_data(
            bbox_for_loss.permute(0, 1, 4, 2, 3), activated_anc_ind
        )
        cls_scores = self._extract_class_scores(cls_scores, activated_anc_ind)

        # Number of predicted boxes per image (A * H * W), kept as a 0-dim
        # tensor as before; passed to ObjectClassification.
        anc_per_img = torch.prod(torch.tensor((A, H, W)))

        # 8. Compute losses
        conf_loss = ConfScoreRegression(conf_scores, GT_conf_scores)
        reg_loss = BboxRegression(offsets, GT_offsets)
        cls_loss = ObjectClassification(
            cls_scores, GT_class, batch_size, anc_per_img, activated_anc_ind
        )
        total_loss = (w_conf * conf_loss) + (w_reg * reg_loss) + (w_cls * cls_loss)

        print('(weighted) conf loss: {:.4f}, reg loss: {:.4f}, cls loss: {:.4f}'.format(conf_loss, reg_loss, cls_loss))
        return total_loss

    def inference(self):
        """Inference-time pass; not implemented in this class."""
        raise NotImplementedError

    def _extract_bbox_data(self, bbox_data, bbox_idx):
        """
        Inputs:
        - bbox_data: Tensor of shape (B, A, D, H, W) giving a vector of length
          D for each of A bboxes at each point in an H x W grid.
        - bbox_idx: int64 Tensor of shape (M,) giving bbox indices to extract

        Returns:
        - extracted_bboxes: Tensor of shape (M, D) giving bbox data for each
          of the bboxes specified by bbox_idx.
        """
        D = bbox_data.shape[2]
        # Move D last so that each row of the flattened tensor is one bbox,
        # with rows ordered by (b, a, h, w). reshape copies when the permuted
        # layout is not contiguous.
        flat = bbox_data.permute(0, 1, 3, 4, 2).reshape(-1, D)
        return flat[bbox_idx]

    def _extract_class_scores(self, all_scores, bbox_idx):
        """
        Inputs:
        - all_scores: Tensor of shape (B, C, H, W) giving classification scores for
          C classes at each point in an H x W grid.
        - bbox_idx: int64 Tensor of shape (M,) giving the indices of bboxes at
          which to extract classification scores

        Returns:
        - extracted_scores: Tensor of shape (M, C) giving the classification scores
          for each of the bboxes specified by bbox_idx.
        """
        B, C, H, W = all_scores.shape
        A = self.num_bboxes
        per_cell = all_scores.permute(0, 2, 3, 1)  # (B, H, W, C)
        # Class scores are predicted once per grid cell; repeat them for each
        # of the A boxes of that cell so the rows line up with the
        # (b, a, h, w) ordering used by bbox_idx.
        per_box = per_cell.unsqueeze(1).expand(B, A, H, W, C)
        flat = per_box.reshape(B * A * H * W, C)
        return flat[bbox_idx]