Added in yolo script for Auto Annotation #794

Merged 1 commit on Oct 28, 2019
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
https://github.com/opencv/cvat/issues/750).
- Changed REST API: removed PUT and added DELETE methods for /api/v1/users/ID.
- Added Mask-RCNN Auto Annotation Script
- Added YOLO Auto Annotation Script

### Changed
-
22 changes: 22 additions & 0 deletions utils/open_model_zoo/yolov3/README.md
@@ -0,0 +1,22 @@
# Object Detection YOLO V3 Python Demo, Async API Performance Showcase

See [these instructions][1] for converting the YOLO weights to the OpenVINO format.

As of OpenVINO 2019 R3, only TensorFlow 1.13 and NetworkX 2.3 are supported.
These can be installed explicitly using the following command.

```bash
$ pip3 install tensorflow==1.13 networkx==2.3
```


Additionally, at the time of writing, the Model Optimizer required an explicit input shape.

```bash
$ python3 mo_tf.py \
--input_model /path/to/yolo_v3.pb \
--tensorflow_use_custom_operations_config $MO_ROOT/extensions/front/tf/yolo_v3.json \
--input_shape [1,416,416,3]
```

[1]: https://docs.openvinotoolkit.org/latest/_docs_MO_DG_prepare_model_convert_model_tf_specific_Convert_YOLO_From_Tensorflow.html
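
Once converted, the IR can be loaded with the OpenVINO Python API. Below is a minimal sketch, not part of this PR: the file names `yolo_v3.xml`/`yolo_v3.bin`, the `CPU` device, and the sample image path are assumptions, and the API shown is the 2019-era `IECore`.

```python
# A rough sketch (not from this PR): load the converted IR and run one frame.
import cv2
from openvino.inference_engine import IECore, IENetwork

ie = IECore()
net = IENetwork(model='yolo_v3.xml', weights='yolo_v3.bin')  # assumed file names
exec_net = ie.load_network(network=net, device_name='CPU')

input_name = next(iter(net.inputs))  # single NCHW input, [1, 3, 416, 416]
frame = cv2.imread('example.jpg')    # assumed sample image
blob = cv2.resize(frame, (416, 416)).transpose(2, 0, 1)[None, ...]

# One output blob per YoloRegion layer, keyed by layer name --
# the same name/blob pairs that interp.py below iterates over.
outputs = exec_net.infer({input_name: blob})
```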
160 changes: 160 additions & 0 deletions utils/open_model_zoo/yolov3/interp.py
@@ -0,0 +1,160 @@
from math import exp


class Parser:
    IOU_THRESHOLD = 0.4
    PROB_THRESHOLD = 0.5

    def __init__(self):
        self.objects = []

    def scale_bbox(self, x, y, h, w, class_id, confidence, h_scale, w_scale):
        xmin = int((x - w / 2) * w_scale)
        ymin = int((y - h / 2) * h_scale)
        xmax = int(xmin + w * w_scale)
        ymax = int(ymin + h * h_scale)

        return dict(xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax, class_id=class_id, confidence=confidence)

    def entry_index(self, side, coord, classes, location, entry):
        # The region output is flattened from shape [num * (coord + classes + 1), side, side];
        # recover the flat offset of `entry` for the anchor and spatial cell encoded in `location`.
        side_power_2 = side ** 2
        n = location // side_power_2
        loc = location % side_power_2
        return int(side_power_2 * (n * (coord + classes + 1) + entry) + loc)

    def intersection_over_union(self, box_1, box_2):
        width_of_overlap_area = min(box_1['xmax'], box_2['xmax']) - max(box_1['xmin'], box_2['xmin'])
        height_of_overlap_area = min(box_1['ymax'], box_2['ymax']) - max(box_1['ymin'], box_2['ymin'])
        if width_of_overlap_area < 0 or height_of_overlap_area < 0:
            area_of_overlap = 0
        else:
            area_of_overlap = width_of_overlap_area * height_of_overlap_area
        box_1_area = (box_1['ymax'] - box_1['ymin']) * (box_1['xmax'] - box_1['xmin'])
        box_2_area = (box_2['ymax'] - box_2['ymin']) * (box_2['xmax'] - box_2['xmin'])
        area_of_union = box_1_area + box_2_area - area_of_overlap
        if area_of_union == 0:
            return 0
        return area_of_overlap / area_of_union

    def sort_objects(self):
        # Greedy non-maximum suppression: zero out the confidence of any box that
        # overlaps a higher-confidence box by more than IOU_THRESHOLD.
        self.objects = sorted(self.objects, key=lambda obj: obj['confidence'], reverse=True)

        for i in range(len(self.objects)):
            if self.objects[i]['confidence'] == 0:
                continue
            for j in range(i + 1, len(self.objects)):
                if self.intersection_over_union(self.objects[i], self.objects[j]) > self.IOU_THRESHOLD:
                    self.objects[j]['confidence'] = 0

    def parse_yolo_region(self, blob: 'np.ndarray', original_shape: list, params: dict) -> None:

        # YOLO magic numbers
        # See: https://github.com/opencv/open_model_zoo/blob/acf297c73db8cb3f68791ae1fad4a7cc4a6039e5/demos/python_demos/object_detection_demo_yolov3_async/object_detection_demo_yolov3_async.py#L61
        num = 3
        coords = 4
        classes = 80
        # -----------------

        _, _, out_blob_h, out_blob_w = blob.shape
        assert out_blob_w == out_blob_h, "Invalid size of output blob. It should be in NCHW layout and height should " \
                                         "be equal to width. Current height = {}, current width = {}" \
                                         "".format(out_blob_h, out_blob_w)

        # ------ Extracting layer parameters ------
        orig_im_h, orig_im_w = original_shape
        predictions = blob.flatten()
        side_square = params['side'] * params['side']

        # ------ Parsing YOLO Region output ------
        for i in range(side_square):
            row = i // params['side']
            col = i % params['side']
            for n in range(num):
                obj_index = self.entry_index(params['side'], coords, classes, n * side_square + i, coords)
                scale = predictions[obj_index]
                if scale < self.PROB_THRESHOLD:
                    continue
                box_index = self.entry_index(params['side'], coords, classes, n * side_square + i, 0)

                # The network produces location predictions in absolute coordinates of feature maps.
                # Scale them to coordinates relative to the input size.
                x = (col + predictions[box_index + 0 * side_square]) / params['side']
                y = (row + predictions[box_index + 1 * side_square]) / params['side']
                # The argument to exp can be a very large number, so guard against overflow here.
                try:
                    w_exp = exp(predictions[box_index + 2 * side_square])
                    h_exp = exp(predictions[box_index + 3 * side_square])
                except OverflowError:
                    continue

                w = w_exp * params['anchors'][2 * n] / 416
                h = h_exp * params['anchors'][2 * n + 1] / 416
                for j in range(classes):
                    class_index = self.entry_index(params['side'], coords, classes, n * side_square + i,
                                                   coords + 1 + j)
                    confidence = scale * predictions[class_index]
                    if confidence < self.PROB_THRESHOLD:
                        continue

                    self.objects.append(self.scale_bbox(x=x,
                                                        y=y,
                                                        h=h,
                                                        w=w,
                                                        class_id=j,
                                                        confidence=confidence,
                                                        h_scale=orig_im_h,
                                                        w_scale=orig_im_w))


# `detections` and `results` are provided by the CVAT auto annotation engine.
for detection in detections:
    frame_number = detection['frame_id']
    height = detection['frame_height']
    width = detection['frame_width']
    detection = detection['detections']

    # parse_yolo_region unpacks this as (orig_im_h, orig_im_w)
    original_shape = (height, width)

    # https://github.com/opencv/open_model_zoo/blob/master/demos/python_demos/object_detection_demo_yolov3_async/object_detection_demo_yolov3_async.py#L72
    anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
    conv_6 = {'side': 13, 'mask': [6, 7, 8]}
    conv_14 = {'side': 26, 'mask': [3, 4, 5]}
    conv_22 = {'side': 52, 'mask': [0, 1, 2]}

    yolo_params = {'detector/yolo-v3/Conv_6/BiasAdd/YoloRegion': conv_6,
                   'detector/yolo-v3/Conv_14/BiasAdd/YoloRegion': conv_14,
                   'detector/yolo-v3/Conv_22/BiasAdd/YoloRegion': conv_22}

    # Each region layer uses only its own subset of the anchors, selected by mask.
    for conv_net in yolo_params.values():
        mask = conv_net['mask']
        masked_anchors = []
        for idx in mask:
            masked_anchors += [anchors[idx * 2], anchors[idx * 2 + 1]]

        conv_net['anchors'] = masked_anchors

    parser = Parser()

    for name, blob in detection.items():
        parser.parse_yolo_region(blob, original_shape, yolo_params[name])

    parser.sort_objects()

    for obj in parser.objects:
        if obj['confidence'] >= parser.PROB_THRESHOLD:
            label = obj['class_id']
            xmin = obj['xmin']
            xmax = obj['xmax']
            ymin = obj['ymin']
            ymax = obj['ymax']

            results.add_box(xmax, ymax, xmin, ymin, label, frame_number)
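
For a quick standalone sanity check of `Parser` outside CVAT, one can feed it a synthetic region blob. This is a hypothetical snippet, not part of the PR; numpy and the all-zero input are assumptions, with the `side` and anchor values mirroring `conv_6` above.

```python
# Hypothetical check: an all-zero region blob should yield no detections,
# since every objectness score falls below PROB_THRESHOLD.
import numpy as np

params = {'side': 13, 'anchors': [116, 90, 156, 198, 373, 326]}  # conv_6 values
blob = np.zeros((1, 255, 13, 13), dtype=np.float32)  # 255 = num * (coords + classes + 1)

parser = Parser()
parser.parse_yolo_region(blob, (416, 416), params)
parser.sort_objects()
assert parser.objects == []
```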
84 changes: 84 additions & 0 deletions utils/open_model_zoo/yolov3/mapping.json
@@ -0,0 +1,84 @@
{
"label_map": {
"1": "person",
"2": "bicycle",
"3": "car",
"4": "motorbike",
"5": "aeroplane",
"6": "bus",
"7": "train",
"8": "truck",
"9": "boat",
"10": "traffic light",
"11": "fire hydrant",
"12": "stop sign",
"13": "parking meter",
"14": "bench",
"15": "bird",
"16": "cat",
"17": "dog",
"18": "horse",
"19": "sheep",
"20": "cow",
"21": "elephant",
"22": "bear",
"23": "zebra",
"24": "giraffe",
"25": "backpack",
"26": "umbrella",
"27": "handbag",
"28": "tie",
"29": "suitcase",
"30": "frisbee",
"31": "skis",
"32": "snowboard",
"33": "sports ball",
"34": "kite",
"35": "baseball bat",
"36": "baseball glove",
"37": "skateboard",
"38": "surfboard",
"39": "tennis racket",
"40": "bottle",
"41": "wine glass",
"42": "cup",
"43": "fork",
"44": "knife",
"45": "spoon",
"46": "bowl",
"47": "banana",
"48": "apple",
"49": "sandwich",
"50": "orange",
"51": "broccoli",
"52": "carrot",
"53": "hot dog",
"54": "pizza",
"55": "donut",
"56": "cake",
"57": "chair",
"58": "sofa",
"59": "pottedplant",
"60": "bed",
"61": "diningtable",
"62": "toilet",
"63": "tvmonitor",
"64": "laptop",
"65": "mouse",
"66": "remote",
"67": "keyboard",
"68": "cell phone",
"69": "microwave",
"70": "oven",
"71": "toaster",
"72": "sink",
"73": "refrigerator",
"74": "book",
"75": "clock",
"76": "vase",
"77": "scissors",
"78": "teddy bear",
"79": "hair drier",
"80": "toothbrush"
}
}