First stable version (#1)

* use a median filter to smooth the output heat map
* add DLA Lite version; fix a critical bug in calculating accuracy
* remove the smoothing convolution due to extremely slow execution time
* Get centroid, bounding box and probability
* replace the block model with a function; fix a bug in train
* wrap the post-processing into the saved model
biendltb · Dec 12, 2019 · b06221e · b06221e
1 parent 8b8bac7
commit b06221e
Show file tree

Hide file tree

Showing 7 changed files with 1,274 additions and 31 deletions.
diff --git a/src/models/centernet.py b/src/models/centernet.py
@@ -6,7 +6,7 @@
 
 
 class Centernet:
-    def __init__(self, dataset_fn: Callable = load_all_ds, network_fn: Callable = dla.dla_net,
+    def __init__(self, dataset_fn: Callable = load_all_ds, network_fn: Callable = dla.dla_lite_net,
                  lr: float = 1e-4,
                  dataset_args: Dict = None):
 
@@ -25,9 +25,13 @@ def forward_pass(self, thermal_mat, heat_map):
         # use L2 loss
         loss = tf.reduce_mean(tf.square(out_map - heat_map))
 
+        # smooth the output heat map
+        # median_filter = tf.ones((3, 3, 1, 1)) * 1/9
+        # out_map = tf.nn.conv2d(out_map, median_filter, strides=1, padding='SAME')
+
         # calculate the l2 distance to the ground truth centroid
-        gt_pnt = tf.unravel_index(tf.math.argmax(tf.reshape(heat_map, [-1])), tf.cast(tf.shape(heat_map), tf.int64))
-        out_pnt = tf.unravel_index(tf.math.argmax(tf.reshape(out_map, [-1])), tf.cast(tf.shape(out_map), tf.int64))
-        dist = tf.linalg.norm(tf.cast(out_pnt - gt_pnt, tf.float64))
+        gt_pnt = tf.unravel_index(tf.math.argmax(tf.reshape(heat_map, [heat_map.shape[0], -1]), axis=1), tf.cast(tf.shape(heat_map), tf.int64)[1:3])
+        out_pnt = tf.unravel_index(tf.math.argmax(tf.reshape(out_map, [out_map.shape[0], -1]), axis=1), tf.cast(tf.shape(out_map), tf.int64)[1:3])
+        dist = tf.reduce_mean(tf.math.sqrt(tf.cast(tf.reduce_sum(tf.math.square(out_pnt - gt_pnt), axis=0), tf.float32)))
 
         return loss, dist
diff --git a/src/models/eval.py b/src/models/eval.py
@@ -2,33 +2,40 @@
 import h5py
 import numpy as np
 import cv2
+import time
 
-from src.networks.dla import dla_net
+from src.networks import dla
 from src.utils import helpers
 
 
 class ThermalEval:
     def __init__(self):
-        self.model = dla_net()
+        self.model = dla.dla_lite_net(mode='eval')
 
     def load_ckpt(self, ckpt_path):
         checkpoint = tf.train.Checkpoint(model=self.model)
         checkpoint.restore(ckpt_path)
 
     def infer_frame(self, thermal_frame):
-        heat_map = self.model.predict(thermal_frame)
+        outs = self.model.predict(thermal_frame, batch_size=1)[0]
 
-        _map = heat_map[0, :, :, 0]
+        # _map = outs[0, :, :, 0]
+        # outs = helpers.heatmap_to_point(_map)
 
-        key_point = helpers.heatmap_to_point(_map)
+        max_y, max_x, bb_h, bb_w, prob = outs
+        max_y, max_x = int(max_y), int(max_x)
 
-        return key_point
+        print(prob)
+
+        return (max_y, max_x), (bb_h, bb_w), prob
 
     def infer_video(self, thermal_path):
         with h5py.File(thermal_path) as f:
             # get number of databases - frame count database
             n_thermal_frames = len(f.keys()) - 1
 
+            exec_time = []
+
             # grab thermal and visual frames one-by-one for processing
             for i in range(n_thermal_frames):
                 key = 'frame{}'.format(i)
@@ -39,23 +46,48 @@ def infer_video(self, thermal_path):
                 thermal_frame = np.expand_dims(thermal_frame, axis=-1)
                 thermal_frame = np.expand_dims(thermal_frame, axis=0)
 
-                key_point = self.infer_frame(thermal_frame)
+                start = time.time()
+                key_point, bb_size, prob = self.infer_frame(thermal_frame)
+                exec_time.append(time.time()-start)
 
                 vis_frame = cv2.cvtColor(thermal_gray, cv2.COLOR_GRAY2BGR)
-                vis_frame[key_point] = (0, 0, 255)
+
+                if prob > 0.1:
+                    vis_frame[key_point] = (0, 255, 0)
+
+                    # draw bounding boxes
+                    bb_h, bb_w = bb_size
+                    kp_y, kp_x = key_point
+
+                    x1, y1 = int(round(kp_x - bb_w / 2)), int(round(kp_y - bb_h / 2))
+                    x2, y2 = int(round(x1 + bb_w)), int(round(y1 + bb_h))
+                    vis_frame = cv2.rectangle(vis_frame, (x1, y1), (x2, y2), (0, 0, 255))
+
                 scale = 4
                 vis_frame = cv2.resize(vis_frame, (vis_frame.shape[1] * scale, vis_frame.shape[0] * scale))
 
                 cv2.imshow('test', vis_frame)
                 if cv2.waitKey(1) & 0xFF == ord('q'):  # 5 ms
                     break
+            print('Average execution time: {:.2f}ms | {:.2f} fps'.format(np.mean(exec_time) * 1000, 1/np.mean(exec_time)))
+
+    def save_model(self, save_path):
+        self.model.save(save_path, save_format='tf')
 
 
 if __name__ == '__main__':
     thermal_video = '/media/biendltb/6e1ef38e-db2f-4eda-ad11-31252df3b87b/data/HD/thermal/anhdnt/set1_0.hdf5'
-    ckpt_path = '/media/biendltb/6e1ef38e-db2f-4eda-ad11-31252df3b87b/data/model_gym/centernet/model_1/ckpts/ckpt-9'
+    # ckpt_path = '/media/biendltb/6e1ef38e-db2f-4eda-ad11-31252df3b87b/data/model_gym/centernet/model_1/ckpts/ckpt-9'
+    # ckpt_path = '/media/biendltb/6e1ef38e-db2f-4eda-ad11-31252df3b87b/data/model_gym/centernet/model_3/ckpts/ckpt-24'
+    # ckpt_path = '/media/biendltb/6e1ef38e-db2f-4eda-ad11-31252df3b87b/data/model_gym/centernet/model_6/ckpts/ckpt-8'
+    # ckpt_path = '/media/biendltb/6e1ef38e-db2f-4eda-ad11-31252df3b87b/data/model_gym/centernet/model_7/ckpts/ckpt-20'
+    # ckpt_path = '/media/biendltb/6e1ef38e-db2f-4eda-ad11-31252df3b87b/data/model_gym/centernet/model_8/ckpts/ckpt-20'
+    ckpt_path = '/media/biendltb/6e1ef38e-db2f-4eda-ad11-31252df3b87b/data/model_gym/centernet/model_9/ckpts/ckpt-12'
+
     thermal_model = ThermalEval()
     thermal_model.load_ckpt(ckpt_path)
 
     thermal_model.infer_video(thermal_video)
 
+    # save_path = '/media/biendltb/6e1ef38e-db2f-4eda-ad11-31252df3b87b/data/model_gym/centernet/model_9/export/'
+    # thermal_model.save_model(save_path)
diff --git a/src/models/train.py b/src/models/train.py
@@ -10,6 +10,7 @@
 from src.models.centernet import Centernet
 from src.utils.path_cvt import get_path_to_vis_ims, get_path_to_ckpts
 from src.datasets.thermal_dataset import load_vis_data
+from src.utils import helpers
 
 # -------> For RTX NVIDIA GPU only
 from tensorflow.compat.v1 import ConfigProto
@@ -134,7 +135,13 @@ def generate_and_save_images(self, epoch):
         plt.figure(figsize=(9, 9))
 
         for i in range(predictions.shape[0]):
-            fig = predictions[i, :, :, 0] + self.visualized_eval_images[i, :, :, 0]
+            h_map = predictions[i, :, :, 0]
+            fig = h_map + self.visualized_eval_images[i, :, :, 0]
+
+            # plot the keypoint on image
+            keypoint, bb_size, _ = helpers.heatmap_to_point(h_map)
+            fig[keypoint] = 0
+
             cmap = plt.cm.viridis
             norm = plt.Normalize(vmin=fig.min(), vmax=fig.max())
             image = cmap(norm(fig))
@@ -149,8 +156,8 @@ def generate_and_save_images(self, epoch):
 
 if __name__ == '__main__':
     trainer = ModelTrain(
-        epochs=5000,
-        batch_size=2,
+        epochs=500,
+        batch_size=32,
         use_wandb=True
     )
 

diff --git a/src/networks/dla.py b/src/networks/dla.py
@@ -36,7 +36,7 @@ def _max_pooling(x, pool_size, strides):
 
 
 def _avg_pooling(x, pool_size, strides):
-    return tf.keras.layers.AveragePooling2D(pool_size=pool_size, strides=strides)(x)
+    return tf.keras.layers.AveragePooling2D(pool_size=pool_size, strides=strides, padding='same')(x)
 
 
 class BasicBlock(tf.keras.Model):
@@ -78,11 +78,34 @@ def call(self, x):
         return x
 
 
+def _basic_block(x, filters, kernel_size=3, strides=1):
+    """Build a residual block with the Keras functional API.
+
+    Main path: ``_conv`` (helper defined elsewhere in this module), then a
+    bias-free ``Conv2D`` with ``strides`` and batch normalisation. The shortcut
+    is added to the main path before the final ReLU.
+
+    Args:
+        x: 4-D input tensor; its channel count is read from ``x.shape[3]``.
+        filters: number of filters produced by the block.
+        kernel_size: kernel size used by both convolutions on the main path.
+        strides: strides of the second convolution on the main path.
+
+    Returns:
+        The output tensor of the block.
+    """
+    input_filters = x.shape[3]
+
+    # The channel count and the strides are known while the model is being
+    # built, so the shortcut is chosen with a plain Python branch instead of
+    # ``tf.cond`` (which would trace both branches in graph mode).
+    if input_filters == filters and strides == 1:
+        residual = x
+    else:
+        # 1x1 projection so the shortcut matches the main path in both channel
+        # count and, when strides != 1, spatial size. With the default
+        # strides=1 this is the same layer the block always used.
+        residual = tf.keras.layers.Conv2D(filters=filters, kernel_size=1, strides=strides,
+                                          padding='same', use_bias=False)(x)
+
+    x = _conv(x, filters=filters, kernel_size=kernel_size)
+
+    x = tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, strides=strides,
+                               padding='same', use_bias=False)(x)
+    x = tf.keras.layers.BatchNormalization()(x)
+
+    x += residual
+    x = tf.keras.layers.ReLU()(x)
+
+    return x
+
+
 # modified from Stick-To
 def _dla_generator(bottom, filters, levels):
     if levels == 1:
-        block1 = BasicBlock(filters=filters)(bottom)
-        block2 = BasicBlock(filters=filters)(block1)
+        block1 = _basic_block(bottom, filters)  # BasicBlock(filters=filters)(bottom)
+        block2 = _basic_block(block1, filters)  # BasicBlock(filters=filters)(block1)
         aggregation = block1 + block2
         aggregation = _conv(aggregation, filters, kernel_size=3)
     else:
@@ -156,6 +179,110 @@ def dla_net():
     return model
 
 
+def heatmap_to_point(heatmaps_tensor, batch_size=1):
+    """Convert heat maps to a centroid, bounding-box size and probability in TensorFlow.
+
+    The heat map is smoothed with a 3x3 Gaussian kernel. The peak column and
+    row are the argmax of the column sums and row sums of the smoothed map.
+    The probability is the maximum of the *unsmoothed* map in the 5x5 window
+    centred on the peak, clipped to [0, 0.99999]. The box height/width are
+    estimated from the smoothed values along the peak column/row.
+
+    Args:
+        heatmaps_tensor: tensor of shape batch_size * h * w * channel. The
+            smoothing kernel has one input channel, so channel must be 1.
+        batch_size: kept for backward compatibility only. The batch dimension
+            is now taken from the tensor itself, so any batch size works.
+
+    Returns:
+        Tensor of shape batch * 5 holding (max_y, max_x, bb_h, bb_w, prob)
+        for every sample.
+    """
+
+    gaussian_kernel = tf.constant([
+        [1, 2, 1],
+        [2, 4, 2],
+        [1, 2, 1]
+    ], dtype=tf.float32) / 16.0
+
+    filters = gaussian_kernel[:, :, tf.newaxis, tf.newaxis]
+
+    original_tensor = heatmaps_tensor
+
+    heatmaps_tensor = tf.nn.conv2d(heatmaps_tensor, filters, strides=1, padding="SAME")
+
+    h, w = heatmaps_tensor.shape[1:3]
+
+    max_x = tf.math.argmax(tf.math.reduce_sum(heatmaps_tensor, axis=1), axis=1, output_type=tf.int32)[:, 0]
+    max_y = tf.math.argmax(tf.math.reduce_sum(heatmaps_tensor, axis=2), axis=1, output_type=tf.int32)[:, 0]
+
+    # Maximum of the raw map in the 5x5 window around the peak. A SAME-padded
+    # max pool keeps the window inside the image, so a peak closer than two
+    # pixels to a border no longer produces an out-of-range slice.
+    local_max = tf.nn.max_pool2d(original_tensor[..., :1], ksize=5, strides=1, padding='SAME')[..., 0]
+    probs = tf.gather(tf.gather(local_max, max_y, axis=1, batch_dims=1), max_x, axis=1, batch_dims=1)
+    probs = tf.clip_by_value(probs, clip_value_min=0, clip_value_max=0.99999)
+
+    smoothed = heatmaps_tensor[..., 0]
+
+    def _spread(pos_diff, values):
+        # sqrt(|d^2 / (2 * ln v)|) per position. Non-positive values are mapped
+        # to ln(1) = 0 and a zero denominator yields 0, so such positions
+        # contribute nothing instead of poisoning the mean with NaN/inf.
+        log_values = tf.math.log(tf.where(values > 0, values, tf.ones_like(values)))
+        return tf.sqrt(tf.abs(tf.math.divide_no_nan(pos_diff, 2.0 * log_values)))
+
+    # squared normalised distance of every row to the peak row: batch * h
+    pos_diff_h = tf.cast(
+        tf.math.square((tf.range(h)[tf.newaxis, :] - max_y[:, tf.newaxis]) / (h - 1)),
+        tf.float32
+    )
+    # smoothed values along the peak column of every sample: batch * h
+    peak_column = tf.gather(smoothed, max_x, axis=2, batch_dims=1)
+    bb_h = tf.reduce_mean(_spread(pos_diff_h, peak_column), axis=1) * 2 * h
+
+    # squared normalised distance of every column to the peak column: batch * w
+    pos_diff_w = tf.cast(
+        tf.math.square((tf.range(w)[tf.newaxis, :] - max_x[:, tf.newaxis]) / (w - 1)),
+        tf.float32
+    )
+    # smoothed values along the peak row of every sample: batch * w
+    peak_row = tf.gather(smoothed, max_y, axis=1, batch_dims=1)
+    bb_w = tf.reduce_mean(_spread(pos_diff_w, peak_row), axis=1) * 2 * w
+
+    out = tf.stack([tf.cast(max_y, tf.float32), tf.cast(max_x, tf.float32), bb_h, bb_w, probs], axis=-1)
+
+    return out
+
+
+def dla_lite_net(mode='train'):
+    """Build the lite DLA Keras model.
+
+    The network uses ``base_filters = 8`` and is assembled from the module
+    helpers ``_conv``, ``_dconv``, ``_max_pooling``, ``_avg_pooling`` and
+    ``_dla_generator``. The trailing ``# 1/2``, ``# 1/4``, ``# 1/8`` comments
+    mark the resolution of each stage relative to the input.
+
+    Args:
+        mode: ``'train'`` returns a model whose output is the key-point heat
+            map. Any other value appends a Lambda layer named
+            ``'thermal_output'`` that applies ``heatmap_to_point`` (called with
+            its default arguments) to the heat map.
+
+    Returns:
+        A ``tf.keras.Model`` whose input layer is named ``'thermal_frame'``
+        and has shape ``INPUT_SHAPE``.
+    """
+    base_filters = 8
+    # channel last; None -> grayscale or color images
+    inputs = tf.keras.layers.Input(shape=INPUT_SHAPE, name='thermal_frame')
+
+    # stem: stage1 keeps the input resolution, stage2 is strided
+    x = _conv(inputs, base_filters, 7)
+    stage1 = _conv(x, base_filters * 2, 3)
+    stage2 = _conv(stage1, base_filters * 2, 3, strides=2)  # 1/2
+
+    # stage 3
+    dla_stage3 = _dla_generator(stage2, base_filters * 4, levels=1)
+    dla_stage3 = _max_pooling(dla_stage3, 2, 2)  # 1/4
+
+    # stage 4
+    dla_stage4 = _dla_generator(dla_stage3, base_filters * 8, levels=2)
+    dla_stage4 = _max_pooling(dla_stage4, 2, 2)  # 1/8
+    # shortcut from stage 3: 1x1 conv to the stage-4 filter count, then pooled
+    residual = _conv(dla_stage3, base_filters * 8, 1)
+    residual = _avg_pooling(residual, 2, 2)  # 1/8
+    dla_stage4 += residual
+
+    # decoder: each level is added to the matching encoder stage;
+    # _dconv is presumably a transposed convolution that upsamples - TODO confirm
+    dla_stage4 = _conv(dla_stage4, base_filters * 16, 1)
+    dla_stage4_3 = _dconv(dla_stage4, base_filters * 8, 4, 2)  # 1/4
+
+    dla_stage3 = _conv(dla_stage3, base_filters * 8, 1)
+    dla_stage3_3 = _conv(dla_stage3 + dla_stage4_3, base_filters * 8, 3)
+    dla_stage3_3 = _dconv(dla_stage3_3, base_filters * 4, 4, 2)  # 1/2
+
+    stage2 = _conv(stage2, base_filters * 4, 1)
+    stage2 = _conv(stage2 + dla_stage3_3, base_filters * 4, 1)
+    stage2 = _dconv(stage2, base_filters * 2, 4, 2)
+
+    stage1 = _conv(stage1, base_filters * 2, 1)
+    stage1 = _conv(stage1 + stage2, base_filters * 2, 1)
+
+    features = _conv(stage1, base_filters * 1, 1)
+
+    # separate to multiple output heads
+    keypoints = _conv(features, NUM_CLASS, 3)
+    # size = _conv(features, 2, 3, 1)
+
+    if mode == 'train':
+        # training model: raw key-point heat map
+        model = tf.keras.Model(inputs=inputs, outputs=keypoints)
+    else:
+        # every other mode: post-processing is part of the model graph
+        out = tf.keras.layers.Lambda(lambda hmap: heatmap_to_point(hmap), name='thermal_output')(keypoints)
+        model = tf.keras.Model(inputs=inputs, outputs=out)
+
+    return model
+
+