From ed735ca4c51901f72e0ecf22b6a96720f0492f9b Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Sun, 6 May 2018 21:08:02 +0800
Subject: [PATCH 01/31] Update ssd_vgg_300.py

---
 nets/ssd_vgg_300.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nets/ssd_vgg_300.py b/nets/ssd_vgg_300.py
index 46c09bf5..4a09b5db 100644
--- a/nets/ssd_vgg_300.py
+++ b/nets/ssd_vgg_300.py
@@ -188,6 +188,7 @@ def anchors(self, img_shape, dtype=np.float32):
                                   self.params.anchor_offset,
                                   dtype)

+    # Encode the ground-truth labels and bboxes with the SSD anchors; the boxes are encoded on every feature layer.
    def bboxes_encode(self, labels, bboxes, anchors,
                      scope=None):
        """Encode labels and bounding boxes.
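The comment added above summarizes what bboxes_encode does. As a rough sketch of the call pattern (mirroring how train_ssd_network.py uses it; the toy label and box below are illustrative, not from the patches):

    import tensorflow as tf
    from nets import ssd_vgg_300

    ssd_net = ssd_vgg_300.SSDNet()
    ssd_anchors = ssd_net.anchors((300, 300))        # anchors for every feature layer
    glabels = tf.constant([12], dtype=tf.int64)      # one ground-truth class
    gbboxes = tf.constant([[0.2, 0.3, 0.8, 0.9]])    # ymin/xmin/ymax/xmax in [0, 1]
    gclasses, glocalisations, gscores = ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors)
    # Each output is a list with one tensor per feature layer, aligned with ssd_anchors.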
From a3c55bbedd4bb1a8a7592e4986b3762503ae6554 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Sun, 6 May 2018 21:52:07 +0800
Subject: [PATCH 02/31] Update ssd_vgg_300.py

---
 nets/ssd_vgg_300.py | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/nets/ssd_vgg_300.py b/nets/ssd_vgg_300.py
index 4a09b5db..7b5f9d1d 100644
--- a/nets/ssd_vgg_300.py
+++ b/nets/ssd_vgg_300.py
@@ -91,6 +91,7 @@ class SSDNet(object):
          conv11 ==> 1 x 1
    The default image size used to train this network is 300x300.
    """
+    # Note what each parameter means. anchor_size_bounds does not match anchor_sizes here: from the anchor_sizes ratios we actually get Smin and Smax of [0.07, 0.87].
    default_params = SSDParams(
        img_shape=(300, 300),
        num_classes=21,
@@ -117,6 +118,7 @@ class SSDNet(object):
                       [2, .5, 3, 1./3],
                       [2, .5],
                       [2, .5]],
+        # anchor_steps is the stride, in input-image pixels, of one grid cell of each feature map, e.g. conv4_3 is (38, 38) and 300/38.0 = 7.89 ≈ 8.
        anchor_steps=[8, 16, 32, 64, 100, 300],
        anchor_offset=0.5,
        normalizations=[20, -1, -1, -1, -1, -1],
@@ -303,7 +305,10 @@ def ssd_feat_shapes_from_net(predictions, default_shapes=None):
            feat_shapes.append(shape)
    return feat_shapes

-
+# This function returns, for one feature layer, the center coordinates of every feature map cell
+# (fmc for short) relative to the original image, plus anchor heights and widths relative to that
+# feature map!!! Shapes: (y/x).shape -> (g_c, g_c) and (h/w).shape -> (n_g_c,); don't worry, later
+# tensor broadcasting expands them to (g_c, g_c, n_g_c). The feature map is assumed square
+# (width = height = g_c), and n_g_c is the number of anchors attached to each grid cell of this layer!
def ssd_anchor_one_layer(img_shape,
                         feat_shape,
                         sizes,
@@ -358,7 +363,8 @@ def ssd_anchor_one_layer(img_shape,
        w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r)
    return y, x, h, w

-
+# By appending to a list we collect, for all layers, the anchor center coordinates and the widths/heights relative to each feature map!
+# layers_anchors = [(N*38*38*4), (N*19*19*6), (N*10*10*6), (N*5*5*6), (N*3*3*4), (N*1*1*4)]; each list entry denotes the shape of the corresponding tensor!
def ssd_anchors_all_layers(img_shape,
                           layers_shape,
                           anchor_sizes,
@@ -398,7 +404,7 @@ def tensor_shape(x, rank=3):
        return [s if s is not None else d
                for s, d in zip(static_shape, dynamic_shape)]

-
+# Apply one more 3x3 conv to each of conv4_3, conv7, conv8, conv9, conv10 and conv11, turning the layer into its class and location predictions.
def ssd_multibox_layer(inputs,
                       num_classes,
                       sizes,
@@ -415,12 +421,18 @@ def ssd_multibox_layer(inputs,

    # Location.
    num_loc_pred = num_anchors * 4
+    # Convolve the selected layer (one of conv4_3, conv7, conv8, conv9, conv10, conv11); note the
+    # number of output filters: the result has shape (N, g_c, g_c, nlp), where nlp = num_anchors * 4,
+    # i.e. the localization prediction of that layer!!!
+    # For conv4_3 the feature map has shape (N, 38, 38, 512), which becomes (N, 38, 38, 4*4) here.
    loc_pred = slim.conv2d(net, num_loc_pred, [3, 3], activation_fn=None,
                           scope='conv_loc')
    loc_pred = custom_layers.channel_to_last(loc_pred)
    loc_pred = tf.reshape(loc_pred,
                          tensor_shape(loc_pred, 4)[:-1]+[num_anchors, 4])
    # Class prediction.
+    # Same conv on the selected layer; with ncp = num_anchors * num_classes output filters the
+    # result has shape (N, g_c, g_c, ncp), i.e. the class prediction of that layer!
+    # For conv4_3 the (N, 38, 38, 512) feature map becomes (N, 38, 38, 4*21).
    num_cls_pred = num_anchors * num_classes
    cls_pred = slim.conv2d(net, num_cls_pred, [3, 3], activation_fn=None,
                           scope='conv_cls')
@@ -505,6 +517,12 @@ def ssd_net(inputs,
            end_points[end_point] = net

    # Prediction and localisations layers.
+    # Run the multibox conv on each selected layer (conv4_3, conv7, conv8, conv9, conv10, conv11)
+    # to get its class prediction and box regression!
+    # predictions/logits come out as
+    # [(n,38,38,4*21), (n,19,19,6*21), (n,10,10,6*21), (n,5,5,6*21), (n,3,3,4*21), (n,1,1,4*21)],
+    # and localisations as
+    # [(n,38,38,4*4), (n,19,19,6*4), (n,10,10,6*4), (n,5,5,6*4), (n,3,3,4*4), (n,1,1,4*4)]; each list entry denotes the shape of that tensor.
    predictions = []
    logits = []
    localisations = []
@@ -576,6 +594,7 @@ def ssd_arg_scope_caffe(caffe_scope):
# =========================================================================== #
# SSD loss function.
# =========================================================================== #
+# I have not fully understood this block yet; will update the notes once I do!!!
def ssd_losses(logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,

From cfa0eb464cf8820a5c4ace0c5a9967023d34e324 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Sun, 6 May 2018 22:10:20 +0800
Subject: [PATCH 03/31] Update ssd_vgg_300.py

---
 nets/ssd_vgg_300.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/nets/ssd_vgg_300.py b/nets/ssd_vgg_300.py
index 7b5f9d1d..c867b004 100644
--- a/nets/ssd_vgg_300.py
+++ b/nets/ssd_vgg_300.py
@@ -614,16 +614,24 @@ def ssd_losses(logits, localisations,
    fgscores = []
    flocalisations = []
    fglocalisations = []
+    # We saw the shapes of logits above; now let's see how the loss handles them!
+    # logits/localisations are lists of 6 tensors, the predictions/classifications of 6 different layers,
+    # which cannot be processed directly, so we first flatten each tensor and then concat them all!
    for i in range(len(logits)):
+        # After the reshape, flogits holds shapes (N*5776,21), (N*2166,21), (N*600,21), (N*150,21), (N*36,21), (N*4,21);
+        # 5776 = 38*38*4, i.e. logits[i] is reshaped to (prod(shape[:-1]), 21).
        flogits.append(tf.reshape(logits[i], [-1, num_classes]))
        fgclasses.append(tf.reshape(gclasses[i], [-1]))
        fgscores.append(tf.reshape(gscores[i], [-1]))
+        # After the reshape, flocalisations holds shapes (N*5776,4), (N*2166,4), (N*600,4), (N*150,4), (N*36,4), (N*4,4).
        flocalisations.append(tf.reshape(localisations[i], [-1, 4]))
        fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4]))
    # And concat the crap!
+    # After the concat, logits has shape (8732*N, 21).
    logits = tf.concat(flogits, axis=0)
    gclasses = tf.concat(fgclasses, axis=0)
    gscores = tf.concat(fgscores, axis=0)
+    # localisations has shape (8732*N, 4).
    localisations = tf.concat(flocalisations, axis=0)
    glocalisations = tf.concat(fglocalisations, axis=0)
    dtype = logits.dtype
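The shape notes in patch 03 can be checked numerically; a small NumPy sketch of the flatten-and-concat bookkeeping (layer shapes as assumed in these notes, batch size N = 2):

    import numpy as np

    N, num_classes = 2, 21
    layer_shapes = [(38, 38, 4), (19, 19, 6), (10, 10, 6), (5, 5, 6), (3, 3, 4), (1, 1, 4)]
    flat = [np.zeros((N,) + s + (num_classes,)).reshape(-1, num_classes) for s in layer_shapes]
    logits = np.concatenate(flat, axis=0)
    # 38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 = 8732 anchors per image
    assert logits.shape == (8732 * N, num_classes)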
From 5cb113114f50f67bb70372efbef31773ef4358f9 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Sun, 6 May 2018 23:21:32 +0800
Subject: [PATCH 04/31] Update train_ssd_network.py

---
 train_ssd_network.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/train_ssd_network.py b/train_ssd_network.py
index c93e2b4a..6164201b 100644
--- a/train_ssd_network.py
+++ b/train_ssd_network.py
@@ -229,6 +229,8 @@ def main(_):
                common_queue_min=10 * FLAGS.batch_size,
                shuffle=True)
        # Get for SSD network: image, labels, bboxes.
+        # For notes on slim.dataset_data_provider.DatasetDataProvider see https://blog.csdn.net/weixin_35653315/article/details/71023596
+        # It returns only one image at a time; the examples must be grouped into batches for training!!!
        [image, shape, glabels, gbboxes] = provider.get(['image', 'shape',
                                                         'object/label',
                                                         'object/bbox'])

From 357d1c5615e06550de0e9869486c64aa11f71def Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Sun, 6 May 2018 23:24:20 +0800
Subject: [PATCH 05/31] Update train_ssd_network.py

---
 train_ssd_network.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/train_ssd_network.py b/train_ssd_network.py
index 6164201b..9773d156 100644
--- a/train_ssd_network.py
+++ b/train_ssd_network.py
@@ -240,6 +240,7 @@ def main(_):
                                             out_shape=ssd_shape,
                                             data_format=DATA_FORMAT)
        # Encode groundtruth labels and bboxes.
+        # Note that only a single image has been fed through at this point, only one!!!
        gclasses, glocalisations, gscores = \
            ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors)
        batch_shape = [1] + [len(ssd_anchors)] * 3
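The next patch annotates the anchor/ground-truth matching, whose core is the Jaccard (IoU) score. A minimal NumPy sketch of that computation for one box against arrays of anchor corners (a simplification of jaccard_with_anchors; the variable names are mine):

    import numpy as np

    def jaccard(box, ymin, xmin, ymax, xmax):
        """IoU between one box [ymin, xmin, ymax, xmax] and arrays of anchor corners."""
        iy = np.maximum(0., np.minimum(ymax, box[2]) - np.maximum(ymin, box[0]))
        ix = np.maximum(0., np.minimum(xmax, box[3]) - np.maximum(xmin, box[1]))
        inter = iy * ix
        union = (ymax - ymin) * (xmax - xmin) + (box[2] - box[0]) * (box[3] - box[1]) - inter
        return inter / union

    print(jaccard(np.array([0., 0., 1., 1.]),
                  np.array([0.5]), np.array([0.5]), np.array([1.5]), np.array([1.5])))
    # -> [0.14285714], i.e. 0.25 / 1.75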
From 118b6a5406aeb5d1447a84ecc14dfeadbb44a942 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Sun, 6 May 2018 23:26:27 +0800
Subject: [PATCH 06/31] Update ssd_common.py

---
 nets/ssd_common.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/nets/ssd_common.py b/nets/ssd_common.py
index 7de1a7e7..779448d4 100644
--- a/nets/ssd_common.py
+++ b/nets/ssd_common.py
@@ -44,7 +44,10 @@ def tf_ssd_bboxes_encode_layer(labels,
      (target_labels, target_localizations, target_scores): Target Tensors.
    """
    # Anchors coordinates and volume.
+    # Using the conv4_3 parameters as a running example to make this easier to follow:
+    # yref.shape: (38,38), xref.shape: (38,38), href.shape: (4,), wref.shape: (4,)
    yref, xref, href, wref = anchors_layer
+    # After this arithmetic, (ymin/ymax).shape: (38,38,4) and (xmin/xmax).shape: (38,38,4).
    ymin = yref - href / 2.
    xmin = xref - wref / 2.
    ymax = yref + href / 2.
@@ -52,6 +55,7 @@ def tf_ssd_bboxes_encode_layer(labels,
    vol_anchors = (xmax - xmin) * (ymax - ymin)

    # Initialize tensors...
+    # Again with conv4_3 as the example, shape = (38, 38, 4).
    shape = (yref.shape[0], yref.shape[1], href.size)
    feat_labels = tf.zeros(shape, dtype=tf.int64)
    feat_scores = tf.zeros(shape, dtype=dtype)
@@ -61,6 +65,7 @@ def tf_ssd_bboxes_encode_layer(labels,
    feat_ymax = tf.ones(shape, dtype=dtype)
    feat_xmax = tf.ones(shape, dtype=dtype)

+    # Again taking conv4_3 as the example: the resulting jaccard matrix has shape (38, 38, 4), with values in (0.0, 1.0), the IoU of each anchor with the ground-truth box.
    def jaccard_with_anchors(bbox):
        """Compute jaccard score between a box and the anchors.
        """
@@ -100,23 +105,33 @@ def condition(i, feat_labels, feat_scores,
    def body(i, feat_labels, feat_scores,
             feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Body: update feature labels, scores and bboxes.
+        # Meaning: assign a value whenever the IoU exceeds 0.5; an anchor keeps being updated until it holds the maximum over all ground-truth boxes!
        Follow the original SSD paper for that purpose:
          - assign values when jaccard > 0.5;
          - only update if beat the score of other bboxes.
        """
        # Jaccard score.
+        # labels/bboxes here are the handful of ground truths of one image; an anchor matches at most one ground-truth box, while one ground-truth box may match many anchors!
+        # label is the class of the current ground-truth box. Because of how cond/body iterate, all anchors are compared against
+        # the i-th ground-truth box, eventually yielding feat_labels, feat_localizations, feat_scores.
        label = labels[i]
        bbox = bboxes[i]
        jaccard = jaccard_with_anchors(bbox)
        # Mask: check threshold + scores + no annotations + num_classes.
        # mask.shape: (gc, gc, n_gc), dtype bool
        mask = tf.greater(jaccard, feat_scores)
        # mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold))
        mask = tf.logical_and(mask, feat_scores > -0.5)
        mask = tf.logical_and(mask, label < num_classes)
+        # What are these two for??? The tf.int64 one can be read as x_ij^p from the paper; what is the tf.float32 one, fmask, doing?
+        # int mask, shape: (gc, gc, n_gc)
        imask = tf.cast(mask, tf.int64)
+        # float mask, shape: (gc, gc, n_gc)
        fmask = tf.cast(mask, dtype)
        # Update values using mask.
+        # Note that feat_labels and feat_scores are updated iteratively, once per ground-truth box!!!
        feat_labels = imask * label + (1 - imask) * feat_labels
+        # On the usage of tf.where, see https://blog.csdn.net/qq_19332527/article/details/78671280
        feat_scores = tf.where(mask, jaccard, feat_scores)

        feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin
        feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
        feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
@@ -142,11 +157,13 @@ def body(i, feat_labels, feat_scores,
               feat_ymin, feat_xmin,
               feat_ymax, feat_xmax])
    # Transform to center / size.
+    # Compute the center and size of the matched boxes.
    feat_cy = (feat_ymax + feat_ymin) / 2.
    feat_cx = (feat_xmax + feat_xmin) / 2.
    feat_h = feat_ymax - feat_ymin
    feat_w = feat_xmax - feat_xmin
    # Encode features.
+
    feat_cy = (feat_cy - yref) / href / prior_scaling[0]
    feat_cx = (feat_cx - xref) / wref / prior_scaling[1]
    feat_h = tf.log(feat_h / href) / prior_scaling[2]

From 390d18d0f061cd0b5ce9357945b5834947628927 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Mon, 7 May 2018 23:34:18 +0800
Subject: [PATCH 07/31] Update ssd_common.py

---
 nets/ssd_common.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/nets/ssd_common.py b/nets/ssd_common.py
index 779448d4..d89d4a60 100644
--- a/nets/ssd_common.py
+++ b/nets/ssd_common.py
@@ -134,6 +134,9 @@ def body(i, feat_labels, feat_scores,
        # On the usage of tf.where, see https://blog.csdn.net/qq_19332527/article/details/78671280
        feat_scores = tf.where(mask, jaccard, feat_scores)

+        # fx = t*b + (1-t)*fx
+        # The point of this form is that the fx (feat_**) on the right-hand side carries the previous state
+        # i.e. it represents
        feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin
        feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
        feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
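The fx = t*b + (1-t)*fx note above is the heart of the matching loop: each ground-truth box overwrites only the anchors where its IoU beats the stored score. A NumPy sketch of the idiom (toy values; names are mine):

    import numpy as np

    feat_scores = np.array([0.3, 0.6, 0.1])   # best IoU seen so far, per anchor
    feat_ymin   = np.array([0.0, 0.2, 0.4])   # ymin of the box that produced it
    jaccard     = np.array([0.5, 0.4, 0.7])   # IoU of the current ground-truth box
    new_ymin    = 0.9

    mask  = jaccard > feat_scores             # anchors the current box wins
    fmask = mask.astype(np.float64)
    feat_scores = np.where(mask, jaccard, feat_scores)
    feat_ymin   = fmask * new_ymin + (1. - fmask) * feat_ymin
    print(feat_scores, feat_ymin)             # [0.5 0.6 0.7] [0.9 0.2 0.9]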
From 16720f9c17f9ca5587e658f80a179281c394b2fd Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Tue, 8 May 2018 15:20:21 +0800
Subject: [PATCH 08/31] Update ssd_vgg_preprocessing.py

---
 preprocessing/ssd_vgg_preprocessing.py | 28 +++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/preprocessing/ssd_vgg_preprocessing.py b/preprocessing/ssd_vgg_preprocessing.py
index 413ad342..16d46a52 100644
--- a/preprocessing/ssd_vgg_preprocessing.py
+++ b/preprocessing/ssd_vgg_preprocessing.py
@@ -119,7 +119,12 @@ def apply_with_random_selector(x, func, num_cases):
            func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case)
            for case in range(num_cases)])[0]

+# The two operations below let the program derive many training samples from one training image;
+# with these perturbations the trained network can recognize objects at different sizes, positions, colors and so on.
+# The operations are distort_color and distorted_bounding_box_crop.
+# They effectively multiply the training data, which would otherwise really not be enough!

+# distort_color randomly adjusts the image colors. The order of adjusting brightness, contrast,
+# saturation and hue affects the final result, so several orderings are defined and one is chosen
+# at random, further reducing the influence of irrelevant factors on the model.
def distort_color(image, color_ordering=0, fast_mode=True, scope=None):
    """Distort the color of a Tensor image.
@@ -173,6 +178,9 @@ def distort_color(image, color_ordering=0, fast_mode=True, scope=None):
        return tf.clip_by_value(image, 0.0, 1.0)


+
+# Preprocess an image into network input data:
+# randomly crop the image, producing just one cropped_image.
def distorted_bounding_box_crop(image,
                                labels,
                                bboxes,
@@ -182,11 +190,14 @@ def distorted_bounding_box_crop(image,
                                max_attempts=200,
                                clip_bboxes=True,
                                scope=None):
-    """Generates cropped_image using a one of the bboxes randomly distorted.
+    """
+    # I.e. this generates cropped_image from one of the bboxes, randomly distorted.
+    Generates cropped_image using a one of the bboxes randomly distorted.

    See `tf.image.sample_distorted_bounding_box` for more documentation.

    Args:
+        # Looking at these arguments: the function picks one of the image's gt bboxes (read: ground-truth bbox) at random, distorts it, and returns cropped_image, the bboxes and so on.
        image: 3-D Tensor of image (it will be converted to floats in [0, 1]).
        bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
            where each coordinate is [0, 1) and the coordinates are arranged
            as [ymin, xmin, ymax, xmax].
        max_attempts: number of attempts at generating a cropped
            region of the image of the specified constraints. After `max_attempts`
            failures, return the entire image.
        scope: Optional scope for name_scope.
+    # Note the return value is a tuple, cropped_image and the distorted bbox; this mainly follows the implementation of tf.image.sample_distorted_bounding_box.
    Returns:
        A tuple, a 3-D Tensor cropped_image and the distorted bbox
    """
    with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bboxes]):
        # Each bounding box has shape [1, num_boxes, box coords] and
        # the coordinates are ordered [ymin, xmin, ymax, xmax].
+        # sample_distorted_bounding_box is used so that a random patch of the image can be taken, making recognition less sensitive to the size of the object of interest.
+        # For an explanation of tf.image.sample_distorted_bounding_box see https://blog.csdn.net/tz_zs/article/details/77920116
+        # Note the return types:
+        # begin: same type as image_size; a 1-D array containing [offset_height, offset_width, 0], used as input to tf.slice.
+        # size: same type as image_size; a 1-D array containing [target_height, target_width, -1], used as input to tf.slice.
+        # With begin and size we can tf.slice out the desired crop, while bboxes is mainly used for drawing the bbox on the image!!!
+        # Why does bboxes have shape [1, 1, 4]? Is it because tf.image.sample_distorted_bounding_box crops out only one bbox?
+        # bboxes: a [1, 1, 4] float32 tensor, the randomly distorted bounding box, used as input to tf.image.draw_bounding_boxes.
        bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box(
                tf.shape(image),
                bounding_boxes=tf.expand_dims(bboxes, 0),
        distort_bbox = distort_bbox[0, 0]

        # Crop the image to the specified bounding box.
+        # Note the begin and size arguments of tf.slice: begin[-1] = 0 and size[-1] = -1, as produced
+        # by tf.image.sample_distorted_bounding_box, so we can slice the desired patch out of the image!
        cropped_image = tf.slice(image, bbox_begin, bbox_size)
        # Restore the shape since the dynamic slice loses 3rd dimension.
        cropped_image.set_shape([None, None, 3])
        labels, bboxes = tfe.bboxes_filter_overlap(labels, bboxes,
                                                   threshold=BBOX_CROP_OVERLAP,
                                                   assign_negative=False)
+        # Note the returned cropped_image has shape [None, None, 3]. No need to worry: later, in
+        # preprocess_for_train, it is resized to the input size SSD expects!
        return cropped_image, labels, bboxes, distort_bbox


+# preprocess_for_train exists because training samples are scarce; these transforms augment the data used for training!
+# For details see "a complete TensorFlow image preprocessing example": https://blog.8hfq.com/?p=455
def preprocess_for_train(image, labels, bboxes,
                         out_shape, data_format='NHWC',
                         scope='ssd_preprocessing_train'):
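A hedged sketch of the crop mechanism this patch annotates: sample_distorted_bounding_box proposes a window and tf.slice cuts it out (TF1-style API; the zero image and the box are placeholders, and the range arguments are illustrative):

    import tensorflow as tf

    image = tf.zeros([480, 640, 3], dtype=tf.float32)
    boxes = tf.constant([[[0.1, 0.1, 0.9, 0.9]]])   # [1, num_boxes, 4], ymin/xmin/ymax/xmax

    begin, size, window = tf.image.sample_distorted_bounding_box(
            tf.shape(image),
            bounding_boxes=boxes,
            min_object_covered=0.25,
            aspect_ratio_range=(0.6, 1.67),
            area_range=(0.1, 1.0),
            max_attempts=200)
    cropped = tf.slice(image, begin, size)          # dynamic shape
    cropped.set_shape([None, None, 3])              # restore the static rank info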
From a07b96c23e0486608783dcf0979cfdf66627805d Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Tue, 8 May 2018 15:44:08 +0800
Subject: [PATCH 09/31] Update ssd_vgg_preprocessing.py

---
 preprocessing/ssd_vgg_preprocessing.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/preprocessing/ssd_vgg_preprocessing.py b/preprocessing/ssd_vgg_preprocessing.py
index 16d46a52..43adcc4b 100644
--- a/preprocessing/ssd_vgg_preprocessing.py
+++ b/preprocessing/ssd_vgg_preprocessing.py
@@ -258,6 +258,7 @@ def distorted_bounding_box_crop(image,

# preprocess_for_train exists because training samples are scarce; these transforms augment the data used for training!
# For details see "a complete TensorFlow image preprocessing example": https://blog.8hfq.com/?p=455
+# Preprocessing for the training set!
def preprocess_for_train(image, labels, bboxes,
                         out_shape, data_format='NHWC',
                         scope='ssd_preprocessing_train'):
@@ -265,7 +266,7 @@ def preprocess_for_train(image, labels, bboxes,

    Note that the actual resizing scale is sampled from
        [`resize_size_min`, `resize_size_max`].
-
+    # Note: the parameters documented below do not match the actual signature above, so just follow the real arguments in the code!
    Args:
        image: A `Tensor` representing an image of arbitrary size.
        output_height: The height of the image after preprocessing.
@@ -299,15 +300,19 @@ def preprocess_for_train(image, labels, bboxes,
                                        min_object_covered=MIN_OBJECT_COVERED,
                                        aspect_ratio_range=CROP_RATIO_RANGE)
        # Resize image to output size.
+        # distorted_bounding_box_crop set_shape'd the returned image to [None, None, 3]; here it must
+        # be brought to the input size the network needs, so everything is resized to out_shape!!!
        dst_image = tf_image.resize_image(dst_image, out_shape,
                                          method=tf.image.ResizeMethod.BILINEAR,
                                          align_corners=False)
        tf_summary_image(dst_image, bboxes, 'image_shape_distorted')

        # Randomly flip the image horizontally.
+        # Random left-right flip of the image.
        dst_image, bboxes = tf_image.random_flip_left_right(dst_image, bboxes)

        # Randomly distort the colors. There are 4 ways to do it.
+        # Adjust the image colors in one randomly chosen ordering!!!
        dst_image = apply_with_random_selector(
                dst_image,
                lambda x, ordering: distort_color(x, ordering, fast_mode),
@@ -315,14 +320,16 @@ def preprocess_for_train(image, labels, bboxes,
        tf_summary_image(dst_image, bboxes, 'image_color_distorted')

        # Rescale to VGG input scale.
+        # Note dst_image is in [0, 1.0]; scale it back to [0, 255.0] as the input of the VGG network!
        image = dst_image * 255.
+        # Whiten the image (subtract the per-channel means)!
        image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])
        # Image data format.
        if data_format == 'NCHW':
            image = tf.transpose(image, perm=(2, 0, 1))
        return image, labels, bboxes

-
+# Preprocessing for the validation set!
def preprocess_for_eval(image, labels, bboxes,
                        out_shape=EVAL_SIZE, data_format='NHWC',
                        difficults=None, resize=Resize.WARP_RESIZE,

From ca53926976e27ea77ef13c3c2fee46402c43eee2 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Tue, 8 May 2018 16:26:30 +0800
Subject: [PATCH 10/31] Update ssd_vgg_preprocessing.py

---
 preprocessing/ssd_vgg_preprocessing.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/preprocessing/ssd_vgg_preprocessing.py b/preprocessing/ssd_vgg_preprocessing.py
index 43adcc4b..b74d5d6c 100644
--- a/preprocessing/ssd_vgg_preprocessing.py
+++ b/preprocessing/ssd_vgg_preprocessing.py
@@ -329,7 +329,8 @@ def preprocess_for_train(image, labels, bboxes,
            image = tf.transpose(image, perm=(2, 0, 1))
        return image, labels, bboxes

-# Preprocessing for the validation set!
+# Preprocessing for the validation set! Note image may be of any size, but it is a single image only,
+# so tf_image.resize_image_bboxes_with_crop_or_pad also takes just one image, of shape [None, None, 3].
def preprocess_for_eval(image, labels, bboxes,
                        out_shape=EVAL_SIZE, data_format='NHWC',
                        difficults=None, resize=Resize.WARP_RESIZE,
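The "rescale to VGG input scale" and whitening steps annotated above are simple per-channel arithmetic; a NumPy sketch (the channel means are assumed to be this repository's _R_MEAN/_G_MEAN/_B_MEAN = 123., 117., 104.):

    import numpy as np

    R_MEAN, G_MEAN, B_MEAN = 123., 117., 104.   # assumed VGG channel means

    dst_image = np.random.rand(300, 300, 3)     # color-distorted image in [0, 1]
    image = dst_image * 255.                    # back to [0, 255] for the VGG input
    image = image - np.array([R_MEAN, G_MEAN, B_MEAN])   # tf_image_whitened equivalent
    print(image.mean(axis=(0, 1)))              # channels are now roughly centered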
From aee5be0940db86add020c1279b801d76b404f55d Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Tue, 8 May 2018 17:25:49 +0800
Subject: [PATCH 11/31] Update tf_image.py

---
 preprocessing/tf_image.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/preprocessing/tf_image.py b/preprocessing/tf_image.py
index a9626266..321679bd 100644
--- a/preprocessing/tf_image.py
+++ b/preprocessing/tf_image.py
@@ -163,6 +163,7 @@ def bboxes_crop_or_pad(bboxes,
        return bboxes


+# Center-crop or pad the image to shape [target_height, target_width, channels].
def resize_image_bboxes_with_crop_or_pad(image, bboxes,
                                         target_height, target_width):
    """Crops and/or pads an image to a target width and height.
@@ -231,8 +232,11 @@ def equal_(x, y):
    offset_pad_height = max_(height_diff // 2, 0)

    # Maybe crop if needed.
+    # The input image is larger than target_height/target_width, so crop it.
    height_crop = min_(target_height, height)
    width_crop = min_(target_width, width)
+    # In tf.image.crop_to_bounding_box, offset_crop_height/offset_crop_width are offsets from the top-left corner,
+    # and height_crop/width_crop are the height and width to keep; crop_to_bounding_box takes height then width, hence this form!!!
    cropped = tf.image.crop_to_bounding_box(image, offset_crop_height, offset_crop_width,
                                            height_crop, width_crop)
    bboxes = bboxes_crop_or_pad(bboxes,
@@ -240,6 +244,11 @@ def equal_(x, y):
                                -offset_crop_height, -offset_crop_width,
                                height_crop, width_crop)
    # Maybe pad if needed.
+    # If the input image's width/height are smaller than the targets, pad instead:
+    '''Adds `offset_height` rows of zeros on top, `offset_width` columns of
+    zeros on the left, and then pads the image on the bottom and right
+    with zeros until it has dimensions `target_height`, `target_width`.'''
+
    resized = tf.image.pad_to_bounding_box(cropped, offset_pad_height, offset_pad_width,
                                           target_height, target_width)
    bboxes = bboxes_crop_or_pad(bboxes,

From 7ce5cbed68ee455db883d2b06c1aa713b2b97211 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Tue, 8 May 2018 21:26:09 +0800
Subject: [PATCH 12/31] Update vgg_preprocessing.py

---
 preprocessing/vgg_preprocessing.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/preprocessing/vgg_preprocessing.py b/preprocessing/vgg_preprocessing.py
index a2d0f864..f7aa98e0 100644
--- a/preprocessing/vgg_preprocessing.py
+++ b/preprocessing/vgg_preprocessing.py
@@ -226,7 +226,8 @@ def _mean_image_subtraction(image, means):
    num_channels = image.get_shape().as_list()[-1]
    if len(means) != num_channels:
        raise ValueError('len(means) must match the number of channels')
-
+    # tf.split splits a tensor into several tensors along the given dimension; the pieces need not be equal:
+    # if num_channels is an int the pieces are equal (e.g. [1,1,1,1] over a size-4 axis is equivalent to num_channels=4), while a list like [1,2,1] gives unequal pieces.
    channels = tf.split(2, num_channels, image)
    for i in range(num_channels):
        channels[i] -= means[i]

From 0b3e7286be7eb8c31cf978748adc2e2bd253618d Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Tue, 8 May 2018 21:27:06 +0800
Subject: [PATCH 13/31] Update vgg_preprocessing.py
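The tf.split note in patch 12 is easy to verify with NumPy's equivalent. Note the code above uses the legacy TF argument order, tf.split(split_dim, num_split, value); modern TF is tf.split(value, num_or_size_splits, axis):

    import numpy as np

    image = np.zeros((224, 224, 3))
    channels = np.split(image, 3, axis=2)          # three (224, 224, 1) pieces
    print([c.shape for c in channels])

    # Unequal pieces, as described above, via explicit split boundaries:
    a, b, c = np.split(np.zeros((224, 224, 4)), [1, 3], axis=2)
    print(a.shape, b.shape, c.shape)               # (224,224,1) (224,224,2) (224,224,1)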
From fdfd27f5b9b81e334dde993e75676f42a151315c Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Wed, 9 May 2018 17:37:38 +0800
Subject: [PATCH 14/31] Update ssd_common.py

---
 nets/ssd_common.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/nets/ssd_common.py b/nets/ssd_common.py
index d89d4a60..d148992f 100644
--- a/nets/ssd_common.py
+++ b/nets/ssd_common.py
@@ -272,6 +272,7 @@ def tf_ssd_bboxes_decode(feat_localizations,
# =========================================================================== #
# SSD boxes selection.
# =========================================================================== #
+# For each layer's predictions_layer and localizations_layer, select the candidate boxes and their location information!
def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer,
                               select_threshold=None,
                               num_classes=21,
@@ -294,9 +295,11 @@ def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer,
                       [predictions_layer, localizations_layer]):
        # Reshape features: Batches x N x N_labels | 4
        p_shape = tfe.get_shape(predictions_layer)
+        # After the reshape, predictions_layer has shape (batch, n*n, num_classes).
        predictions_layer = tf.reshape(predictions_layer,
                                       tf.stack([p_shape[0], -1, p_shape[-1]]))
        l_shape = tfe.get_shape(localizations_layer)
+        # After the reshape, localizations_layer has shape (batch, n*n, 4).
        localizations_layer = tf.reshape(localizations_layer,
                                         tf.stack([l_shape[0], -1, l_shape[-1]]))

@@ -305,9 +308,13 @@ def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer,
        for c in range(0, num_classes):
            if c != ignore_class:
                # Remove boxes under the threshold.
+                # Take the scores of each predicted class; scores has shape (batch, n, n) and predictions_layer has shape (batch, n, n, num_classes).
                scores = predictions_layer[:, :, c]
+                # Decide from the score whether each bbox is kept; candidate boxes below select_threshold are all dropped.
                fmask = tf.cast(tf.greater_equal(scores, select_threshold), scores.dtype)
+                # Scores below select_threshold are zeroed,
                scores = scores * fmask
+                # and the bboxes below select_threshold are zeroed too.
                bboxes = localizations_layer * tf.expand_dims(fmask, axis=-1)
                # Append to dictionary.
                d_scores[c] = scores

From 265d51bcd101de3b326775e22fd265329c3b451a Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Wed, 9 May 2018 19:35:40 +0800
Subject: [PATCH 15/31] Update ssd_common.py

---
 nets/ssd_common.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/nets/ssd_common.py b/nets/ssd_common.py
index d148992f..fc57405e 100644
--- a/nets/ssd_common.py
+++ b/nets/ssd_common.py
@@ -273,6 +273,9 @@ def tf_ssd_bboxes_decode(feat_localizations,
# SSD boxes selection.
# =========================================================================== #
# For each layer's predictions_layer and localizations_layer, select the candidate boxes and their location information!
+# Note the incoming predictions_layer and localizations_layer tensors are 5-D here;
+# e.g. for conv4 predictions_layer has shape (batch, 38, 38, 4, 21) and localizations_layer (batch, 38, 38, 4, 4).
+# Don't worry too much about the rank: everything is reshaped inside anyway!
def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer,
                               select_threshold=None,
                               num_classes=21,
@@ -295,11 +298,11 @@ def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer,
                       [predictions_layer, localizations_layer]):
        # Reshape features: Batches x N x N_labels | 4
        p_shape = tfe.get_shape(predictions_layer)
-        # After the reshape, predictions_layer has shape (batch, n*n, num_classes).
+        # After the reshape, predictions_layer has shape (batch, n*n*num_layer_anchors, num_classes).
        predictions_layer = tf.reshape(predictions_layer,
                                       tf.stack([p_shape[0], -1, p_shape[-1]]))
        l_shape = tfe.get_shape(localizations_layer)
-        # After the reshape, localizations_layer has shape (batch, n*n, 4).
+        # After the reshape, localizations_layer has shape (batch, n*n*num_layer_anchors, 4).
        localizations_layer = tf.reshape(localizations_layer,
                                         tf.stack([l_shape[0], -1, l_shape[-1]]))

@@ -308,7 +311,7 @@ def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer,
        for c in range(0, num_classes):
            if c != ignore_class:
                # Remove boxes under the threshold.
-                # Take the scores of each predicted class; scores has shape (batch, n, n) and predictions_layer has shape (batch, n, n, num_classes).
+                # Take the scores of each predicted class; scores has shape (batch, n*n*num_layer_anchors) and predictions_layer has shape (batch, n*n*num_layer_anchors, num_classes).
                scores = predictions_layer[:, :, c]
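A NumPy sketch of the thresholded selection the two patches above describe (toy scores; the threshold value is assumed):

    import numpy as np

    select_threshold = 0.5
    scores = np.array([[0.9, 0.2, 0.6, 0.4]])        # (batch=1, anchors) for one class
    boxes  = np.random.rand(1, 4, 4)                 # (batch, anchors, 4)

    fmask  = (scores >= select_threshold).astype(scores.dtype)
    scores = scores * fmask                          # [[0.9, 0. , 0.6, 0. ]]
    boxes  = boxes * fmask[..., np.newaxis]          # losing boxes are zeroed, not removed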
From e2d1f5638e60e1b591ec450da25afc06ae54089d Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Thu, 10 May 2018 14:37:24 +0800
Subject: [PATCH 16/31] Update train_ssd_network.py

---
 train_ssd_network.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/train_ssd_network.py b/train_ssd_network.py
index 9773d156..5438a789 100644
--- a/train_ssd_network.py
+++ b/train_ssd_network.py
@@ -267,10 +267,23 @@ def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            # Dequeue batch.
+
+            '''
+            The core of SSD training is right here:
+            1) encode the real labels, i.e. build y_label: gclasses, glocalisations, gscores = \
+                ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors)
+            2) b_image, b_gclasses, b_glocalisations, b_gscores = \
+                tf_utils.reshape_list(batch_queue.dequeue(), batch_shape)
+            3) get the outputs, i.e. get y_pred: predictions, localisations, logits, end_points = \
+                ssd_net.net(b_image, is_training=True)
+            4) compute the losses: ssd_net.losses(logits, localisations,
+                b_gclasses, b_glocalisations, b_gscores, ...)
+            '''
            b_image, b_gclasses, b_glocalisations, b_gscores = \
                tf_utils.reshape_list(batch_queue.dequeue(), batch_shape)

            # Construct SSD network.
+
            arg_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay,
                                          data_format=DATA_FORMAT)
            with slim.arg_scope(arg_scope):

From 5c5eee60a20964302107a9f3d3e5083718327918 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Thu, 10 May 2018 15:55:34 +0800
Subject: [PATCH 17/31] Update ssd_common.py

---
 nets/ssd_common.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nets/ssd_common.py b/nets/ssd_common.py
index fc57405e..31a60430 100644
--- a/nets/ssd_common.py
+++ b/nets/ssd_common.py
@@ -132,11 +132,13 @@ def body(i, feat_labels, feat_scores,
        # Note that feat_labels and feat_scores are updated iteratively, once per ground-truth box!!!
        feat_labels = imask * label + (1 - imask) * feat_labels
        # On the usage of tf.where, see https://blog.csdn.net/qq_19332527/article/details/78671280
+        # When x and y are not given, the output is driven by condition alone: tf.where returns the coordinates of the True entries!
+        # When x and y are not None, we choose x's values or y's values to output according to condition:
+        # where condition is True we take x's value, else y's value!
        feat_scores = tf.where(mask, jaccard, feat_scores)

        # fx = t*b + (1-t)*fx
        # The point of this form is that the fx (feat_**) on the right-hand side carries the previous state
-        # i.e. it represents
        feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin
        feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
        feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
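The tf.where note in the patch above, verified with NumPy's identical semantics (toy values):

    import numpy as np

    mask        = np.array([True, False, True])
    jaccard     = np.array([0.7, 0.2, 0.9])
    feat_scores = np.array([0.5, 0.6, 0.1])

    # Three-argument form: pick jaccard where mask is True, else keep feat_scores.
    print(np.where(mask, jaccard, feat_scores))    # [0.7 0.6 0.9]

    # One-argument form: the coordinates of the True entries.
    print(np.where(mask))                          # (array([0, 2]),)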
From 94748134a4ffe8863e7f9b1db26e061d4e6b4d14 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Thu, 10 May 2018 16:37:00 +0800
Subject: [PATCH 18/31] Update ssd_vgg_300.py

---
 nets/ssd_vgg_300.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/nets/ssd_vgg_300.py b/nets/ssd_vgg_300.py
index c867b004..0773e5ea 100644
--- a/nets/ssd_vgg_300.py
+++ b/nets/ssd_vgg_300.py
@@ -594,7 +594,6 @@ def ssd_arg_scope_caffe(caffe_scope):
# =========================================================================== #
# SSD loss function.
# =========================================================================== #
-# I have not fully understood this block yet; will update the notes once I do!!!
def ssd_losses(logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
@@ -666,12 +665,15 @@ def ssd_losses(logits, localisations,
        with tf.name_scope('cross_entropy_pos'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=gclasses)
+            # With the positive and negative samples determined, the losses follow; note loss * fpmask computes the positive-sample loss!!!
            loss = tf.div(tf.reduce_sum(loss * fpmask), batch_size, name='value')
+
            tf.losses.add_loss(loss)

        with tf.name_scope('cross_entropy_neg'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=no_classes)
+            # And loss * fnmask computes the negative-sample loss!!!
            loss = tf.div(tf.reduce_sum(loss * fnmask), batch_size, name='value')
            tf.losses.add_loss(loss)

@@ -683,7 +685,8 @@ def ssd_losses(logits, localisations,
            loss = tf.div(tf.reduce_sum(loss * weights), batch_size, name='value')
            tf.losses.add_loss(loss)

-
+# This function is easier to follow: positive and negative samples are picked by IoU and then used to train the SSD network. As for the loss:
+# the classification loss has two parts, the positive-sample loss and the negative-sample loss; a positive:negative ratio of 1:3 gives the best results, reflected here through tf.losses.compute_weighted_loss.
def ssd_losses_old(logits, localisations,
                   gclasses, glocalisations, gscores,
                   match_threshold=0.5,

From b9e29d644152d870d526fd8681cdcd71b949771d Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Thu, 10 May 2018 16:57:59 +0800
Subject: [PATCH 19/31] Update ssd_common.py

---
 nets/ssd_common.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/nets/ssd_common.py b/nets/ssd_common.py
index 31a60430..9999bbf9 100644
--- a/nets/ssd_common.py
+++ b/nets/ssd_common.py
@@ -18,6 +18,9 @@
import tensorflow as tf
import tf_extended as tfe

+# Note the difference between tf_ssd_bboxes_encode and tf_ssd_bboxes_select:
+# the former intersects the anchors with the actual ground truth and encodes each anchor's classes, bboxes and scores, for training;
+# the latter picks the credible predictions out of a trained model's outputs (predictions, localizations), for visualizing the results.

# =========================================================================== #
# TensorFlow implementation of boxes SSD encoding / decoding.
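A rough NumPy sketch of the positive/negative bookkeeping described above (a simplification of ssd_losses, not the exact implementation; the 3:1 negative:positive cap mirrors the paper's hard negative mining, and the thresholds are toy values):

    import numpy as np

    negative_ratio = 3.
    gscores = np.array([0.9, 0.1, 0.7, 0.05, 0.3, 0.2])   # matched IoU per anchor

    pmask = gscores > 0.5                                 # positives
    fpmask = pmask.astype(np.float32)
    n_positives = fpmask.sum()

    nmask = np.logical_not(pmask)                         # negative candidates
    max_negatives = int(negative_ratio * n_positives)     # keep at most 3x positives;
    # in practice the hardest negatives (highest background confusion) are kept.
    print(n_positives, max_negatives)                     # 2.0 6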
From 9e7d9a601ab79fb2565eb7f13d7a34e73ff346bf Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Mon, 6 Aug 2018 23:25:52 +0800
Subject: [PATCH 20/31] Create eval_video.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For running detection on video.
---
 notebooks/eval_video.py | 138 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 notebooks/eval_video.py

diff --git a/notebooks/eval_video.py b/notebooks/eval_video.py
new file mode 100644
index 00000000..eba777db
--- /dev/null
+++ b/notebooks/eval_video.py
@@ -0,0 +1,138 @@
#coding=utf-8

import os
import math
import random

import numpy as np
import tensorflow as tf
import cv2

slim = tf.contrib.slim

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import sys
sys.path.append('../')

from nets import ssd_vgg_300, ssd_common, np_methods
from preprocessing import ssd_vgg_preprocessing
#from notebooks import visualization
import visualization
# TensorFlow session: grow memory when needed. TF, DO NOT USE ALL MY GPU MEMORY!!!
gpu_options = tf.GPUOptions(allow_growth=True)
config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options)
isess = tf.InteractiveSession(config=config)
VOC_LABELS = {
    0:'none',
    1:'aeroplane',
    2:'bicycle',
    3:'bird',
    4:'boat',
    5:'bottle',
    6:'bus',
    7:'car',
    8:'cat',
    9:'chair',
    10:'cow',
    11:'diningtable',
    12:'dog',
    13:'horse',
    14:'motorbike',
    15:'person',
    16:'pottedplant',
    17:'sheep',
    18:'sofa',
    19:'train',
    20:'tvmonitor',
}
# Input placeholder.
net_shape = (300, 300)
data_format = 'NHWC'
img_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
# Evaluation pre-processing: resize to SSD net shape.
image_pre, labels_pre, bboxes_pre, bbox_img = ssd_vgg_preprocessing.preprocess_for_eval(
    img_input, None, None, net_shape, data_format, resize=ssd_vgg_preprocessing.Resize.WARP_RESIZE)
image_4d = tf.expand_dims(image_pre, 0)

# Define the SSD model.
reuse = True if 'ssd_net' in locals() else None
ssd_net = ssd_vgg_300.SSDNet()
with slim.arg_scope(ssd_net.arg_scope(data_format=data_format)):
    predictions, localisations, _, _ = ssd_net.net(image_4d, is_training=False, reuse=reuse)

# Restore SSD model.
ckpt_filename = '../logs/model.ckpt-20154'  # change to your model path
#ckpt_filename = 'checkpoints/ssd_300_vgg.ckpt'
isess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(isess, ckpt_filename)

# SSD default anchor boxes.
ssd_anchors = ssd_net.anchors(net_shape)

# Main image processing routine.
def process_image(img, select_threshold=0.5, nms_threshold=.45, net_shape=(300, 300)):
    # Run SSD network.
    rimg, rpredictions, rlocalisations, rbbox_img = isess.run([image_4d, predictions, localisations, bbox_img],
                                                              feed_dict={img_input: img})

    # Get classes and bboxes from the net outputs.
    rclasses, rscores, rbboxes = np_methods.ssd_bboxes_select(
            rpredictions, rlocalisations, ssd_anchors,
            select_threshold=select_threshold, img_shape=net_shape, num_classes=21, decode=True)

    rbboxes = np_methods.bboxes_clip(rbbox_img, rbboxes)
    rclasses, rscores, rbboxes = np_methods.bboxes_sort(rclasses, rscores, rbboxes, top_k=400)
    rclasses, rscores, rbboxes = np_methods.bboxes_nms(rclasses, rscores, rbboxes, nms_threshold=nms_threshold)
    # Resize bboxes to original image shape. Note: useless for Resize.WARP!
    rbboxes = np_methods.bboxes_resize(rbbox_img, rbboxes)
    return rclasses, rscores, rbboxes

def bboxes_draw_on_img(img, classes, scores, bboxes, color=[255, 0, 0], thickness=2):
    shape = img.shape
    for i in range(bboxes.shape[0]):
        bbox = bboxes[i]
        #color = colors[classes[i]]
        # Draw bounding box...
        p1 = (int(bbox[0] * shape[0]), int(bbox[1] * shape[1]))
        p2 = (int(bbox[2] * shape[0]), int(bbox[3] * shape[1]))
        cv2.rectangle(img, p1[::-1], p2[::-1], color, thickness)
        # Draw text...
        s = '%s/%.3f' % (VOC_LABELS[int(classes[i])], scores[i])
        p1 = (p1[0]-5, p1[1])
        cv2.putText(img, s, p1[::-1], cv2.FONT_HERSHEY_DUPLEX, 0.4, color, 1)

cap = cv2.VideoCapture("/media/hp/CCSA_X64FRE/2.MP4")  # change to your video path
#cap = cv2.VideoCapture(0)

# Define the codec and create VideoWriter object
#fourcc = cv2.cv.FOURCC(*'XVID')
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter('/media/hp/CCSA_X64FRE/2_handle.MP4', fourcc, 20, (1280, 720))

num = 0

while cap.isOpened():
    # get a frame
    rval, frame = cap.read()
    # save a frame
    if rval == True:
        # frame = cv2.flip(frame,0)
        rclasses, rscores, rbboxes = process_image(frame)
        bboxes_draw_on_img(frame, rclasses, rscores, rbboxes)
        print(rclasses)
        out.write(frame)
        num = num + 1
        print(num)
    else:
        break
    # show a frame
    cv2.imshow("capture", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
cap.release()
out.release()
cv2.destroyAllWindows()
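process_image above ends with bboxes_nms; a minimal sketch of greedy non-maximum suppression in NumPy (a simplification of np_methods.bboxes_nms: scores are assumed already sorted descending, as after bboxes_sort, and the helper name is mine):

    import numpy as np

    def nms(scores, boxes, threshold=0.45):
        """Greedy NMS over boxes [ymin, xmin, ymax, xmax]; scores sorted descending."""
        keep = np.ones(len(scores), dtype=bool)
        areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
        for i in range(len(scores)):
            if not keep[i]:
                continue
            iy = np.maximum(0., np.minimum(boxes[i, 2], boxes[:, 2]) - np.maximum(boxes[i, 0], boxes[:, 0]))
            ix = np.maximum(0., np.minimum(boxes[i, 3], boxes[:, 3]) - np.maximum(boxes[i, 1], boxes[:, 1]))
            inter = iy * ix
            iou = inter / (areas[i] + areas - inter)
            suppress = (iou > threshold) & (np.arange(len(scores)) > i)
            keep[suppress] = False    # drop lower-scored boxes that overlap too much
        return keep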
From 591afa33b72510da4a7f1dfd39f98fe08f977d34 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Mon, 6 Aug 2018 23:26:38 +0800
Subject: [PATCH 21/31] Create ssd_notebook.py

---
 notebooks/ssd_notebook.py | 130 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 notebooks/ssd_notebook.py

diff --git a/notebooks/ssd_notebook.py b/notebooks/ssd_notebook.py
new file mode 100644
index 00000000..48ba6390
--- /dev/null
+++ b/notebooks/ssd_notebook.py
@@ -0,0 +1,130 @@

# coding: utf-8

# In[28]:


import os
import math
import random

import numpy as np
import tensorflow as tf
import cv2

slim = tf.contrib.slim


# In[29]:


import matplotlib.pyplot as plt
import matplotlib.image as mpimg


# In[30]:


import sys
sys.path.append('../')


# In[31]:


from nets import ssd_vgg_300, ssd_common, np_methods, ssd_vgg_512
from preprocessing import ssd_vgg_preprocessing
import visualization


# In[32]:


# TensorFlow session: grow memory when needed. TF, DO NOT USE ALL MY GPU MEMORY!!!
gpu_options = tf.GPUOptions(allow_growth=True)
config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options)
isess = tf.InteractiveSession(config=config)


# ## SSD 300 Model
#
# The SSD 300 network takes 300x300 image inputs. In order to feed any image, the latter is resized to this input shape (i.e. `Resize.WARP_RESIZE`). Note that even though it may change the ratio width / height, the SSD model performs well on resized images (and it is the default behaviour in the original Caffe implementation).
#
# SSD anchors correspond to the default bounding boxes encoded in the network. The SSD net output provides offset on the coordinates and dimensions of these anchors.

# In[33]:


# Input placeholder.
net_shape = (300, 300)
data_format = 'NHWC'
img_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
# Evaluation pre-processing: resize to SSD net shape.
image_pre, labels_pre, bboxes_pre, bbox_img = ssd_vgg_preprocessing.preprocess_for_eval(
    img_input, None, None, net_shape, data_format, resize=ssd_vgg_preprocessing.Resize.WARP_RESIZE)
image_4d = tf.expand_dims(image_pre, 0)

# Define the SSD model.
reuse = True if 'ssd_net' in locals() else None
ssd_net = ssd_vgg_300.SSDNet()
with slim.arg_scope(ssd_net.arg_scope(data_format=data_format)):
    predictions, localisations, _, _ = ssd_net.net(image_4d, is_training=False, reuse=reuse)

# Restore SSD model.
##ckpt_filename = '../checkpoints_all/VGG_VOC0712_SSD_512x512_ft_iter_120000.ckpt/VGG_VOC0712_SSD_512x512_ft_iter_120000.ckpt'
#ckpt_filename="../checkpoints_all/VGG_VOC0712_SSD_300x300_iter_120000.ckpt/VGG_VOC0712_SSD_300x300_iter_120000.ckpt"
ckpt_filename="../logs/model.ckpt-20154"
isess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(isess, ckpt_filename)

# SSD default anchor boxes.
ssd_anchors = ssd_net.anchors(net_shape)


# ## Post-processing pipeline
#
# The SSD outputs need to be post-processed to provide proper detections. Namely, we follow these common steps:
#
# * Select boxes above a classification threshold;
# * Clip boxes to the image shape;
# * Apply the Non-Maximum-Selection algorithm: fuse together boxes whose Jaccard score > threshold;
# * If necessary, resize bounding boxes to original image shape.

# In[34]:


# Main image processing routine.
def process_image(img, select_threshold=0.5, nms_threshold=.45, net_shape=(300, 300)):
    # Run SSD network.
    rimg, rpredictions, rlocalisations, rbbox_img = isess.run([image_4d, predictions, localisations, bbox_img],
                                                              feed_dict={img_input: img})

    # Get classes and bboxes from the net outputs.
    rclasses, rscores, rbboxes = np_methods.ssd_bboxes_select(
            rpredictions, rlocalisations, ssd_anchors,
            select_threshold=select_threshold, img_shape=net_shape, num_classes=21, decode=True)

    rbboxes = np_methods.bboxes_clip(rbbox_img, rbboxes)
    rclasses, rscores, rbboxes = np_methods.bboxes_sort(rclasses, rscores, rbboxes, top_k=400)
    rclasses, rscores, rbboxes = np_methods.bboxes_nms(rclasses, rscores, rbboxes, nms_threshold=nms_threshold)
    # Resize bboxes to original image shape. Note: useless for Resize.WARP!
    rbboxes = np_methods.bboxes_resize(rbbox_img, rbboxes)
    return rclasses, rscores, rbboxes


# In[21]:


# Test on some demo image and visualize output.
path = '../demo/'
image_names = sorted(os.listdir(path))

#img = mpimg.imread(path + image_names[-5])
cur_path = "/home/hp/Pictures/1.jpg"
path = "/home/hp/Pictures/1_1.jpg"
img = mpimg.imread(cur_path)
rclasses, rscores, rbboxes = process_image(img)

visualization.bboxes_draw_on_img(img, rclasses, rscores, rbboxes, visualization.colors_plasma, path)
#visualization.plt_bboxes(img, rclasses, rscores, rbboxes)
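The label converter in the next patch turns center/width/height labels into the corner coordinates VOC XML expects; the arithmetic in isolation (toy box):

    cx, cy, w, h = 100., 80., 40., 20.
    xmin, ymin = cx - w / 2., cy - h / 2.
    xmax, ymax = cx + w / 2., cy + h / 2.
    print(xmin, ymin, xmax, ymax)   # 80.0 70.0 120.0 90.0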
From e348481b2861aeebc3e45bd43ba78ffb7f4465b9 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Tue, 7 Aug 2018 10:07:53 +0800
Subject: [PATCH 22/31] Create label2xml.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Converts the original label.txt files into the XML format used by PASCAL VOC 2007.
---
 notebooks/label2xml.py | 97 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 notebooks/label2xml.py

diff --git a/notebooks/label2xml.py b/notebooks/label2xml.py
new file mode 100644
index 00000000..1f045a06
--- /dev/null
+++ b/notebooks/label2xml.py
@@ -0,0 +1,97 @@
#coding=utf-8
import glob
import os

s1="""<object>
        <name>{0}</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>{1}</xmin>
            <ymin>{2}</ymin>
            <xmax>{3}</xmax>
            <ymax>{4}</ymax>
        </bndbox>
    </object>"""

s2="""<annotation>
    <folder>VOC2007</folder>
    <filename>{0}</filename>
    <source>
        <database>My Database</database>
        <annotation>VOC2007</annotation>
        <image>flickr</image>
        <flickrid>NULL</flickrid>
    </source>
    <owner>
        <flickrid>NULL</flickrid>
        <name>J</name>
    </owner>
    <size>
        <width>512</width>
        <height>512</height>
        <depth>3</depth>
    </size>
    <segmented>0</segmented>
    <object>
        <name>{1}</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <bndbox>
            <xmin>{2}</xmin>
            <ymin>{3}</ymin>
            <xmax>{4}</xmax>
            <ymax>{5}</ymax>
        </bndbox>
    </object>{6}
</annotation>"""
# dict_={0:"micai_jianzhu",1:"yiban_jianzhu",2:"micai_youguan",3:"yiban_youguan",4:"micai_leida",5:"yiban_leida"}
# dict_={1:"car_1",2:"car_2",3:"car_3",4:"car_4"}
dict_={0:"camouflage_car",1:"non_camouflage_car"}
def convert2xml(label_dir,dst_dir_xml):
    if os.path.exists(dst_dir_xml)==False:
        os.mkdir(dst_dir_xml)
    textlist=glob.glob(os.path.join(label_dir,'*.txt'))
    # print(len(textlist))
    for text_ in textlist:
        flabel = open(text_, 'r')
        lb = flabel.readlines()
        flabel.close()
        lb=[line.strip() for line in lb]
        ob2 = ""
        x1=lb[0].split(' ')[0]
        x1=dict_[int(x1)]
        # If the labels give a center point plus width/height, convert to top-left and bottom-right corners; otherwise leave them unchanged.
        x3=lb[0].split(" ")[1:]
        x3=[int(float(x3[0])-float(x3[2])/2.0),int(float(x3[1])-float(x3[3])/2.0),int(float(x3[0])+float(x3[2])/2.0),int(float(x3[1])+float(x3[3])/2.0)]

        if len(lb)>1:  # extra annotations
            for i in range(1,len(lb)):
                cls_id=lb[i].split(' ')[0]
                cls_id=dict_[int(cls_id)]
                x3_tmp=lb[i].split(' ')[1:]
                x3_tmp=[int(float(x3_tmp[0])-float(x3_tmp[2])/2.0),int(float(x3_tmp[1])-float(x3_tmp[3])/2.0),int(float(x3_tmp[0])+float(x3_tmp[2])/2.0),int(float(x3_tmp[1])+float(x3_tmp[3])/2.0)]
                ob2+='\n' + s1.format(cls_id,x3_tmp[0],x3_tmp[1],x3_tmp[2],x3_tmp[3])
        # imgname=text_.split("/")[-1].split(".")[0]+'.jpg'
        # savename=os.path.join(dst_dir_xml,text_.split("/")[-1].split(".")[0]+'.xml')
        tmp_name=text_.split("/")[-1].split(".")
        pre_name=""
        for i in range(len(tmp_name)-1):
            pre_name=pre_name+tmp_name[i]+"."
        # print(pre_name)
        imgname = pre_name+'jpg'
        savename = os.path.join(dst_dir_xml,pre_name+'xml')
        f = open(savename, 'w')
        ob1=s2.format(imgname, x1, x3[0],x3[1],x3[2],x3[3], ob2)
        f.write(ob1)
        f.close()

if __name__=="__main__":
    # label_dir="/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/labels_std"
    # dst_dir_xml="/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/Annotations"
    label_dir="/home/hp/Data/car_train_data/labels_initial_csv"
    dst_dir_xml="/home/hp/Data/car_train_data/VOC2007/Annotations"
    convert2xml(label_dir,dst_dir_xml)

From 7b61975a78330059a8d339de86841bedcfd67cd3 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Tue, 7 Aug 2018 10:10:27 +0800
Subject: [PATCH 23/31] Create train_val_split.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Converts the PASCAL VOC dataset into train.txt, val.txt and test.txt under the Main folder.
---
 notebooks/train_val_split.py | 39 ++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 notebooks/train_val_split.py

diff --git a/notebooks/train_val_split.py b/notebooks/train_val_split.py
new file mode 100644
index 00000000..1d7b1265
--- /dev/null
+++ b/notebooks/train_val_split.py
@@ -0,0 +1,39 @@
#coding=utf-8
import os
import random

trainval_percent = 0.9
train_percent = 0.8
# Change the paths to your own.
dir_pre="/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/"
xmlfilepath = '/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/Annotations/'
txtsavepath = '/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/Main/'
total_xml = os.listdir(xmlfilepath)

num = len(total_xml)
list = range(num)
tv = int(num * trainval_percent)
tr = int(tv * train_percent)
trainval = random.sample(list, tv)
train = random.sample(trainval, tr)

ftrainval = open(os.path.join(dir_pre,'ImageSets/Main/trainval.txt'), 'w')
ftest = open(os.path.join(dir_pre,'ImageSets/Main/test.txt'), 'w')
ftrain = open(os.path.join(dir_pre,'ImageSets/Main/train.txt'), 'w')
fval = open(os.path.join(dir_pre,'ImageSets/Main/val.txt'), 'w')

for i in list:
    name = total_xml[i][:-4] + '\n'
    if i in trainval:
        ftrainval.write(name)
        if i in train:
            ftrain.write(name)
        else:
            fval.write(name)
    else:
        ftest.write(name)

ftrainval.close()
ftrain.close()
fval.close()
ftest.close()
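The next patch implements VOC-style AP; as a sanity check, the 11-point method on a toy precision/recall curve (all values made up):

    import numpy as np

    rec  = np.array([0.2, 0.4, 0.6, 0.8])
    prec = np.array([1.0, 0.8, 0.5, 0.4])

    ap = 0.
    for t in np.arange(0., 1.1, 0.1):
        p = np.max(prec[rec >= t]) if np.sum(rec >= t) > 0 else 0.
        ap += p / 11.
    print(ap)   # (1+1+1+0.8+0.8+0.5+0.5+0.4+0.4+0+0)/11 ~= 0.5818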
From 53fa111eabcd6a2bade421a7ced5610b8ef6c283 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Sat, 11 Aug 2018 20:37:36 +0800
Subject: [PATCH 24/31] Create mAP_tutorial.py

---
 mAP_tutorial.py | 264 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 264 insertions(+)
 create mode 100644 mAP_tutorial.py

diff --git a/mAP_tutorial.py b/mAP_tutorial.py
new file mode 100644
index 00000000..51eee596
--- /dev/null
+++ b/mAP_tutorial.py
@@ -0,0 +1,264 @@
#coding=utf-8
"""
Computes mAP for our own data, following the standard voc_detection protocol.
mean Average Precision is the mean of the per-class APs:
sum the AP of every class and average, and you get the detection mAP.
"""
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
from pylab import mpl
import sys
reload(sys)
sys.setdefaultencoding('utf8')

def voc_ap(rec, prec, use_07_metric=False):
    if use_07_metric:
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            if np.sum(rec >= t) == 0:
                p = 0
            else:
                p = np.max(prec[rec >= t])
            ap = ap + p / 11.
    else:
        mrec = np.concatenate(([0.], rec, [1.]))
        mpre = np.concatenate(([0.], prec, [0.]))

        for i in range(mpre.size - 1, 0, -1):
            mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
        i = np.where(mrec[1:] != mrec[:-1])[0]

        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap

# For self_data labels the boxes must be converted: a center point plus width/height becomes top-left plus bottom-right.
def read_file(file,classname):
    with open(file,'r') as f:
        lines=f.readlines()
    if classname==-1:
        lines=[line.strip().split(" ") for line in lines]
        bboxes_to=[]
        for item in lines:
            bboxes_to.append(item[1:])
    else:
        lines_=[line.strip().split(" ") for line in lines]
        lines=[]
        bboxes_to=[]
        for i,line in enumerate(lines_):
            if int(line[0])==classname:
                lines.append(line)
                bboxes_to.append(line[1:])
                # print(bboxes_to_[i])
    # bboxes=[]
    # for bbox in bboxes_to:
    #     item=[float(bbox[0]),float(bbox[1]),float(bbox[2]),float(bbox[3])]
    #     bboxes.append(item)
    # return np.array(bboxes),lines
    bboxes = []
    for bbox in bboxes_to:
        item = [float(bbox[0])-float(bbox[2])/2.0, float(bbox[1])-float(bbox[3])/2.0, float(bbox[0])+float(bbox[2])/2.0, float(bbox[1])+float(bbox[3])/2.0]
        bboxes.append(item)
    return np.array(bboxes),lines

def convert(gt_dir,classname):
    class_recs={}
    npos=0
    for root,dirs,files in os.walk(gt_dir):
        for i,file in enumerate(files):
            cur_path=os.path.join(root,file)
            bbox,R=read_file(cur_path,classname)
            if classname!=-1:
                det=[False]*len(R)
                npos+=len(R)
                class_recs[file]={"bbox":bbox,'det':det}
            else:
                gt_cls_id=[]
                for item in R:
                    gt_cls_id.append(item[0])
                det = [False] * len(R)
                npos += len(R)
                class_recs[file] = {"bbox": bbox, 'det': det, "gt_cls_id":gt_cls_id}
            print("Converting... files left: "+str(len(files)-i))
    return class_recs,npos
# More detailed material: https://github.com/Tangzixia/Object-Detection-Metrics#average-precision
# Compute the AP of one class; classname=-1 means evaluating all classes together (mAP).
def gen_ap(gt_dir,pred_res,classname,iou=0.5):
    class_recs,npos=convert(gt_dir,classname)
    with open(pred_res,'r') as f:
        lines=f.readlines()
    # Each line holds img_id, class flag, confidence (the score), and the box BB.
    splitlines=[item.strip().split(" ") for item in lines]
    img_ids=[x[0] for x in splitlines]
    cls_flgs=np.array([x[1] for x in splitlines])
    confidence=np.array([float(x[2]) for x in splitlines])
    BB=np.array([[float(z) for z in x[3:]] for x in splitlines])

    # Find the predicted candidate boxes of each class.
    # If classname == -1 we compute the mAP over all classes, so all class labels are needed.
    if classname!=-1:
        inds=np.zeros(len(splitlines))
        for i,item in enumerate(splitlines):
            if int(item[1])==classname:
                inds[i]=1
        img_ids_=[]
        confidence_=[]
        BB_=[]
        for i,item in enumerate(splitlines):
            if inds[i]==1:
                img_ids_.append(img_ids[i])
                confidence_.append(confidence[i])
                BB_.append(BB[i])
        img_ids=img_ids_
        confidence=np.array(confidence_)
        BB=np.array(BB_)
        # img_ids=list(np.array(img_ids[np.array(inds)]))
        # confidence=list(np.array(confidence[np.array(inds)]))
        # BB=list(np.array(BB[np.array(inds)]))

        # Sort by confidence, descending.
        sorted_ind=np.argsort(-confidence)
        # np.argsort(-confidence<=-.3)
        sorted_ind1 = np.where(confidence[sorted_ind] >= .0)[0]
        sorted_ind = sorted_ind[sorted_ind1]
        print(len(sorted_ind))
        BB=BB[sorted_ind,:]
        img_ids=[img_ids[x] for x in sorted_ind]

        # sorted_ind = np.argsort(-confidence)
        # print(len(sorted_ind))
        # BB = BB[sorted_ind, :]
        # img_ids = [img_ids[x] for x in sorted_ind]

        nd=len(img_ids)
        print(nd)
        tp=np.zeros(nd)
        fp=np.zeros(nd)

        for d in range(nd):
            R=class_recs[img_ids[d]]
            bb=BB[d,:].astype(float)
            ovmax = -np.inf
            BBGT=R['bbox'].astype(float)

            if BBGT.size>0:
                ixmin = np.maximum(BBGT[:, 0], bb[0])
                iymin = np.maximum(BBGT[:, 1], bb[1])
                ixmax = np.minimum(BBGT[:, 2], bb[2])
                iymax = np.minimum(BBGT[:, 3], bb[3])
                iw = np.maximum(ixmax - ixmin + 1., 0.)
                ih = np.maximum(iymax - iymin + 1., 0.)
                inters = iw * ih

                uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
                       (BBGT[:, 2] - BBGT[:, 0] + 1.) *
                       (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
                overlaps=inters/uni
                ovmax=np.max(overlaps)
                # print(ovmax)
                jmax=np.argmax(overlaps)
            if ovmax>iou:
                if not R['det'][jmax]:
                    tp[d]=1
                    R['det'][jmax]=1
                else:
                    fp[d]=1
            else:
                fp[d]=1
    else:
        # Sort by confidence, descending.
        sorted_ind = np.argsort(-confidence)
        # np.argsort(-confidence<=-.3)
        sorted_ind1 = np.where(confidence[sorted_ind] >= .3)[0]
        sorted_ind = sorted_ind[sorted_ind1]
        BB = BB[sorted_ind, :]
        img_ids = [img_ids[x] for x in sorted_ind]
        cls_flgs=cls_flgs[sorted_ind]

        # sorted_ind = np.argsort(-confidence)
        # print(len(sorted_ind))
        # BB = BB[sorted_ind, :]
        # img_ids = [img_ids[x] for x in sorted_ind]

        nd = len(img_ids)
        print(nd)
        tp = np.zeros(nd)
        fp = np.zeros(nd)

        for d in range(nd):
            R = class_recs[img_ids[d]]
            bb = BB[d, :].astype(float)
            ovmax = -np.inf
            BBGT = R['bbox'].astype(float)
            if BBGT.size > 0:
                ixmin = np.maximum(BBGT[:, 0], bb[0])
                iymin = np.maximum(BBGT[:, 1], bb[1])
                ixmax = np.minimum(BBGT[:, 2], bb[2])
                iymax = np.minimum(BBGT[:, 3], bb[3])
                iw = np.maximum(ixmax - ixmin + 1., 0.)
                ih = np.maximum(iymax - iymin + 1., 0.)
                inters = iw * ih

                uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
                       (BBGT[:, 2] - BBGT[:, 0] + 1.) *
                       (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
                overlaps = inters / uni
                ovmax = np.max(overlaps)
                # print(ovmax)
                jmax = np.argmax(overlaps)
            if ovmax > iou and R['gt_cls_id'][jmax]==cls_flgs[d]:
                if not R['det'][jmax]:
                    tp[d] = 1
                    R['det'][jmax] = 1
                else:
                    fp[d] = 1
            else:
                fp[d] = 1

    fp=np.cumsum(fp)
    tp=np.cumsum(tp)
    rec=tp/float(npos)
    prec=tp/np.maximum(tp+fp,np.finfo(np.float64).eps)

    ap = voc_ap(rec, prec)
    return rec,prec,ap
def draw_plot(rec,prec,ap,name,path):
    if os.path.exists(path)==False:
        os.mkdir(path)
    myfont = matplotlib.font_manager.FontProperties(fname="/usr/share/fonts/opentype/noto/NotoSansCJK.ttc")
    mpl.rcParams['axes.unicode_minus'] = False
    tick=np.arange(0,1.1,0.1)
    plt.figure()
    plt.title(name+":"+str(ap),fontproperties=myfont)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.axis([0,1,0,1.05])
    plt.xticks(tick)
    plt.yticks(tick)
    plt.plot(rec,prec)
    # plt.show()

    plt.savefig(os.path.join(path,name+".png"))
if __name__=="__main__":
    gt_dir = "/home/hp/Data/house_data/train/Data_valid/labels_initial/"
    pred_res = "../res_self_data_0.0.txt"
    mAP_file="/home/hp/Desktop/yolov3-res/map.txt"

    # Class display names (in Chinese): camouflage/ordinary building, camouflage/ordinary oil tank, camouflage/ordinary radar.
    dict_ = {"0": u"迷彩建筑", "1": u"一般建筑", "2": u"迷彩油罐", "3": u"一般油罐", "4": u"迷彩雷达", "5": u"一般雷达"}
    ap_list=[]
    for i in range(6):
        classname = i
        rec, prec, ap = gen_ap(gt_dir, pred_res, classname)
        draw_plot(rec,prec,ap,dict_[str(classname)],path="/home/hp/Desktop/yolov3-res/")
        ap_list.append(ap)
        print(rec, prec, ap)
    with open(mAP_file,'w') as f:
        for i,ap in enumerate(ap_list):
            f.write(str(dict_[str(i)].decode('utf8'))+":"+str(ap)+"\n")
        f.write("mAP:"+str(round(np.array(ap_list).mean(),4)))
    print("mAP50:", round(np.array(ap_list).mean(),4))

From 7285a91abfafb5417e73bcf91c3e3ae8b3e5d570 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Mon, 27 Aug 2018 11:30:10 +0800
Subject: [PATCH 25/31] Create mAP_tutorial_std.py
--- mAP_tutorial_std.py | 221 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) create mode 100644 mAP_tutorial_std.py diff --git a/mAP_tutorial_std.py b/mAP_tutorial_std.py new file mode 100644 index 00000000..01c9e54c --- /dev/null +++ b/mAP_tutorial_std.py @@ -0,0 +1,221 @@ +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Fast/er R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Bharath Hariharan +# -------------------------------------------------------- + +"""Python implementation of the PASCAL VOC devkit's AP evaluation code.""" + +import cPickle +import logging +import numpy as np +import os +import xml.etree.ElementTree as ET + +logger = logging.getLogger(__name__) + + +def parse_rec(filename): + """Parse a PASCAL VOC xml file.""" + tree = ET.parse(filename) + objects = [] + for obj in tree.findall('object'): + obj_struct = {} + obj_struct['name'] = obj.find('name').text + obj_struct['pose'] = obj.find('pose').text + obj_struct['truncated'] = int(obj.find('truncated').text) + obj_struct['difficult'] = int(obj.find('difficult').text) + bbox = obj.find('bndbox') + obj_struct['bbox'] = [int(bbox.find('xmin').text), + int(bbox.find('ymin').text), + int(bbox.find('xmax').text), + int(bbox.find('ymax').text)] + objects.append(obj_struct) + + return objects + + +def voc_ap(rec, prec, use_07_metric=False): + """Compute VOC AP given precision and recall. If use_07_metric is true, uses + the VOC 07 11-point method (default:False). + """ + if use_07_metric: + # 11 point metric + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. + else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def voc_eval(detpath, + annopath, + imagesetfile, + classname, + cachedir, + ovthresh=0.5, + use_07_metric=False): + """rec, prec, ap = voc_eval(detpath, + annopath, + imagesetfile, + classname, + [ovthresh], + [use_07_metric]) + Top level function that does the PASCAL VOC evaluation. + detpath: Path to detections + detpath.format(classname) should produce the detection results file. + annopath: Path to annotations + annopath.format(imagename) should be the xml annotations file. + imagesetfile: Text file containing the list of images, one image per line. 
+    classname: Category name
+    cachedir: Directory for caching the annotations
+    [ovthresh]: Overlap threshold (default = 0.5)
+    [use_07_metric]: Whether to use VOC07's 11 point AP computation
+        (default False)
+    """
+    # assumes detections are in detpath.format(classname)
+    # assumes annotations are in annopath.format(imagename)
+    # assumes imagesetfile is a text file with each line an image name
+    # cachedir caches the annotations in a pickle file
+
+    # first load gt
+    if not os.path.isdir(cachedir):
+        os.mkdir(cachedir)
+    imageset = os.path.splitext(os.path.basename(imagesetfile))[0]
+    cachefile = os.path.join(cachedir, imageset + '_annots.pkl')
+    # read list of images
+    with open(imagesetfile, 'r') as f:
+        lines = f.readlines()
+    imagenames = [x.strip() for x in lines]
+
+    if not os.path.isfile(cachefile):
+        # load annots
+        recs = {}
+        for i, imagename in enumerate(imagenames):
+            recs[imagename] = parse_rec(annopath.format(imagename))
+            if i % 100 == 0:
+                logger.info(
+                    'Reading annotation for {:d}/{:d}'.format(
+                        i + 1, len(imagenames)))
+        # save
+        logger.info('Saving cached annotations to {:s}'.format(cachefile))
+        with open(cachefile, 'w') as f:
+            cPickle.dump(recs, f)
+    else:
+        # load
+        with open(cachefile, 'r') as f:
+            recs = cPickle.load(f)
+
+    # extract gt objects for this class
+    class_recs = {}
+    npos = 0
+    for imagename in imagenames:
+        R = [obj for obj in recs[imagename] if obj['name'] == classname]
+        bbox = np.array([x['bbox'] for x in R])
+        difficult = np.array([x['difficult'] for x in R]).astype(np.bool)
+        det = [False] * len(R)
+        npos = npos + sum(~difficult)
+        class_recs[imagename] = {'bbox': bbox,
+                                 'difficult': difficult,
+                                 'det': det}
+
+    # read dets
+    detfile = detpath.format(classname)
+    with open(detfile, 'r') as f:
+        lines = f.readlines()
+
+    splitlines = [x.strip().split(' ') for x in lines]
+    image_ids = [x[0] for x in splitlines]
+    confidence = np.array([float(x[1]) for x in splitlines])
+    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])
+
+    # sort by confidence
+    sorted_ind = np.argsort(-confidence)
+    BB = BB[sorted_ind, :]
+    image_ids = [image_ids[x] for x in sorted_ind]
+
+    # go down dets and mark TPs and FPs
+    nd = len(image_ids)
+    tp = np.zeros(nd)
+    fp = np.zeros(nd)
+    for d in range(nd):
+        R = class_recs[image_ids[d]]
+        bb = BB[d, :].astype(float)
+        ovmax = -np.inf
+        BBGT = R['bbox'].astype(float)
+
+        if BBGT.size > 0:
+            # compute overlaps
+            # intersection
+            ixmin = np.maximum(BBGT[:, 0], bb[0])
+            iymin = np.maximum(BBGT[:, 1], bb[1])
+            ixmax = np.minimum(BBGT[:, 2], bb[2])
+            iymax = np.minimum(BBGT[:, 3], bb[3])
+            iw = np.maximum(ixmax - ixmin + 1., 0.)
+            ih = np.maximum(iymax - iymin + 1., 0.)
+            inters = iw * ih
+
+            # union
+            uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
+                   (BBGT[:, 2] - BBGT[:, 0] + 1.) *
+                   (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
+
+            overlaps = inters / uni
+            ovmax = np.max(overlaps)
+            jmax = np.argmax(overlaps)
+
+        if ovmax > ovthresh:
+            if not R['difficult'][jmax]:
+                if not R['det'][jmax]:
+                    tp[d] = 1.
+                    R['det'][jmax] = 1
+                else:
+                    fp[d] = 1.
+        else:
+            fp[d] = 1.
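+        # Note: matching is greedy in descending confidence order; once a
+        # ground-truth box is claimed (R['det'][jmax] set), any further
+        # detection of the same object counts as a false positive, and
+        # matches to 'difficult' ground truth are ignored (neither TP nor FP).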
+
+    # compute precision recall
+    fp = np.cumsum(fp)
+    tp = np.cumsum(tp)
+    rec = tp / float(npos)
+    # avoid divide by zero in case the first detection matches a difficult
+    # ground truth
+    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+    ap = voc_ap(rec, prec, use_07_metric)
+
+    return rec, prec, ap
From 5ad9a1ce8cec098ac28478990de971d8be366d4a Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Mon, 27 Aug 2018 11:32:56 +0800
Subject: [PATCH 26/31] Update mAP_tutorial_std.py

---
 mAP_tutorial_std.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/mAP_tutorial_std.py b/mAP_tutorial_std.py
index 01c9e54c..63d3e62d 100644
--- a/mAP_tutorial_std.py
+++ b/mAP_tutorial_std.py
@@ -20,6 +20,17 @@
 # Written by Bharath Hariharan
 # --------------------------------------------------------
 
+
+'''
+Each class's detection results are stored one detection per line as
+"image_id confidence xmin ymin xmax ymax", e.g. comp3_det_test_car.txt:
+    ...
+    2009_000026 0.949297 172.000000 233.000000 191.000000 248.000000
+    2009_000032 0.013737 1.000000 147.000000 114.000000 242.000000
+    2009_000032 0.013737 1.000000 134.000000 94.000000 168.000000
+    2009_000035 0.063948 455.000000 229.000000 491.000000 243.000000
+'''
+
 """Python implementation of the PASCAL VOC devkit's AP evaluation code."""
 
 import cPickle
From a82c7c08887a8aedf7677c3487571c5a91ca02eb Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Mon, 27 Aug 2018 11:34:33 +0800
Subject: [PATCH 27/31] Create criteria.txt

---
 criteria.txt | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 criteria.txt

diff --git a/criteria.txt b/criteria.txt
new file mode 100644
index 00000000..e2b0826f
--- /dev/null
+++ b/criteria.txt
@@ -0,0 +1,2 @@
+The standard evaluation protocol for object detection is documented here:
+http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/devkit_doc.html#SECTION00044000000000000000
From 5968a92936e14aee6e5587bcad289533ecd1a485 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Tue, 30 Oct 2018 10:32:05 +0800
Subject: [PATCH 28/31] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 4137fb9f..f00a2705 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
+# Reference: https://zhuanlan.zhihu.com/p/33544892 for a walkthrough
 # SSD: Single Shot MultiBox Detector in TensorFlow
 
 SSD is a unified framework for object detection with a single network. It has been originally introduced in this research [article](http://arxiv.org/abs/1512.02325).
From 6dc0e8c4d700c96d0abf33c3fe5752f9db709d0a Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Tue, 30 Oct 2018 11:41:07 +0800
Subject: [PATCH 29/31] Update README.md

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index f00a2705..ca4c7977 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # Reference: https://zhuanlan.zhihu.com/p/33544892 for a walkthrough
 # SSD: Single Shot MultiBox Detector in TensorFlow
+
 SSD is a unified framework for object detection with a single network. It has been originally introduced in this research [article](http://arxiv.org/abs/1512.02325).
 
 This repository contains a TensorFlow re-implementation of the original [Caffe code](https://github.com/weiliu89/caffe/tree/ssd). At present, it only implements VGG-based SSD networks (with 300 and 512 inputs), but the architecture of the project is modular, and should make it easy to implement and train other SSD variants (ResNet or Inception based for instance). Present TF checkpoints have been directly converted from SSD Caffe models.
@@ -26,7 +27,13 @@ and then start a jupyter notebook with
 ```bash
 jupyter notebook notebooks/ssd_notebook.ipynb
 ```
+## Prediction pipeline
+1. Determine each box's class from its confidences (the class with the highest confidence wins); filter out boxes assigned to the background class and boxes whose confidence is below the threshold;
+2. Recover the prediction boxes from the prior boxes and the predicted offsets, and clip them so they do not extend beyond the image;
+3. Sort the prediction boxes by confidence in descending order and keep the top-k boxes;
+4. Run the NMS algorithm with a chosen NMS threshold to filter out heavily overlapping boxes, yielding the final prediction boxes;
+### Note that NMS can be interpreted in two ways: running it per class, or running it once over all classes.
 
 ## Datasets
From 68c8aeb13a73dc40f9a9b116d4c29bb7bdce5b99 Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Tue, 30 Oct 2018 11:43:45 +0800
Subject: [PATCH 30/31] Update README.md

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index ca4c7977..712afea1 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Reference: https://zhuanlan.zhihu.com/p/33544892 for a walkthrough
+# Reference: https://zhuanlan.zhihu.com/p/33544892 for a walkthrough. Note that SSD belongs to the same family as Faster R-CNN: both set up prior (default) boxes, and the network only predicts offsets relative to them.
 # SSD: Single Shot MultiBox Detector in TensorFlow
 
 
@@ -29,10 +29,10 @@ jupyter notebook notebooks/ssd_notebook.ipynb
 ```
 ## Prediction pipeline
-1. Determine each box's class from its confidences (the class with the highest confidence wins); filter out boxes assigned to the background class and boxes whose confidence is below the threshold;
-2. Recover the prediction boxes from the prior boxes and the predicted offsets, and clip them so they do not extend beyond the image;
-3. Sort the prediction boxes by confidence in descending order and keep the top-k boxes;
-4. Run the NMS algorithm with a chosen NMS threshold to filter out heavily overlapping boxes, yielding the final prediction boxes;
+1. Determine each box's class from its confidences (the class with the highest confidence wins); filter out boxes assigned to the background class and boxes whose confidence is below the threshold.
+2. Recover the prediction boxes from the prior boxes and the predicted offsets, and clip them so they do not extend beyond the image.
+3. Sort the prediction boxes by confidence in descending order and keep the top-k boxes.
+4. Run the NMS algorithm with a chosen NMS threshold to filter out heavily overlapping boxes, yielding the final prediction boxes.
 ### Note that NMS can be interpreted in two ways: running it per class, or running it once over all classes.
 
 ## Datasets
From 336d3ead42d5e2ccb1bdf16d4a451a51f355fa1c Mon Sep 17 00:00:00 2001
From: Tangzixia
Date: Tue, 30 Oct 2018 22:01:12 +0800
Subject: [PATCH 31/31] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 712afea1..f36ba547 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,6 @@
 # Reference: https://zhuanlan.zhihu.com/p/33544892 for a walkthrough. Note that SSD belongs to the same family as Faster R-CNN: both set up prior (default) boxes, and the network only predicts offsets relative to them.
 # SSD: Single Shot MultiBox Detector in TensorFlow
+## Following the py-faster-rcnn settings, detections with a confidence score below 0.05 are filtered out and boxes above an NMS threshold of 0.3 are suppressed before the detection files are generated.
 
 SSD is a unified framework for object detection with a single network. It has been originally introduced in this research [article](http://arxiv.org/abs/1512.02325).
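
To make the prediction pipeline described in the README patches above concrete, here is a minimal NumPy sketch of the per-class post-processing variant. It assumes the box offsets have already been decoded against the priors and clipped (step 2); the function names and the defaults (0.05 confidence, top-k of 200, 0.3 NMS) are illustrative stand-ins, not this repository's actual API.

```python
import numpy as np

def nms(boxes, scores, thresh=0.3):
    """Greedy NMS: repeatedly keep the highest-scoring box and drop the
    remaining boxes whose IoU with it exceeds `thresh`."""
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]              # indices by descending score
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # IoU of the best remaining box against all boxes still in play
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(xx2 - xx1, 0.) * np.maximum(yy2 - yy1, 0.)
        iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-12)
        order = order[1:][iou <= thresh]        # suppress heavy overlaps
    return keep

def postprocess(scores, boxes, conf_thresh=0.05, top_k=200, nms_thresh=0.3):
    """scores: (N, num_classes) class confidences with class 0 = background;
    boxes: (N, 4) corner boxes already decoded from the priors and clipped."""
    detections = {}
    for c in range(1, scores.shape[1]):             # skip the background class
        mask = scores[:, c] >= conf_thresh          # step 1: confidence filter
        cls_scores, cls_boxes = scores[mask, c], boxes[mask]
        order = cls_scores.argsort()[::-1][:top_k]  # step 3: keep top-k
        keep = nms(cls_boxes[order], cls_scores[order], nms_thresh)  # step 4
        detections[c] = (cls_boxes[order][keep], cls_scores[order][keep])
    return detections
```

The other interpretation mentioned in the README (NMS over all classes at once) would instead pool every class's surviving boxes and call `nms` once on the union.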