diff --git a/README.md b/README.md index 4137fb9fb..f36ba5472 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ +# 参考:https://zhuanlan.zhihu.com/p/33544892 讲解,注意SSD是属于Faster RCNN那一挂的,都有设置先验框,然后预测的仅仅只是偏移而已~ # SSD: Single Shot MultiBox Detector in TensorFlow +## 参考py-faster-rcnn的设置,可以发现其滤除的是confidence score小于0.05和nms thresh>0.3的值,然后生成检测文件~ + SSD is an unified framework for object detection with a single network. It has been originally introduced in this research [article](http://arxiv.org/abs/1512.02325). @@ -25,7 +28,13 @@ and then start a jupyter notebook with ```bash jupyter notebook notebooks/ssd_notebook.ipynb ``` +## 预测流程 +1、根据置信度确定其类别(置信度最大的为对应的类别),滤除背景类别及其置信度小于阈值的预测框. +2、根据先验框和预测的偏移还原回预测框,同时clip操作,防止预测框超出图片. +3、根据置信度对预测框由大到小进行排序,选择top-k个预测框. +4、运行NMS算法,设置NMS阈值,滤除掉那些重叠较大的框,得到最后的预测框. +### 注意NMS算法有两种理解,针对每一类进行NMS和针对所有类别进行NMS操作~ ## Datasets diff --git a/criteria.txt b/criteria.txt new file mode 100644 index 000000000..e2b0826ff --- /dev/null +++ b/criteria.txt @@ -0,0 +1,2 @@ +关于目标检测相关的东西,标准的检测方案如下: +http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/devkit_doc.html#SECTION00044000000000000000 diff --git a/mAP_tutorial.py b/mAP_tutorial.py new file mode 100644 index 000000000..51eee5967 --- /dev/null +++ b/mAP_tutorial.py @@ -0,0 +1,264 @@ +#coding=utf-8 +""" +参照voc_detection的标准的mAP计算方法 +用来计算我们自己的数据的值, +mean Average Precision指的是多个类的AP的平均, +怎么平均的mAP, +怎么弄呢,其实就是将多个类的AP求和,然后进行平均,即可得到检测的mAP值。 +""" +import numpy as np +import os +import matplotlib +import matplotlib.pyplot as plt +from pylab import mpl +import sys +reload(sys) +sys.setdefaultencoding('utf8') + +def voc_ap(rec, prec, use_07_metric=False): + if use_07_metric: + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. 
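# The branch above is the VOC07 11-point metric: take the best precision at each
# recall threshold 0.0, 0.1, ..., 1.0 and average the 11 values. The branch below
# instead integrates the exact area under the precision envelope, and it is the one
# actually used here, since gen_ap() calls voc_ap() with the default use_07_metric=False.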
+ else: + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + i = np.where(mrec[1:] != mrec[:-1])[0] + + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + +#如果是self_data数据,则需要对label进行转化,讲中心点加宽高转化为左上加右下 +def read_file(file,classname): + with open(file,'r') as f: + lines=f.readlines() + if classname==-1: + lines=[line.strip().split(" ") for line in lines] + bboxes_to=[] + for item in lines: + bboxes_to.append(item[1:]) + else: + lines_=[line.strip().split(" ") for line in lines] + lines=[] + bboxes_to=[] + for i,line in enumerate(lines_): + if int(line[0])==classname: + lines.append(line) + bboxes_to.append(line[1:]) + # print(bboxes_to_[i]) + # bboxes=[] + # for bbox in bboxes_to: + # item=[float(bbox[0]),float(bbox[1]),float(bbox[2]),float(bbox[3])] + # bboxes.append(item) + # return np.array(bboxes),lines + bboxes = [] + for bbox in bboxes_to: + item = [float(bbox[0])-float(bbox[2])/2.0, float(bbox[1])-float(bbox[3])/2.0, float(bbox[0])+float(bbox[2])/2.0, float(bbox[1])+float(bbox[3])/2.0] + bboxes.append(item) + return np.array(bboxes),lines + +def convert(gt_dir,classname): + class_recs={} + npos=0 + for root,dirs,files in os.walk(gt_dir): + for i,file in enumerate(files): + cur_path=os.path.join(root,file) + bbox,R=read_file(cur_path,classname) + if classname!=-1: + det=[False]*len(R) + npos+=len(R) + class_recs[file]={"bbox":bbox,'det':det} + else: + gt_cls_id=[] + for item in R: + gt_cls_id.append(item[0]) + det = [False] * len(R) + npos += len(R) + class_recs[file] = {"bbox": bbox, 'det': det, "gt_cls_id":gt_cls_id} + print("正在转化中。。。"+str(len(files)-i)) + return class_recs,npos +#更加详细的资料可以查看https://github.com/Tangzixia/Object-Detection-Metrics#average-precision +#计算某个类的AP,class=-1时候代表计算所有类的mAP +def gen_ap(gt_dir,pred_res,classname,iou=0.5): + class_recs,npos=convert(gt_dir,classname) + with open(pred_res,'r') as f: + lines=f.readlines() + #img_id,confidence,BB, + #得分 + splitlines=[item.strip().split(" ") for item in lines] + img_ids=[x[0] for x in splitlines] + cls_flgs=np.array([x[1] for x in splitlines]) + confidence=np.array([float(x[2]) for x in splitlines]) + BB=np.array([[float(z) for z in x[3:]] for x in splitlines]) + + # 找出每一类对应的预测候选框 + # 如果是classname==-1,则说明需要计算所有类的mAP值,这时候需要得到所有的类别标签 + if classname!=-1: + inds=np.zeros(len(splitlines)) + for i,item in enumerate(splitlines): + if int(item[1])==classname: + inds[i]=1 + img_ids_=[] + confidence_=[] + BB_=[] + for i,item in enumerate(splitlines): + if inds[i]==1: + img_ids_.append(img_ids[i]) + confidence_.append(confidence[i]) + BB_.append(BB[i]) + img_ids=img_ids_ + confidence=np.array(confidence_) + BB=np.array(BB_) + # img_ids=list(np.array(img_ids[np.array(inds)])) + # confidence=list(np.array(confidence[np.array(inds)])) + # BB=list(np.array(BB[np.array(inds)])) + + + #confidence由大到小排序 + sorted_ind=np.argsort(-confidence) + # np.argsort(-confidence<=-.3) + sorted_ind1 = np.where(confidence[sorted_ind] >= .0)[0] + sorted_ind = sorted_ind[sorted_ind1] + print(len(sorted_ind)) + BB=BB[sorted_ind,:] + img_ids=[img_ids[x] for x in sorted_ind] + + # sorted_ind = np.argsort(-confidence) + # print(len(sorted_ind)) + # BB = BB[sorted_ind, :] + # img_ids = [img_ids[x] for x in sorted_ind] + + nd=len(img_ids) + print(nd) + tp=np.zeros(nd) + fp=np.zeros(nd) + + for d in range(nd): + R=class_recs[img_ids[d]] + bb=BB[d,:].astype(float) + ovmax = -np.inf + BBGT=R['bbox'].astype(float) + + if 
BBGT.size>0: + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1., 0.) + ih = np.maximum(iymax - iymin + 1., 0.) + inters = iw * ih + + uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + + (BBGT[:, 2] - BBGT[:, 0] + 1.) * + (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) + overlaps=inters/uni + ovmax=np.max(overlaps) + # print(ovmax) + jmax=np.argmax(overlaps) + if ovmax>iou: + if not R['det'][jmax]: + tp[d]=1 + R['det'][jmax]=1 + else: + fp[d]=1 + else: + fp[d]=1 + else: + # confidence由大到小排序 + sorted_ind = np.argsort(-confidence) + # np.argsort(-confidence<=-.3) + sorted_ind1 = np.where(confidence[sorted_ind] >= .3)[0] + sorted_ind = sorted_ind[sorted_ind1] + BB = BB[sorted_ind, :] + img_ids = [img_ids[x] for x in sorted_ind] + cls_flgs=cls_flgs[sorted_ind] + + # sorted_ind = np.argsort(-confidence) + # print(len(sorted_ind)) + # BB = BB[sorted_ind, :] + # img_ids = [img_ids[x] for x in sorted_ind] + + nd = len(img_ids) + print(nd) + tp = np.zeros(nd) + fp = np.zeros(nd) + + for d in range(nd): + R = class_recs[img_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R['bbox'].astype(float) + if BBGT.size > 0: + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1., 0.) + ih = np.maximum(iymax - iymin + 1., 0.) + inters = iw * ih + + uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + + (BBGT[:, 2] - BBGT[:, 0] + 1.) * + (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) + overlaps = inters / uni + ovmax = np.max(overlaps) + # print(ovmax) + jmax = np.argmax(overlaps) + if ovmax > iou and R['gt_cls_id'][jmax]==cls_flgs[d]: + if not R['det'][jmax]: + tp[d] = 1 + R['det'][jmax] = 1 + else: + fp[d] = 1 + else: + fp[d] = 1 + + fp=np.cumsum(fp) + tp=np.cumsum(tp) + rec=tp/float(npos) + prec=tp/np.maximum(tp+fp,np.finfo(np.float64).eps) + + ap = voc_ap(rec, prec) + return rec,prec,ap +def draw_plot(rec,prec,ap,name,path): + if os.path.exists(path)==False: + os.mkdir(path) + myfont = matplotlib.font_manager.FontProperties(fname="/usr/share/fonts/opentype/noto/NotoSansCJK.ttc") + mpl.rcParams['axes.unicode_minus'] = False + tick=np.arange(0,1.1,0.1) + plt.figure() + plt.title(name+":"+str(ap),fontproperties=myfont) + plt.xlabel("Recall") + plt.ylabel("Precision") + plt.axis([0,1,0,1.05]) + plt.xticks(tick) + plt.yticks(tick) + plt.plot(rec,prec) + # plt.show() + + plt.savefig(os.path.join(path,name+".png")) +if __name__=="__main__": + gt_dir = "/home/hp/Data/house_data/train/Data_valid/labels_initial/" + pred_res = "../res_self_data_0.0.txt" + mAP_file="/home/hp/Desktop/yolov3-res/map.txt" + + dict_ = {"0": u"迷彩建筑", "1": u"一般建筑", "2": u"迷彩油罐", "3": u"一般油罐", "4": u"迷彩雷达", "5": u"一般雷达"} + ap_list=[] + for i in range(6): + classname =i + rec, prec, ap = gen_ap(gt_dir, pred_res, classname) + draw_plot(rec,prec,ap,dict_[str(classname)],path="/home/hp/Desktop/yolov3-res/") + ap_list.append(ap) + print(rec, prec, ap) + with open(mAP_file,'w') as f: + for i,ap in enumerate(ap_list): + f.write(str(dict_[str(i)].decode('utf8'))+":"+str(ap)+"\n") + f.write("mAP:"+str(round(np.array(ap_list).mean(),4))) + print("mAP50的值为:",round(np.array(ap_list).mean(),4)) diff --git a/mAP_tutorial_std.py b/mAP_tutorial_std.py new file mode 100644 index 000000000..63d3e62d8 --- /dev/null +++ b/mAP_tutorial_std.py @@ -0,0 +1,232 @@ +# Copyright 
(c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Fast/er R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Bharath Hariharan +# -------------------------------------------------------- + + +''' +每个类别的检测结果如下, +comp3_det_test_car.txt: + ... + 2009_000026 0.949297 172.000000 233.000000 191.000000 248.000000 + 2009_000032 0.013737 1.000000 147.000000 114.000000 242.000000 + 2009_000032 0.013737 1.000000 134.000000 94.000000 168.000000 + 2009_000035 0.063948 455.000000 229.000000 491.000000 243.000000 +''' + +"""Python implementation of the PASCAL VOC devkit's AP evaluation code.""" + +import cPickle +import logging +import numpy as np +import os +import xml.etree.ElementTree as ET + +logger = logging.getLogger(__name__) + + +def parse_rec(filename): + """Parse a PASCAL VOC xml file.""" + tree = ET.parse(filename) + objects = [] + for obj in tree.findall('object'): + obj_struct = {} + obj_struct['name'] = obj.find('name').text + obj_struct['pose'] = obj.find('pose').text + obj_struct['truncated'] = int(obj.find('truncated').text) + obj_struct['difficult'] = int(obj.find('difficult').text) + bbox = obj.find('bndbox') + obj_struct['bbox'] = [int(bbox.find('xmin').text), + int(bbox.find('ymin').text), + int(bbox.find('xmax').text), + int(bbox.find('ymax').text)] + objects.append(obj_struct) + + return objects + + +def voc_ap(rec, prec, use_07_metric=False): + """Compute VOC AP given precision and recall. If use_07_metric is true, uses + the VOC 07 11-point method (default:False). + """ + if use_07_metric: + # 11 point metric + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. + else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def voc_eval(detpath, + annopath, + imagesetfile, + classname, + cachedir, + ovthresh=0.5, + use_07_metric=False): + """rec, prec, ap = voc_eval(detpath, + annopath, + imagesetfile, + classname, + [ovthresh], + [use_07_metric]) + Top level function that does the PASCAL VOC evaluation. + detpath: Path to detections + detpath.format(classname) should produce the detection results file. + annopath: Path to annotations + annopath.format(imagename) should be the xml annotations file. + imagesetfile: Text file containing the list of images, one image per line. 
+ classname: Category name (duh) + cachedir: Directory for caching the annotations + [ovthresh]: Overlap threshold (default = 0.5) + [use_07_metric]: Whether to use VOC07's 11 point AP computation + (default False) + """ + # assumes detections are in detpath.format(classname) + # assumes annotations are in annopath.format(imagename) + # assumes imagesetfile is a text file with each line an image name + # cachedir caches the annotations in a pickle file + + # first load gt + if not os.path.isdir(cachedir): + os.mkdir(cachedir) + imageset = os.path.splitext(os.path.basename(imagesetfile))[0] + cachefile = os.path.join(cachedir, imageset + '_annots.pkl') + # read list of images + with open(imagesetfile, 'r') as f: + lines = f.readlines() + imagenames = [x.strip() for x in lines] + + if not os.path.isfile(cachefile): + # load annots + recs = {} + for i, imagename in enumerate(imagenames): + recs[imagename] = parse_rec(annopath.format(imagename)) + if i % 100 == 0: + logger.info( + 'Reading annotation for {:d}/{:d}'.format( + i + 1, len(imagenames))) + # save + logger.info('Saving cached annotations to {:s}'.format(cachefile)) + with open(cachefile, 'w') as f: + cPickle.dump(recs, f) + else: + # load + with open(cachefile, 'r') as f: + recs = cPickle.load(f) + + # extract gt objects for this class + class_recs = {} + npos = 0 + for imagename in imagenames: + R = [obj for obj in recs[imagename] if obj['name'] == classname] + bbox = np.array([x['bbox'] for x in R]) + difficult = np.array([x['difficult'] for x in R]).astype(np.bool) + det = [False] * len(R) + npos = npos + sum(~difficult) + class_recs[imagename] = {'bbox': bbox, + 'difficult': difficult, + 'det': det} + + # read dets + detfile = detpath.format(classname) + with open(detfile, 'r') as f: + lines = f.readlines() + + splitlines = [x.strip().split(' ') for x in lines] + image_ids = [x[0] for x in splitlines] + confidence = np.array([float(x[1]) for x in splitlines]) + BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + R = class_recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R['bbox'].astype(float) + + if BBGT.size > 0: + # compute overlaps + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1., 0.) + ih = np.maximum(iymax - iymin + 1., 0.) + inters = iw * ih + + # union + uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + + (BBGT[:, 2] - BBGT[:, 0] + 1.) * + (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) + + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > ovthresh: + if not R['difficult'][jmax]: + if not R['det'][jmax]: + tp[d] = 1. + R['det'][jmax] = 1 + else: + fp[d] = 1. + else: + fp[d] = 1. 
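    # Summary of the loop above: a detection is a true positive only when its best
    # IoU with a ground-truth box exceeds ovthresh AND that box has not already been
    # claimed by a higher-scoring detection; a duplicate match of an already-claimed
    # box counts as a false positive, while matches to boxes flagged 'difficult' are
    # ignored entirely (neither TP nor FP).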
+ + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + + return rec, prec, ap diff --git a/nets/ssd_common.py b/nets/ssd_common.py index 7de1a7e7e..9999bbf91 100644 --- a/nets/ssd_common.py +++ b/nets/ssd_common.py @@ -18,6 +18,9 @@ import tensorflow as tf import tf_extended as tfe +#注意tf_ssd_bboxes_encode和tf_ssd_bboxes_select的区别 +#前者是用来判断anchor和实际的ground truth的交集,进而编码每个anchor的classes,bboxes和scores,用于训练; +#后者则是用来从训练好的模型预测图片的输出predictions,localizations中挑选比较靠谱的预测值,用来做结果的可视化; # =========================================================================== # # TensorFlow implementation of boxes SSD encoding / decoding. @@ -44,7 +47,10 @@ def tf_ssd_bboxes_encode_layer(labels, (target_labels, target_localizations, target_scores): Target Tensors. """ # Anchors coordinates and volume. + #这儿我们拿conv4_3层的参数进行举例,方便理解 + #yref.shape:(38,38),xref.shape:(38,38),href.shape:(4,),wref.shape:(4,) yref, xref, href, wref = anchors_layer + #这样经过运算,这下我们得到的就是(y_min/y_max).shape:(38,38,4),(x_min/x_max).shape:(38,38,4) ymin = yref - href / 2. xmin = xref - wref / 2. ymax = yref + href / 2. @@ -52,6 +58,7 @@ def tf_ssd_bboxes_encode_layer(labels, vol_anchors = (xmax - xmin) * (ymax - ymin) # Initialize tensors... + #同样拿conv4_3层进行举例,可得shape=(38,38,4) shape = (yref.shape[0], yref.shape[1], href.size) feat_labels = tf.zeros(shape, dtype=tf.int64) feat_scores = tf.zeros(shape, dtype=dtype) @@ -61,6 +68,7 @@ def tf_ssd_bboxes_encode_layer(labels, feat_ymax = tf.ones(shape, dtype=dtype) feat_xmax = tf.ones(shape, dtype=dtype) + #同样拿conv4_3举例,经过运算,我们得到的jaccard矩阵的shape为(38,38,4),它得到的值在(0.0~1.0)之间,代表anchor和ground truth box的iou def jaccard_with_anchors(bbox): """Compute jaccard score between a box and the anchors. """ @@ -100,25 +108,40 @@ def condition(i, feat_labels, feat_scores, def body(i, feat_labels, feat_scores, feat_ymin, feat_xmin, feat_ymax, feat_xmax): """Body: update feature labels, scores and bboxes. + #意思是当iou大于0.5的时候,我们就对其进行赋值,但是什么时候才更新呢,直到该anchor与所有ground truth box求得的最大的值进行赋值! Follow the original SSD paper for that purpose: - assign values when jaccard > 0.5; - only update if beat the score of other bboxes. """ # Jaccard score. + #注意此时代表的是一张图片中的几个ground truth,只有几个啦,然后一个anchor就可以对应一个ground truth或者不对应,一个ground truth可以对应多个anchor! + #label代表当前的ground truth box的label,因为cond,body的设置,我们可以看到,是用所有的anchors和第i个ground truth box进行 + #计算,最后得到feat_labels,feat_locations,feat_scores label = labels[i] bbox = bboxes[i] jaccard = jaccard_with_anchors(bbox) # Mask: check threshold + scores + no annotations + num_classes. + # mask.shape:(gc,gc,n_gc),bool型 mask = tf.greater(jaccard, feat_scores) # mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold)) mask = tf.logical_and(mask, feat_scores > -0.5) mask = tf.logical_and(mask, label < num_classes) + #这两个分别是干什么的???一个tf.int64,可以看做xijp,那么另一个tf.float32呢,也就是fmask是干什么的东东? + #int型shape:(gc,gc,n_gc) imask = tf.cast(mask, tf.int64) + #float型shape:(gc,gc,n_gc) fmask = tf.cast(mask, dtype) # Update values using mask. + #注意针对feat_labels或者feat_scores的更新,我们是不断迭代完成的!!! feat_labels = imask * label + (1 - imask) * feat_labels + #注意tf.where函数的用法,https://blog.csdn.net/qq_19332527/article/details/78671280 + #当x,y都没给定的时候,我们就根据condtion来选择输出,condition为True的时候输出对应的坐标! 
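    # (Here x and y are both given, so tf.where does element-wise selection.
    #  Illustrative example with made-up values:
    #  tf.where([True, False, True], [1., 2., 3.], [7., 8., 9.]) -> [1., 8., 3.])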
+ #when x,y are not None,we choose x's values or y's values to output according to condition, + #when condition is True,we choose x's values,else we choose y's value! feat_scores = tf.where(mask, jaccard, feat_scores) + #fx=t*b+(1-t)*fx + #这种设置的原因在于,右边的fx(feat_**)代表的是以往的信息 feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax @@ -142,11 +165,13 @@ def body(i, feat_labels, feat_scores, feat_ymin, feat_xmin, feat_ymax, feat_xmax]) # Transform to center / size. + #计算补偿后的中心 feat_cy = (feat_ymax + feat_ymin) / 2. feat_cx = (feat_xmax + feat_xmin) / 2. feat_h = feat_ymax - feat_ymin feat_w = feat_xmax - feat_xmin # Encode features. + feat_cy = (feat_cy - yref) / href / prior_scaling[0] feat_cx = (feat_cx - xref) / wref / prior_scaling[1] feat_h = tf.log(feat_h / href) / prior_scaling[2] @@ -252,6 +277,10 @@ def tf_ssd_bboxes_decode(feat_localizations, # =========================================================================== # # SSD boxes selection. # =========================================================================== # +#针对每一层的predictions_layer和localizations_layer进行挑选候选框和对应的位置信息! +#注意此时传入的predictions_layer和localizations_layer这些Tensor的维度是5维度, +#例如conv4的predictions_layer的shape为(batch,38,38,4,21),localizations_layer的shape为(batch,38,38,4,4) +#不用太在意维度了,因为在里面我们都会reshape的,然后进行转化! def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer, select_threshold=None, num_classes=21, @@ -274,9 +303,11 @@ def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer, [predictions_layer, localizations_layer]): # Reshape features: Batches x N x N_labels | 4 p_shape = tfe.get_shape(predictions_layer) + #reshape之后predictioN_layer的shape转变为(batch,n*n*num_layer_anchors,num_classes) predictions_layer = tf.reshape(predictions_layer, tf.stack([p_shape[0], -1, p_shape[-1]])) l_shape = tfe.get_shape(localizations_layer) + #reshape之后localizations_layer的shape转化为(batch,n*n*num_layer_anchors,4) localizations_layer = tf.reshape(localizations_layer, tf.stack([l_shape[0], -1, l_shape[-1]])) @@ -285,9 +316,13 @@ def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer, for c in range(0, num_classes): if c != ignore_class: # Remove boxes under the threshold. + # 拿到每个预测类的得分,shapes的shape为(batch,n*n*num_layer_anchors),predictions_layer的shape为(batch,n*n*num_layer_anchors,num_classes) scores = predictions_layer[:, :, c] + # 转化,根据该得分判断是否需要保留bboxes,小于select_threshold的候选框都被丢弃 fmask = tf.cast(tf.greater_equal(scores, select_threshold), scores.dtype) + #小于select_threshold的scores都设置为0, scores = scores * fmask + #小于select_threshold的bboxes都设置为0, bboxes = localizations_layer * tf.expand_dims(fmask, axis=-1) # Append to dictionary. d_scores[c] = scores diff --git a/nets/ssd_vgg_300.py b/nets/ssd_vgg_300.py index 46c09bf5f..0773e5ea7 100644 --- a/nets/ssd_vgg_300.py +++ b/nets/ssd_vgg_300.py @@ -91,6 +91,7 @@ class SSDNet(object): conv11 ==> 1 x 1 The default image size used to train this network is 300x300. 
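    With the default parameters below, the six feature layers contribute
    38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 = 8732 anchor boxes
    per image.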
""" + #注意每个参数多代表的含义,注意这儿的anchor_size_bounds和anchor_sizes不匹配,因为根据anchor_sizes的比例,我们可以得到Smin和Smax为[0.07,0.87] default_params = SSDParams( img_shape=(300, 300), num_classes=21, @@ -117,6 +118,7 @@ class SSDNet(object): [2, .5, 3, 1./3], [2, .5], [2, .5]], + #还有anchor_steps代表的每一个feature map中的grid_cell相对于原图的像素比例,例如conv4_3为(38,38),300/38.0=7.89~=8 anchor_steps=[8, 16, 32, 64, 100, 300], anchor_offset=0.5, normalizations=[20, -1, -1, -1, -1, -1], @@ -188,6 +190,7 @@ def anchors(self, img_shape, dtype=np.float32): self.params.anchor_offset, dtype) + #使用ssd的anchors编码ground truth的label和bbox,对所有的特征层编码box def bboxes_encode(self, labels, bboxes, anchors, scope=None): """Encode labels and bounding boxes. @@ -302,7 +305,10 @@ def ssd_feat_shapes_from_net(predictions, default_shapes=None): feat_shapes.append(shape) return feat_shapes - +#这个函数可以得到每层的feature map中的每个feature map cell(简称为fmc)相对于原图的中心点的坐标 +#和相对于当前feature map的宽和高!!!注意h和w都是相对于当前feature map的!!! +#可以得到(y/x).shape--->(g_c,g_c),(w/h).shape--->(n_g_c,),别担心,后面通过tensor的操作可以转化为(g_c,g_c,n_g_c),这里假定‘后feature map’ +#的w和h相同,都为g_c,n_g_c代表当前这层所设置的每个grid_cell所对应的anchor的数量! def ssd_anchor_one_layer(img_shape, feat_shape, sizes, @@ -357,7 +363,8 @@ def ssd_anchor_one_layer(img_shape, w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r) return y, x, h, w - +#通过list的append操作,我们可以得到所有层的anchors的中心点坐标和相对当前“后feature map”的宽高! +#layers_anchors=[(N*38*38*4),(N*19*19*6),(N*10*10*6),(N*5*5*6),(N*3*3*4),(N*1*1*4)],list中我们标示的是tensor所对应的shape! def ssd_anchors_all_layers(img_shape, layers_shape, anchor_sizes, @@ -397,7 +404,7 @@ def tensor_shape(x, rank=3): return [s if s is not None else d for s, d in zip(static_shape, dynamic_shape)] - +#对conv4_3,conv7,conv8,conv9,conv10,conv11层分别进行再次3*3的conv操作,可以将其转化为 def ssd_multibox_layer(inputs, num_classes, sizes, @@ -414,12 +421,18 @@ def ssd_multibox_layer(inputs, # Location. num_loc_pred = num_anchors * 4 + #对conv4_3,conv7,conv8,conv9,conv10,conv11中我们选定的某一层进行conv操作,注意filter的输出, + #这样我们可以转化为(N,g_c,g_c,nlp),注意nlp所代表的含义,即得到对应层的坐标预测输出!!! + #拿conv4_3举例,得到的feature map的shape为(N,38,38,256),这样转化之后可得为(N,38,38,4*4)~ loc_pred = slim.conv2d(net, num_loc_pred, [3, 3], activation_fn=None, scope='conv_loc') loc_pred = custom_layers.channel_to_last(loc_pred) loc_pred = tf.reshape(loc_pred, tensor_shape(loc_pred, 4)[:-1]+[num_anchors, 4]) # Class prediction. + #对conv4_3,conv7,conv8,conv9,conv10,conv11中我们选定的某一层进行conv操作, + #注意filter的输出,这样我们可以转化为(N,g_c,g_c,ncp),注意ncp所代表的含义,即得到对应曾的分类预测输出! + #拿conv4_3举例,得到的feature map的shape为(N,38,38,256),这样转化之后可得为(N,38,38,4*21)~ num_cls_pred = num_anchors * num_classes cls_pred = slim.conv2d(net, num_cls_pred, [3, 3], activation_fn=None, scope='conv_cls') @@ -504,6 +517,12 @@ def ssd_net(inputs, end_points[end_point] = net # Prediction and localisations layers. + #对conv4_3,conv7,conv8,conv9,conv10,conv11中我们选定的某一层进行conv操作, + #得到对应层的分类预测和回归预测! + #这样可以得到predictions/logits的为 + #[(n,38,38,4*21),(n,19,19,6*21),(n,10,10,6*21),(n,5,5,6*21),(n,3,3,4*21),(n,1,1,4*21)],list中的每个元素代表的该tensor的shape + #localisations输出为 + #[(n,38,38,4*4),(n,19,19,6*4),(n,10,10,6*4),(n,5,5,6*4),(n,3,3,4*4),(n,1,1,4*4)],list中的每个元素代表的该tensor的shape predictions = [] logits = [] localisations = [] @@ -594,16 +613,24 @@ def ssd_losses(logits, localisations, fgscores = [] flocalisations = [] fglocalisations = [] + #我们已经看过了上面的logits的输出,现在我们来看看loss中怎么进行处理的! + #因为logits/localisations这个list中有6个tensor,对应了6个不同层的预测/分类输出, + #这样没法处理,所以我们先进行flatten,而后concat,方便进行处理! 
for i in range(len(logits)): + #reshape之后,flogits中分别得到的shape为(N*5776,21),(N*1444,21),(N*600,21),(N*150,21),(N*36,21),(N*4,21) + #5776=38*38*4,即将logits[i] reshape成了shape[:-1],21 flogits.append(tf.reshape(logits[i], [-1, num_classes])) fgclasses.append(tf.reshape(gclasses[i], [-1])) fgscores.append(tf.reshape(gscores[i], [-1])) + #reshape之后,flocalisations中分别得到的shape为(N*5776,4),(N*1444,4),(N*600,4),(N*150,4),(N*36,4),(N*4,4) flocalisations.append(tf.reshape(localisations[i], [-1, 4])) fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4])) # And concat the crap! + #然后我们进行concat操作,这样就可以得到logits的shape为(8732*N,21) logits = tf.concat(flogits, axis=0) gclasses = tf.concat(fgclasses, axis=0) gscores = tf.concat(fgscores, axis=0) + #localisations的shape为(8732*N,4) localisations = tf.concat(flocalisations, axis=0) glocalisations = tf.concat(fglocalisations, axis=0) dtype = logits.dtype @@ -638,12 +665,15 @@ def ssd_losses(logits, localisations, with tf.name_scope('cross_entropy_pos'): loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=gclasses) + #注意我们求得的正负样本,然后就可以计算相应的损失了,注意losses*fpmask,这样就可以计算正样本的损失了!!! loss = tf.div(tf.reduce_sum(loss * fpmask), batch_size, name='value') + tf.losses.add_loss(loss) with tf.name_scope('cross_entropy_neg'): loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=no_classes) + #注意losses*fnmask,这样就可以计算负样本的损失了!!! loss = tf.div(tf.reduce_sum(loss * fnmask), batch_size, name='value') tf.losses.add_loss(loss) @@ -655,7 +685,8 @@ def ssd_losses(logits, localisations, loss = tf.div(tf.reduce_sum(loss * weights), batch_size, name='value') tf.losses.add_loss(loss) - +#这个函数更容易理解,因为根据iou值得到正负样本,然后再来训练ssd网络,loss函数的求解方法: +#针对分类损失,我们分为两个,分别是正样本的损失和负样本的损失,保持正负样本的比例为1:3,得到的效果最好,在这里我们有tf.losses.compute_weighted_loss体现 def ssd_losses_old(logits, localisations, gclasses, glocalisations, gscores, match_threshold=0.5, diff --git a/notebooks/eval_video.py b/notebooks/eval_video.py new file mode 100644 index 000000000..eba777db7 --- /dev/null +++ b/notebooks/eval_video.py @@ -0,0 +1,138 @@ +#coding=utf-8 + +import os +import math +import random + +import numpy as np +import tensorflow as tf +import cv2 + +slim = tf.contrib.slim + +import matplotlib.pyplot as plt +import matplotlib.image as mpimg +import sys +sys.path.append('../') + +from nets import ssd_vgg_300, ssd_common, np_methods +from preprocessing import ssd_vgg_preprocessing +#from notebooks import visualization +import visualization +# TensorFlow session: grow memory when needed. TF, DO NOT USE ALL MY GPU MEMORY!!! +gpu_options = tf.GPUOptions(allow_growth=True) +config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options) +isess = tf.InteractiveSession(config=config) +VOC_LABELS = { + 0:'none', + 1:'aeroplane', + 2:'bicycle', + 3:'bird' , + 4:'boat', + 5:'bottle', + 6:'bus', + 7:'car', + 8:'cat', + 9:'chair', + 10:'cow', + 11:'diningtable', + 12:'dog', + 13:'horse', + 14:'motorbike', + 15:'person', + 16:'pottedplant', + 17:'sheep', + 18:'sofa', + 19:'train', + 20:'tvmonitor', +} +# Input placeholder. +net_shape = (300, 300) +data_format = 'NHWC' +img_input = tf.placeholder(tf.uint8, shape=(None, None, 3)) +# Evaluation pre-processing: resize to SSD net shape. +image_pre, labels_pre, bboxes_pre, bbox_img = ssd_vgg_preprocessing.preprocess_for_eval( + img_input, None, None, net_shape, data_format, resize=ssd_vgg_preprocessing.Resize.WARP_RESIZE) +image_4d = tf.expand_dims(image_pre, 0) + +# Define the SSD model. 
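# Re-use the existing variables if the SSD graph was already built earlier in this
# interactive session (i.e. 'ssd_net' is already defined); otherwise create them fresh.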
+reuse = True if 'ssd_net' in locals() else None +ssd_net = ssd_vgg_300.SSDNet() +with slim.arg_scope(ssd_net.arg_scope(data_format=data_format)): + predictions, localisations, _, _ = ssd_net.net(image_4d, is_training=False, reuse=reuse) + +# Restore SSD model. +ckpt_filename = '../logs/model.ckpt-20154' #修改为你的模型路径 +#ckpt_filename = 'checkpoints/ssd_300_vgg.ckpt' +isess.run(tf.global_variables_initializer()) +saver = tf.train.Saver() +saver.restore(isess, ckpt_filename) + +# SSD default anchor boxes. +ssd_anchors = ssd_net.anchors(net_shape) + +# Main image processing routine. +def process_image(img, select_threshold=0.5, nms_threshold=.45, net_shape=(300, 300)): + # Run SSD network. + rimg, rpredictions, rlocalisations, rbbox_img = isess.run([image_4d, predictions, localisations, bbox_img], + feed_dict={img_input: img}) + + # Get classes and bboxes from the net outputs. + rclasses, rscores, rbboxes = np_methods.ssd_bboxes_select( + rpredictions, rlocalisations, ssd_anchors, + select_threshold=select_threshold, img_shape=net_shape, num_classes=21, decode=True) + + rbboxes = np_methods.bboxes_clip(rbbox_img, rbboxes) + rclasses, rscores, rbboxes = np_methods.bboxes_sort(rclasses, rscores, rbboxes, top_k=400) + rclasses, rscores, rbboxes = np_methods.bboxes_nms(rclasses, rscores, rbboxes, nms_threshold=nms_threshold) + # Resize bboxes to original image shape. Note: useless for Resize.WARP! + rbboxes = np_methods.bboxes_resize(rbbox_img, rbboxes) + return rclasses, rscores, rbboxes + +def bboxes_draw_on_img(img, classes, scores, bboxes, color=[255, 0, 0], thickness=2): + shape = img.shape + for i in range(bboxes.shape[0]): + bbox = bboxes[i] + #color = colors[classes[i]] + # Draw bounding box... + p1 = (int(bbox[0] * shape[0]), int(bbox[1] * shape[1])) + p2 = (int(bbox[2] * shape[0]), int(bbox[3] * shape[1])) + cv2.rectangle(img, p1[::-1], p2[::-1], color, thickness) + # Draw text... 
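        # Coordinate note: rbboxes come back as [ymin, xmin, ymax, xmax] in relative
        # [0, 1] units, so p1/p2 are built as (row, col) pixel pairs and reversed with
        # [::-1] into the (x, y) order that OpenCV drawing functions expect.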
+ s = '%s/%.3f' % (VOC_LABELS[int(classes[i])], scores[i]) + p1 = (p1[0]-5, p1[1]) + cv2.putText(img, s, p1[::-1], cv2.FONT_HERSHEY_DUPLEX, 0.4, color, 1) + +cap = cv2.VideoCapture("/media/hp/CCSA_X64FRE/2.MP4") #修改为你的路径 +#cap = cv2.VideoCapture(0) + +# Define the codec and create VideoWriter object +#fourcc = cv2.cv.FOURCC(*'XVID') +fourcc = cv2.VideoWriter_fourcc(*'XVID') +out = cv2.VideoWriter('/media/hp/CCSA_X64FRE/2_handle.MP4', fourcc, 20, (1280, 720)) + + + +num=0 + +while cap.isOpened(): + # get a frame + rval, frame = cap.read() + # save a frame + if rval==True: + # frame = cv2.flip(frame,0) + rclasses, rscores, rbboxes=process_image(frame) + bboxes_draw_on_img(frame,rclasses,rscores,rbboxes) + print(rclasses) + out.write(frame) + num=num+1 + print(num) + else: + break + # show a frame + cv2.imshow("capture", frame) + if cv2.waitKey(1) & 0xFF == ord('q'): + break +cap.release() +out.release() +cv2.destroyAllWindows() diff --git a/notebooks/label2xml.py b/notebooks/label2xml.py new file mode 100644 index 000000000..1f045a06b --- /dev/null +++ b/notebooks/label2xml.py @@ -0,0 +1,97 @@ +#coding=utf-8 +import glob +import os + +s1=""" + {0} + Unspecified + 0 + 0 + + {1} + {2} + {3} + {4} + + """ + +s2=""" + VOC2007 + {0} + + My Database + VOC2007 + flickr + NULL + + + NULL + J + + + 512 + 512 + 3 + + 0 + + {1} + Unspecified + 0 + 0 + + {2} + {3} + {4} + {5} + + {6} + +""" +# dict_={0:"micai_jianzhu",1:"yiban_jianzhu",2:"micai_youguan",3:"yiban_youguan",4:"micai_leida",5:"yiban_leida"} +# dict_={1:"car_1",2:"car_2",3:"car_3",4:"car_4"} +dict_={0:"camouflage_car",1:"non_camouflage_car"} +def convert2xml(label_dir,dst_dir_xml): + if os.path.exists(dst_dir_xml)==False: + os.mkdir(dst_dir_xml) + textlist=glob.glob(os.path.join(label_dir,'*.txt')) + # print(len(textlist)) + for text_ in textlist: + flabel = open(text_, 'r') + lb = flabel.readlines() + flabel.close() + lb=[line.strip() for line in lb] + ob2 = "" + x1=lb[0].split(' ')[0] + x1=dict_[int(x1)] + #注意这里如果给定的是中心点的坐标和宽高,需要转化为左上角和右下角的坐标,否则则不转化 + x3=lb[0].split(" ")[1:] + x3=[int(float(x3[0])-float(x3[2])/2.0),int(float(x3[1])-float(x3[3])/2.0),int(float(x3[0])+float(x3[2])/2.0),int(float(x3[1])+float(x3[3])/2.0)] + + if len(lb)>1: # extra annotation + for i in range(1,len(lb)): + cls_id=lb[i].split(' ')[0] + cls_id=dict_[int(cls_id)] + x3_tmp=lb[i].split(' ')[1:] + x3_tmp=[int(float(x3_tmp[0])-float(x3_tmp[2])/2.0),int(float(x3_tmp[1])-float(x3_tmp[3])/2.0),int(float(x3_tmp[0])+float(x3_tmp[2])/2.0),int(float(x3_tmp[1])+float(x3_tmp[3])/2.0)] + ob2+='\n' + s1.format(cls_id,x3_tmp[0],x3_tmp[1],x3_tmp[2],x3_tmp[3]) + # imgname=text_.split("/")[-1].split(".")[0]+'.jpg' + # savename=os.path.join(dst_dir_xml,text_.split("/")[-1].split(".")[0]+'.xml') + tmp_name=text_.split("/")[-1].split(".") + pre_name="" + for i in range(len(tmp_name)-1): + pre_name=pre_name+tmp_name[i]+"." 
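        # pre_name is rebuilt piece by piece so that label files whose names contain
        # extra dots keep everything except the final extension; it already ends with
        # '.', so 'jpg' / 'xml' can be appended to it directly below.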
+ # print(pre_name) + imgname = pre_name+'jpg' + savename = os.path.join(dst_dir_xml,pre_name+'xml') + f = open(savename, 'w') + ob1=s2.format(imgname, x1, x3[0],x3[1],x3[2],x3[3], ob2) + f.write(ob1) + f.close() + +if __name__=="__main__": + # label_dir="/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/labels_std" + # dst_dir_xml="/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/Annotations" + label_dir="/home/hp/Data/car_train_data/labels_initial_csv" + dst_dir_xml="/home/hp/Data/car_train_data/VOC2007/Annotations" + convert2xml(label_dir,dst_dir_xml) diff --git a/notebooks/ssd_notebook.py b/notebooks/ssd_notebook.py new file mode 100644 index 000000000..48ba63905 --- /dev/null +++ b/notebooks/ssd_notebook.py @@ -0,0 +1,130 @@ + +# coding: utf-8 + +# In[28]: + + +import os +import math +import random + +import numpy as np +import tensorflow as tf +import cv2 + +slim = tf.contrib.slim + + +# In[29]: + + +import matplotlib.pyplot as plt +import matplotlib.image as mpimg + + +# In[30]: + + +import sys +sys.path.append('../') + + +# In[31]: + + +from nets import ssd_vgg_300, ssd_common, np_methods,ssd_vgg_512 +from preprocessing import ssd_vgg_preprocessing +import visualization + + +# In[32]: + + +# TensorFlow session: grow memory when needed. TF, DO NOT USE ALL MY GPU MEMORY!!! +gpu_options = tf.GPUOptions(allow_growth=True) +config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options) +isess = tf.InteractiveSession(config=config) + + +# ## SSD 300 Model +# +# The SSD 300 network takes 300x300 image inputs. In order to feed any image, the latter is resize to this input shape (i.e.`Resize.WARP_RESIZE`). Note that even though it may change the ratio width / height, the SSD model performs well on resized images (and it is the default behaviour in the original Caffe implementation). +# +# SSD anchors correspond to the default bounding boxes encoded in the network. The SSD net output provides offset on the coordinates and dimensions of these anchors. + +# In[33]: + + +# Input placeholder. +net_shape = (300, 300) +data_format = 'NHWC' +img_input = tf.placeholder(tf.uint8, shape=(None, None, 3)) +# Evaluation pre-processing: resize to SSD net shape. +image_pre, labels_pre, bboxes_pre, bbox_img = ssd_vgg_preprocessing.preprocess_for_eval( + img_input, None, None, net_shape, data_format, resize=ssd_vgg_preprocessing.Resize.WARP_RESIZE) +image_4d = tf.expand_dims(image_pre, 0) + +# Define the SSD model. +reuse = True if 'ssd_net' in locals() else None +ssd_net = ssd_vgg_300.SSDNet() +with slim.arg_scope(ssd_net.arg_scope(data_format=data_format)): + predictions, localisations, _, _ = ssd_net.net(image_4d, is_training=False, reuse=reuse) + +# Restore SSD model. +##ckpt_filename = '../checkpoints_all/VGG_VOC0712_SSD_512x512_ft_iter_120000.ckpt/VGG_VOC0712_SSD_512x512_ft_iter_120000.ckpt' +#ckpt_filename="../checkpoints_all/VGG_VOC0712_SSD_300x300_iter_120000.ckpt/VGG_VOC0712_SSD_300x300_iter_120000.ckpt" +ckpt_filename="../logs/model.ckpt-20154" +isess.run(tf.global_variables_initializer()) +saver = tf.train.Saver() +saver.restore(isess, ckpt_filename) + +# SSD default anchor boxes. +ssd_anchors = ssd_net.anchors(net_shape) + + +# ## Post-processing pipeline +# +# The SSD outputs need to be post-processed to provide proper detections. 
Namely, we follow these common steps: +# +# * Select boxes above a classification threshold; +# * Clip boxes to the image shape; +# * Apply the Non-Maximum-Selection algorithm: fuse together boxes whose Jaccard score > threshold; +# * If necessary, resize bounding boxes to original image shape. + +# In[34]: + + +# Main image processing routine. +def process_image(img, select_threshold=0.5, nms_threshold=.45, net_shape=(300, 300)): + # Run SSD network. + rimg, rpredictions, rlocalisations, rbbox_img = isess.run([image_4d, predictions, localisations, bbox_img], + feed_dict={img_input: img}) + + # Get classes and bboxes from the net outputs. + rclasses, rscores, rbboxes = np_methods.ssd_bboxes_select( + rpredictions, rlocalisations, ssd_anchors, + select_threshold=select_threshold, img_shape=net_shape, num_classes=21, decode=True) + + rbboxes = np_methods.bboxes_clip(rbbox_img, rbboxes) + rclasses, rscores, rbboxes = np_methods.bboxes_sort(rclasses, rscores, rbboxes, top_k=400) + rclasses, rscores, rbboxes = np_methods.bboxes_nms(rclasses, rscores, rbboxes, nms_threshold=nms_threshold) + # Resize bboxes to original image shape. Note: useless for Resize.WARP! + rbboxes = np_methods.bboxes_resize(rbbox_img, rbboxes) + return rclasses, rscores, rbboxes + + +# In[21]: + + +# Test on some demo image and visualize output. +path = '../demo/' +image_names = sorted(os.listdir(path)) + +#img = mpimg.imread(path + image_names[-5]) +cur_path="/home/hp/Pictures/1.jpg" +path="/home/hp/Pictures/1_1.jpg" +img=mpimg.imread(cur_path) +rclasses, rscores, rbboxes = process_image(img) + +visualization.bboxes_draw_on_img(img, rclasses, rscores, rbboxes, visualization.colors_plasma,path) +#visualization.plt_bboxes(img, rclasses, rscores, rbboxes) diff --git a/notebooks/train_val_split.py b/notebooks/train_val_split.py new file mode 100644 index 000000000..1d7b1265b --- /dev/null +++ b/notebooks/train_val_split.py @@ -0,0 +1,39 @@ +#coding=utf-8 +import os +import random + +trainval_percent = 0.9 +train_percent = 0.8 +#路径修改为自己的路径 +dir_pre="/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/" +xmlfilepath = '/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/Annotations/' +txtsavepath = '/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/Main/' +total_xml = os.listdir(xmlfilepath) + +num = len(total_xml) +list = range(num) +tv = int(num * trainval_percent) +tr = int(tv * train_percent) +trainval = random.sample(list, tv) +train = random.sample(trainval, tr) + +ftrainval = open(os.path.join(dir_pre,'ImageSets/Main/trainval.txt'), 'w') +ftest = open(os.path.join(dir_pre,'ImageSets/Main/test.txt'), 'w') +ftrain = open(os.path.join(dir_pre,'ImageSets/Main/train.txt'), 'w') +fval = open(os.path.join(dir_pre,'ImageSets/Main/val.txt'), 'w') + +for i in list: + name = total_xml[i][:-4] + '\n' + if i in trainval: + ftrainval.write(name) + if i in train: + ftrain.write(name) + else: + fval.write(name) + else: + ftest.write(name) + +ftrainval.close() +ftrain.close() +fval.close() +ftest.close() diff --git a/preprocessing/ssd_vgg_preprocessing.py b/preprocessing/ssd_vgg_preprocessing.py index 413ad3428..b74d5d6c8 100644 --- a/preprocessing/ssd_vgg_preprocessing.py +++ b/preprocessing/ssd_vgg_preprocessing.py @@ -119,7 +119,12 @@ def apply_with_random_selector(x, func, num_cases): func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case) for case in range(num_cases)])[0] +#通过下面的两种操作,程序可以通过一张训练图衍生出许多训练样本,通过将训练图像进行预处理,训练得到的神经网络模型可以识别不同大小,方位,色彩等方面的实体。 +#操作包括:distort_color和distorted_bounding_box_crop两个操作 
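# In short: random colour jitter (distort_color) plus bbox-constrained random cropping
# (distorted_bounding_box_crop) let a single labelled image yield many different-looking
# training samples, which helps when the labelled dataset is small.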
+#通过下面两种设置然后我们将可以得到很多的训练样本,要不真是的训练样本实际是不够的! +#distort_color用于随机的调整图像的色彩,调整亮度,对比度,饱和色相的顺序会影响最后的结果, +#定义多种不同的顺序,随机选择,可以进一步降低无关因素对模型的影响 def distort_color(image, color_ordering=0, fast_mode=True, scope=None): """Distort the color of a Tensor image. @@ -173,6 +178,9 @@ def distort_color(image, color_ordering=0, fast_mode=True, scope=None): return tf.clip_by_value(image, 0.0, 1.0) + +#对图片进行预处理,将图片转化成神经网络的输入层数据。 +#表示随机裁剪图片,仅仅生成一个cropped_image def distorted_bounding_box_crop(image, labels, bboxes, @@ -182,11 +190,14 @@ def distorted_bounding_box_crop(image, max_attempts=200, clip_bboxes=True, scope=None): - """Generates cropped_image using a one of the bboxes randomly distorted. + """ + #注意这个函数的解析表明的是使用其中一个随机扭曲的bbox生成cropped_image + Generates cropped_image using a one of the bboxes randomly distorted. See `tf.image.sample_distorted_bounding_box` for more documentation. Args: + #观察这些参数可以发现,这个函数实际是对一张图片的多个gt bbox中随机选择一个(暂时理解为ground truth bbox)进行随机扭曲,返回cropped_image和bbox等等 image: 3-D Tensor of image (it will be converted to floats in [0, 1]). bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] where each coordinate is [0, 1) and the coordinates are arranged @@ -203,12 +214,21 @@ def distorted_bounding_box_crop(image, region of the image of the specified constraints. After `max_attempts` failures, return the entire image. scope: Optional scope for name_scope. + #注意返回值是一个tuple,分别是cropped_image和distorted bbox,这个我们主要参考tf.image.sample_distorted_bounding_box的实现 Returns: A tuple, a 3-D Tensor cropped_image and the distorted bbox """ with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bboxes]): # Each bounding box has shape [1, num_boxes, box coords] and # the coordinates are ordered [ymin, xmin, ymax, xmax]. + # 为什么要用sample_distorted_bounding_box的原因在于可以随机的截取图片中一个块,减小需要关注的物体大小对图像识别算法的影响 + # tf.image.sample_distorted_bounding_box的讲解主要参考:https://blog.csdn.net/tz_zs/article/details/77920116 + # 需要注意的是,返回值的类型为: + # begin: 和 image_size 具有相同的类型。包含 [offset_height, offset_width, 0] 的一维数组。作为 tf.slice 的输入。 + # size: 和 image_size 具有相同的类型。包含 [target_height, target_width, -1] 的一维数组。作为 tf.slice 的输入。 + # 根据begin,size两个参数我们可以tf.slice出来我们所需要的裁剪出来的小图,而bboxes主要用于在图像上面的显示bbox工作!!! + # 那么为什么bboxes的shape为[1,1,4]呢?是不是因为tf.image.sample_distorted_bounding_box函数仅裁出来了一个bbox呢? + # bboxes:shape为 [1, 1, 4] 的三维矩阵,数据类型为float32,表示随机变形后的边界框。作为 tf.image.draw_bounding_boxes 的输入。 bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box( tf.shape(image), bounding_boxes=tf.expand_dims(bboxes, 0), @@ -220,6 +240,8 @@ def distorted_bounding_box_crop(image, distort_bbox = distort_bbox[0, 0] # Crop the image to the specified bounding box. + # 注意tf.slice中的begin参数和size参数,begin.shape[-1]=0,size.shape[-1]=-1,可以由tf.image.sample_distorted_bouning_box中确定, + # 然后我们就可以从图像中裁剪我们所期望的小图! cropped_image = tf.slice(image, bbox_begin, bbox_size) # Restore the shape since the dynamic slice loses 3rd dimension. cropped_image.set_shape([None, None, 3]) @@ -229,9 +251,14 @@ def distorted_bounding_box_crop(image, labels, bboxes = tfe.bboxes_filter_overlap(labels, bboxes, threshold=BBOX_CROP_OVERLAP, assign_negative=False) + #注意我们的返回值cropped_image的shape为[None,None,3],不用担心,在后面preprocess_for_train中我们会怎样呢,对了没错, + #resize到ssd所需要的大小,所以不用担心哈! return cropped_image, labels, bboxes, distort_bbox +#实际上preprocess_for_train的原因在与缺少训练样本,我们进行处理之后可以增加训练样本,用于训练! +#具体这一块可以参考:TensorFlow图像预处理完整样例 https://blog.8hfq.com/?p=455博客,有详细的记录! +#对训练集的预处理! 
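# Rough order of operations inside preprocess_for_train below: bbox-constrained random
# crop -> bilinear resize to out_shape -> random horizontal flip -> random colour
# distortion -> rescale to [0, 255] and subtract the VGG channel means -> optional
# NHWC->NCHW transpose.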
def preprocess_for_train(image, labels, bboxes, out_shape, data_format='NHWC', scope='ssd_preprocessing_train'): @@ -239,7 +266,7 @@ def preprocess_for_train(image, labels, bboxes, Note that the actual resizing scale is sampled from [`resize_size_min`, `resize_size_max`]. - + #注意底下所给的参数和上面提供的参数不一致,因此我们在程序中关注它的实际参数就可以了! Args: image: A `Tensor` representing an image of arbitrary size. output_height: The height of the image after preprocessing. @@ -273,15 +300,19 @@ def preprocess_for_train(image, labels, bboxes, min_object_covered=MIN_OBJECT_COVERED, aspect_ratio_range=CROP_RATIO_RANGE) # Resize image to output size. + #因为distorted_bounding_box_crop返回的图像我们都已经set_shape为了[None,None,3],我们需要将其调整为网络所需要的输入大小, + #所以统一resize为out_shape大小!!! dst_image = tf_image.resize_image(dst_image, out_shape, method=tf.image.ResizeMethod.BILINEAR, align_corners=False) tf_summary_image(dst_image, bboxes, 'image_shape_distorted') # Randomly flip the image horizontally. + #随机左右翻转图像 dst_image, bboxes = tf_image.random_flip_left_right(dst_image, bboxes) # Randomly distort the colors. There are 4 ways to do it. + # 使用一种随机的顺序调整图像的色彩!!! dst_image = apply_with_random_selector( dst_image, lambda x, ordering: distort_color(x, ordering, fast_mode), @@ -289,14 +320,17 @@ def preprocess_for_train(image, labels, bboxes, tf_summary_image(dst_image, bboxes, 'image_color_distorted') # Rescale to VGG input scale. + # 注意dst_image的输出为0~1.0之间,我们需要进行调整恢复为0~255.0作为VGG网络的输入! image = dst_image * 255. + #对图像进行白化操作! image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN]) # Image data format. if data_format == 'NCHW': image = tf.transpose(image, perm=(2, 0, 1)) return image, labels, bboxes - +#对验证集的预处理!可以看出image为任意大小的图片,但是呢,仅仅只有一张,an image而已,所以我们可以看出tf_image.resize_image_bboxes_with_crop_or_pad +#输入的图片只是一个而已,shape为[None,None,3] def preprocess_for_eval(image, labels, bboxes, out_shape=EVAL_SIZE, data_format='NHWC', difficults=None, resize=Resize.WARP_RESIZE, diff --git a/preprocessing/tf_image.py b/preprocessing/tf_image.py index a96262661..321679bd0 100644 --- a/preprocessing/tf_image.py +++ b/preprocessing/tf_image.py @@ -163,6 +163,7 @@ def bboxes_crop_or_pad(bboxes, return bboxes +#通过中心裁剪或者填充将图片的shape设置为[target_height,target_width,channel] def resize_image_bboxes_with_crop_or_pad(image, bboxes, target_height, target_width): """Crops and/or pads an image to a target width and height. @@ -231,8 +232,11 @@ def equal_(x, y): offset_pad_height = max_(height_diff // 2, 0) # Maybe crop if needed. + # 提供的图片大于target_height,target_width,因此我们进行裁剪 height_crop = min_(target_height, height) width_crop = min_(target_width, width) + #tf.image.crop_to_bounding_box中的offset_crop_height, offset_crop_width是相对于left-top处的偏移 + #height_crop,width_crop是要截取的高和宽,因为crop_to_bounding_box是height,width进行输出的,因此我们使用这种形式!!! cropped = tf.image.crop_to_bounding_box(image, offset_crop_height, offset_crop_width, height_crop, width_crop) bboxes = bboxes_crop_or_pad(bboxes, @@ -240,6 +244,11 @@ def equal_(x, y): -offset_crop_height, -offset_crop_width, height_crop, width_crop) # Maybe pad if needed. 
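        # Worked example (made-up sizes): with a 500x400 input and a 300x300 target,
        # the crop step above keeps a centred 300x300 window; with a 200x250 input,
        # the pad step below zero-pads the borders up to 300x300. In both cases the
        # bboxes are shifted/rescaled accordingly by bboxes_crop_or_pad.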
+ #如果提供的图片的宽高小于height,width的话,我们需要进行填充操作 + '''Adds `offset_height` rows of zeros on top, `offset_width` columns of + zeros on the left, and then pads the image on the bottom and right + with zeros until it has dimensions `target_height`, `target_width`.''' + resized = tf.image.pad_to_bounding_box(cropped, offset_pad_height, offset_pad_width, target_height, target_width) bboxes = bboxes_crop_or_pad(bboxes, diff --git a/preprocessing/vgg_preprocessing.py b/preprocessing/vgg_preprocessing.py index a2d0f8644..f7aa98e04 100644 --- a/preprocessing/vgg_preprocessing.py +++ b/preprocessing/vgg_preprocessing.py @@ -226,7 +226,8 @@ def _mean_image_subtraction(image, means): num_channels = image.get_shape().as_list()[-1] if len(means) != num_channels: raise ValueError('len(means) must match the number of channels') - + #tf.split函数的作用:将后面tensor按照第几个维度划分成几个tensor,注意划分后的tensor的维度可以不同,所num_channels如果是数字,那就都相同, + #为数组的时候一般是不同的,如原来shape[2]为4,现在num_channels=[1,2,1]那么就不同,如果为[1,1,1,1]则等价于num_channels=4. channels = tf.split(2, num_channels, image) for i in range(num_channels): channels[i] -= means[i] diff --git a/train_ssd_network.py b/train_ssd_network.py index c93e2b4a5..5438a7896 100644 --- a/train_ssd_network.py +++ b/train_ssd_network.py @@ -229,6 +229,8 @@ def main(_): common_queue_min=10 * FLAGS.batch_size, shuffle=True) # Get for SSD network: image, labels, bboxes. + #slim.dataset_data_provider.DatasetDataProvider解读,https://blog.csdn.net/weixin_35653315/article/details/71023596 + #一次只返回一个img,需要组成batch进行训练!!! [image, shape, glabels, gbboxes] = provider.get(['image', 'shape', 'object/label', 'object/bbox']) @@ -238,6 +240,7 @@ def main(_): out_shape=ssd_shape, data_format=DATA_FORMAT) # Encode groundtruth labels and bboxes. + #注意此时仅仅只送入了一张图片进去,only one!!! gclasses, glocalisations, gscores = \ ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors) batch_shape = [1] + [len(ssd_anchors)] * 3 @@ -264,10 +267,23 @@ def clone_fn(batch_queue): """Allows data parallelism by creating multiple clones of network_fn.""" # Dequeue batch. + + ''' + ssd的核心代码在这一块,我们可以看到 + 1)编码真实的标签,相当于y_label:gclasses, glocalisations, gscores = \ + ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors) + 2) b_image, b_gclasses, b_glocalisations, b_gscores = \ + tf_utils.reshape_list(batch_queue.dequeue(), batch_shape) + 3)得到输出,相当于得到y_pred: predictions, localisations, logits, end_points = \ + ssd_net.net(b_image, is_training=True) + 4)计算损失: predictions, localisations, logits, end_points = \ + ssd_net.net(b_image, is_training=True) + ''' b_image, b_gclasses, b_glocalisations, b_gscores = \ tf_utils.reshape_list(batch_queue.dequeue(), batch_shape) # Construct SSD network. + arg_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay, data_format=DATA_FORMAT) with slim.arg_scope(arg_scope):
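The comments above repeatedly refer to the anchor-offset encoding: `tf_ssd_bboxes_encode_layer` turns a ground-truth box into offsets relative to an anchor when building training targets, and step 2 of the prediction pipeline in the README inverts that transform at inference time. The NumPy sketch below illustrates the round trip for a single anchor. It is only an illustration, not the repository's tensor implementation; the `prior_scaling` values `[0.1, 0.1, 0.2, 0.2]` are assumed here to match `SSDParams.prior_scaling` in `ssd_vgg_300.py`, and the anchor/box numbers are made up.

```python
import numpy as np

prior_scaling = [0.1, 0.1, 0.2, 0.2]  # assumed default, see SSDParams in ssd_vgg_300.py

def encode(box, anchor):
    """box/anchor given as (cy, cx, h, w); returns the regression target."""
    cy, cx, h, w = box
    yref, xref, href, wref = anchor
    return np.array([
        (cy - yref) / href / prior_scaling[0],
        (cx - xref) / wref / prior_scaling[1],
        np.log(h / href) / prior_scaling[2],
        np.log(w / wref) / prior_scaling[3],
    ])

def decode(offsets, anchor):
    """Inverse of encode(): recover (cy, cx, h, w) from predicted offsets."""
    yref, xref, href, wref = anchor
    return np.array([
        offsets[0] * prior_scaling[0] * href + yref,
        offsets[1] * prior_scaling[1] * wref + xref,
        href * np.exp(offsets[2] * prior_scaling[2]),
        wref * np.exp(offsets[3] * prior_scaling[3]),
    ])

# Hypothetical anchor and ground-truth box, both in relative coordinates.
anchor = (0.50, 0.50, 0.20, 0.20)
gt_box = (0.55, 0.48, 0.30, 0.25)

target = encode(gt_box, anchor)
print(target)                  # what the localisation head is trained to regress
print(decode(target, anchor))  # ~ [0.55, 0.48, 0.30, 0.25], the original box
```

The repository applies the same arithmetic over whole feature-map tensors (all anchors at once) rather than one box at a time, but the per-anchor relationship between box, anchor and offset is the one shown here.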