diff --git a/README.md b/README.md index 4137fb9fb..f36ba5472 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ +# 参考:https://zhuanlan.zhihu.com/p/33544892 讲解,注意SSD是属于Faster RCNN那一挂的,都有设置先验框,然后预测的仅仅只是偏移而已~ # SSD: Single Shot MultiBox Detector in TensorFlow +## 参考py-faster-rcnn的设置,可以发现其滤除的是confidence score小于0.05和nms thresh>0.3的值,然后生成检测文件~ + SSD is an unified framework for object detection with a single network. It has been originally introduced in this research [article](http://arxiv.org/abs/1512.02325). @@ -25,7 +28,13 @@ and then start a jupyter notebook with ```bash jupyter notebook notebooks/ssd_notebook.ipynb ``` +## 预测流程 +1、根据置信度确定其类别(置信度最大的为对应的类别),滤除背景类别及其置信度小于阈值的预测框. +2、根据先验框和预测的偏移还原回预测框,同时clip操作,防止预测框超出图片. +3、根据置信度对预测框由大到小进行排序,选择top-k个预测框. +4、运行NMS算法,设置NMS阈值,滤除掉那些重叠较大的框,得到最后的预测框. +### 注意NMS算法有两种理解,针对每一类进行NMS和针对所有类别进行NMS操作~ ## Datasets diff --git a/criteria.txt b/criteria.txt new file mode 100644 index 000000000..e2b0826ff --- /dev/null +++ b/criteria.txt @@ -0,0 +1,2 @@ +关于目标检测相关的东西,标准的检测方案如下: +http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/devkit_doc.html#SECTION00044000000000000000 diff --git a/mAP_tutorial.py b/mAP_tutorial.py new file mode 100644 index 000000000..51eee5967 --- /dev/null +++ b/mAP_tutorial.py @@ -0,0 +1,264 @@ +#coding=utf-8 +""" +参照voc_detection的标准的mAP计算方法 +用来计算我们自己的数据的值, +mean Average Precision指的是多个类的AP的平均, +怎么平均的mAP, +怎么弄呢,其实就是将多个类的AP求和,然后进行平均,即可得到检测的mAP值。 +""" +import numpy as np +import os +import matplotlib +import matplotlib.pyplot as plt +from pylab import mpl +import sys +reload(sys) +sys.setdefaultencoding('utf8') + +def voc_ap(rec, prec, use_07_metric=False): + if use_07_metric: + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. 
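# The branch above is the VOC07 11-point metric: take the best precision at each
# recall threshold 0.0, 0.1, ..., 1.0 and average the 11 values. The branch below
# instead integrates the exact area under the precision envelope, and it is the one
# actually used here, since gen_ap() calls voc_ap() with the default use_07_metric=False.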
+ else: + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + i = np.where(mrec[1:] != mrec[:-1])[0] + + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + +#如果是self_data数据,则需要对label进行转化,讲中心点加宽高转化为左上加右下 +def read_file(file,classname): + with open(file,'r') as f: + lines=f.readlines() + if classname==-1: + lines=[line.strip().split(" ") for line in lines] + bboxes_to=[] + for item in lines: + bboxes_to.append(item[1:]) + else: + lines_=[line.strip().split(" ") for line in lines] + lines=[] + bboxes_to=[] + for i,line in enumerate(lines_): + if int(line[0])==classname: + lines.append(line) + bboxes_to.append(line[1:]) + # print(bboxes_to_[i]) + # bboxes=[] + # for bbox in bboxes_to: + # item=[float(bbox[0]),float(bbox[1]),float(bbox[2]),float(bbox[3])] + # bboxes.append(item) + # return np.array(bboxes),lines + bboxes = [] + for bbox in bboxes_to: + item = [float(bbox[0])-float(bbox[2])/2.0, float(bbox[1])-float(bbox[3])/2.0, float(bbox[0])+float(bbox[2])/2.0, float(bbox[1])+float(bbox[3])/2.0] + bboxes.append(item) + return np.array(bboxes),lines + +def convert(gt_dir,classname): + class_recs={} + npos=0 + for root,dirs,files in os.walk(gt_dir): + for i,file in enumerate(files): + cur_path=os.path.join(root,file) + bbox,R=read_file(cur_path,classname) + if classname!=-1: + det=[False]*len(R) + npos+=len(R) + class_recs[file]={"bbox":bbox,'det':det} + else: + gt_cls_id=[] + for item in R: + gt_cls_id.append(item[0]) + det = [False] * len(R) + npos += len(R) + class_recs[file] = {"bbox": bbox, 'det': det, "gt_cls_id":gt_cls_id} + print("正在转化中。。。"+str(len(files)-i)) + return class_recs,npos +#更加详细的资料可以查看https://github.com/Tangzixia/Object-Detection-Metrics#average-precision +#计算某个类的AP,class=-1时候代表计算所有类的mAP +def gen_ap(gt_dir,pred_res,classname,iou=0.5): + class_recs,npos=convert(gt_dir,classname) + with open(pred_res,'r') as f: + lines=f.readlines() + #img_id,confidence,BB, + #得分 + splitlines=[item.strip().split(" ") for item in lines] + img_ids=[x[0] for x in splitlines] + cls_flgs=np.array([x[1] for x in splitlines]) + confidence=np.array([float(x[2]) for x in splitlines]) + BB=np.array([[float(z) for z in x[3:]] for x in splitlines]) + + # 找出每一类对应的预测候选框 + # 如果是classname==-1,则说明需要计算所有类的mAP值,这时候需要得到所有的类别标签 + if classname!=-1: + inds=np.zeros(len(splitlines)) + for i,item in enumerate(splitlines): + if int(item[1])==classname: + inds[i]=1 + img_ids_=[] + confidence_=[] + BB_=[] + for i,item in enumerate(splitlines): + if inds[i]==1: + img_ids_.append(img_ids[i]) + confidence_.append(confidence[i]) + BB_.append(BB[i]) + img_ids=img_ids_ + confidence=np.array(confidence_) + BB=np.array(BB_) + # img_ids=list(np.array(img_ids[np.array(inds)])) + # confidence=list(np.array(confidence[np.array(inds)])) + # BB=list(np.array(BB[np.array(inds)])) + + + #confidence由大到小排序 + sorted_ind=np.argsort(-confidence) + # np.argsort(-confidence<=-.3) + sorted_ind1 = np.where(confidence[sorted_ind] >= .0)[0] + sorted_ind = sorted_ind[sorted_ind1] + print(len(sorted_ind)) + BB=BB[sorted_ind,:] + img_ids=[img_ids[x] for x in sorted_ind] + + # sorted_ind = np.argsort(-confidence) + # print(len(sorted_ind)) + # BB = BB[sorted_ind, :] + # img_ids = [img_ids[x] for x in sorted_ind] + + nd=len(img_ids) + print(nd) + tp=np.zeros(nd) + fp=np.zeros(nd) + + for d in range(nd): + R=class_recs[img_ids[d]] + bb=BB[d,:].astype(float) + ovmax = -np.inf + BBGT=R['bbox'].astype(float) + + if 
BBGT.size>0: + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1., 0.) + ih = np.maximum(iymax - iymin + 1., 0.) + inters = iw * ih + + uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + + (BBGT[:, 2] - BBGT[:, 0] + 1.) * + (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) + overlaps=inters/uni + ovmax=np.max(overlaps) + # print(ovmax) + jmax=np.argmax(overlaps) + if ovmax>iou: + if not R['det'][jmax]: + tp[d]=1 + R['det'][jmax]=1 + else: + fp[d]=1 + else: + fp[d]=1 + else: + # confidence由大到小排序 + sorted_ind = np.argsort(-confidence) + # np.argsort(-confidence<=-.3) + sorted_ind1 = np.where(confidence[sorted_ind] >= .3)[0] + sorted_ind = sorted_ind[sorted_ind1] + BB = BB[sorted_ind, :] + img_ids = [img_ids[x] for x in sorted_ind] + cls_flgs=cls_flgs[sorted_ind] + + # sorted_ind = np.argsort(-confidence) + # print(len(sorted_ind)) + # BB = BB[sorted_ind, :] + # img_ids = [img_ids[x] for x in sorted_ind] + + nd = len(img_ids) + print(nd) + tp = np.zeros(nd) + fp = np.zeros(nd) + + for d in range(nd): + R = class_recs[img_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R['bbox'].astype(float) + if BBGT.size > 0: + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1., 0.) + ih = np.maximum(iymax - iymin + 1., 0.) + inters = iw * ih + + uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + + (BBGT[:, 2] - BBGT[:, 0] + 1.) * + (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) + overlaps = inters / uni + ovmax = np.max(overlaps) + # print(ovmax) + jmax = np.argmax(overlaps) + if ovmax > iou and R['gt_cls_id'][jmax]==cls_flgs[d]: + if not R['det'][jmax]: + tp[d] = 1 + R['det'][jmax] = 1 + else: + fp[d] = 1 + else: + fp[d] = 1 + + fp=np.cumsum(fp) + tp=np.cumsum(tp) + rec=tp/float(npos) + prec=tp/np.maximum(tp+fp,np.finfo(np.float64).eps) + + ap = voc_ap(rec, prec) + return rec,prec,ap +def draw_plot(rec,prec,ap,name,path): + if os.path.exists(path)==False: + os.mkdir(path) + myfont = matplotlib.font_manager.FontProperties(fname="/usr/share/fonts/opentype/noto/NotoSansCJK.ttc") + mpl.rcParams['axes.unicode_minus'] = False + tick=np.arange(0,1.1,0.1) + plt.figure() + plt.title(name+":"+str(ap),fontproperties=myfont) + plt.xlabel("Recall") + plt.ylabel("Precision") + plt.axis([0,1,0,1.05]) + plt.xticks(tick) + plt.yticks(tick) + plt.plot(rec,prec) + # plt.show() + + plt.savefig(os.path.join(path,name+".png")) +if __name__=="__main__": + gt_dir = "/home/hp/Data/house_data/train/Data_valid/labels_initial/" + pred_res = "../res_self_data_0.0.txt" + mAP_file="/home/hp/Desktop/yolov3-res/map.txt" + + dict_ = {"0": u"迷彩建筑", "1": u"一般建筑", "2": u"迷彩油罐", "3": u"一般油罐", "4": u"迷彩雷达", "5": u"一般雷达"} + ap_list=[] + for i in range(6): + classname =i + rec, prec, ap = gen_ap(gt_dir, pred_res, classname) + draw_plot(rec,prec,ap,dict_[str(classname)],path="/home/hp/Desktop/yolov3-res/") + ap_list.append(ap) + print(rec, prec, ap) + with open(mAP_file,'w') as f: + for i,ap in enumerate(ap_list): + f.write(str(dict_[str(i)].decode('utf8'))+":"+str(ap)+"\n") + f.write("mAP:"+str(round(np.array(ap_list).mean(),4))) + print("mAP50的值为:",round(np.array(ap_list).mean(),4)) diff --git a/mAP_tutorial_std.py b/mAP_tutorial_std.py new file mode 100644 index 000000000..63d3e62d8 --- /dev/null +++ b/mAP_tutorial_std.py @@ -0,0 +1,232 @@ +# Copyright 
(c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +# +# Based on: +# -------------------------------------------------------- +# Fast/er R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Bharath Hariharan +# -------------------------------------------------------- + + +''' +每个类别的检测结果如下, +comp3_det_test_car.txt: + ... + 2009_000026 0.949297 172.000000 233.000000 191.000000 248.000000 + 2009_000032 0.013737 1.000000 147.000000 114.000000 242.000000 + 2009_000032 0.013737 1.000000 134.000000 94.000000 168.000000 + 2009_000035 0.063948 455.000000 229.000000 491.000000 243.000000 +''' + +"""Python implementation of the PASCAL VOC devkit's AP evaluation code.""" + +import cPickle +import logging +import numpy as np +import os +import xml.etree.ElementTree as ET + +logger = logging.getLogger(__name__) + + +def parse_rec(filename): + """Parse a PASCAL VOC xml file.""" + tree = ET.parse(filename) + objects = [] + for obj in tree.findall('object'): + obj_struct = {} + obj_struct['name'] = obj.find('name').text + obj_struct['pose'] = obj.find('pose').text + obj_struct['truncated'] = int(obj.find('truncated').text) + obj_struct['difficult'] = int(obj.find('difficult').text) + bbox = obj.find('bndbox') + obj_struct['bbox'] = [int(bbox.find('xmin').text), + int(bbox.find('ymin').text), + int(bbox.find('xmax').text), + int(bbox.find('ymax').text)] + objects.append(obj_struct) + + return objects + + +def voc_ap(rec, prec, use_07_metric=False): + """Compute VOC AP given precision and recall. If use_07_metric is true, uses + the VOC 07 11-point method (default:False). + """ + if use_07_metric: + # 11 point metric + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11. + else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def voc_eval(detpath, + annopath, + imagesetfile, + classname, + cachedir, + ovthresh=0.5, + use_07_metric=False): + """rec, prec, ap = voc_eval(detpath, + annopath, + imagesetfile, + classname, + [ovthresh], + [use_07_metric]) + Top level function that does the PASCAL VOC evaluation. + detpath: Path to detections + detpath.format(classname) should produce the detection results file. + annopath: Path to annotations + annopath.format(imagename) should be the xml annotations file. + imagesetfile: Text file containing the list of images, one image per line. 
+ classname: Category name (duh) + cachedir: Directory for caching the annotations + [ovthresh]: Overlap threshold (default = 0.5) + [use_07_metric]: Whether to use VOC07's 11 point AP computation + (default False) + """ + # assumes detections are in detpath.format(classname) + # assumes annotations are in annopath.format(imagename) + # assumes imagesetfile is a text file with each line an image name + # cachedir caches the annotations in a pickle file + + # first load gt + if not os.path.isdir(cachedir): + os.mkdir(cachedir) + imageset = os.path.splitext(os.path.basename(imagesetfile))[0] + cachefile = os.path.join(cachedir, imageset + '_annots.pkl') + # read list of images + with open(imagesetfile, 'r') as f: + lines = f.readlines() + imagenames = [x.strip() for x in lines] + + if not os.path.isfile(cachefile): + # load annots + recs = {} + for i, imagename in enumerate(imagenames): + recs[imagename] = parse_rec(annopath.format(imagename)) + if i % 100 == 0: + logger.info( + 'Reading annotation for {:d}/{:d}'.format( + i + 1, len(imagenames))) + # save + logger.info('Saving cached annotations to {:s}'.format(cachefile)) + with open(cachefile, 'w') as f: + cPickle.dump(recs, f) + else: + # load + with open(cachefile, 'r') as f: + recs = cPickle.load(f) + + # extract gt objects for this class + class_recs = {} + npos = 0 + for imagename in imagenames: + R = [obj for obj in recs[imagename] if obj['name'] == classname] + bbox = np.array([x['bbox'] for x in R]) + difficult = np.array([x['difficult'] for x in R]).astype(np.bool) + det = [False] * len(R) + npos = npos + sum(~difficult) + class_recs[imagename] = {'bbox': bbox, + 'difficult': difficult, + 'det': det} + + # read dets + detfile = detpath.format(classname) + with open(detfile, 'r') as f: + lines = f.readlines() + + splitlines = [x.strip().split(' ') for x in lines] + image_ids = [x[0] for x in splitlines] + confidence = np.array([float(x[1]) for x in splitlines]) + BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + R = class_recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + BBGT = R['bbox'].astype(float) + + if BBGT.size > 0: + # compute overlaps + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1., 0.) + ih = np.maximum(iymax - iymin + 1., 0.) + inters = iw * ih + + # union + uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + + (BBGT[:, 2] - BBGT[:, 0] + 1.) * + (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) + + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > ovthresh: + if not R['difficult'][jmax]: + if not R['det'][jmax]: + tp[d] = 1. + R['det'][jmax] = 1 + else: + fp[d] = 1. + else: + fp[d] = 1. 
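    # Summary of the loop above: a detection is a true positive only when its best
    # IoU with a ground-truth box exceeds ovthresh AND that box has not already been
    # claimed by a higher-scoring detection; a duplicate match of an already-claimed
    # box counts as a false positive, while matches to boxes flagged 'difficult' are
    # ignored entirely (neither TP nor FP).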
+ + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + + return rec, prec, ap diff --git a/nets/ssd_common.py b/nets/ssd_common.py index 7de1a7e7e..9999bbf91 100644 --- a/nets/ssd_common.py +++ b/nets/ssd_common.py @@ -18,6 +18,9 @@ import tensorflow as tf import tf_extended as tfe +#注意tf_ssd_bboxes_encode和tf_ssd_bboxes_select的区别 +#前者是用来判断anchor和实际的ground truth的交集,进而编码每个anchor的classes,bboxes和scores,用于训练; +#后者则是用来从训练好的模型预测图片的输出predictions,localizations中挑选比较靠谱的预测值,用来做结果的可视化; # =========================================================================== # # TensorFlow implementation of boxes SSD encoding / decoding. @@ -44,7 +47,10 @@ def tf_ssd_bboxes_encode_layer(labels, (target_labels, target_localizations, target_scores): Target Tensors. """ # Anchors coordinates and volume. + #这儿我们拿conv4_3层的参数进行举例,方便理解 + #yref.shape:(38,38),xref.shape:(38,38),href.shape:(4,),wref.shape:(4,) yref, xref, href, wref = anchors_layer + #这样经过运算,这下我们得到的就是(y_min/y_max).shape:(38,38,4),(x_min/x_max).shape:(38,38,4) ymin = yref - href / 2. xmin = xref - wref / 2. ymax = yref + href / 2. @@ -52,6 +58,7 @@ def tf_ssd_bboxes_encode_layer(labels, vol_anchors = (xmax - xmin) * (ymax - ymin) # Initialize tensors... + #同样拿conv4_3层进行举例,可得shape=(38,38,4) shape = (yref.shape[0], yref.shape[1], href.size) feat_labels = tf.zeros(shape, dtype=tf.int64) feat_scores = tf.zeros(shape, dtype=dtype) @@ -61,6 +68,7 @@ def tf_ssd_bboxes_encode_layer(labels, feat_ymax = tf.ones(shape, dtype=dtype) feat_xmax = tf.ones(shape, dtype=dtype) + #同样拿conv4_3举例,经过运算,我们得到的jaccard矩阵的shape为(38,38,4),它得到的值在(0.0~1.0)之间,代表anchor和ground truth box的iou def jaccard_with_anchors(bbox): """Compute jaccard score between a box and the anchors. """ @@ -100,25 +108,40 @@ def condition(i, feat_labels, feat_scores, def body(i, feat_labels, feat_scores, feat_ymin, feat_xmin, feat_ymax, feat_xmax): """Body: update feature labels, scores and bboxes. + #意思是当iou大于0.5的时候,我们就对其进行赋值,但是什么时候才更新呢,直到该anchor与所有ground truth box求得的最大的值进行赋值! Follow the original SSD paper for that purpose: - assign values when jaccard > 0.5; - only update if beat the score of other bboxes. """ # Jaccard score. + #注意此时代表的是一张图片中的几个ground truth,只有几个啦,然后一个anchor就可以对应一个ground truth或者不对应,一个ground truth可以对应多个anchor! + #label代表当前的ground truth box的label,因为cond,body的设置,我们可以看到,是用所有的anchors和第i个ground truth box进行 + #计算,最后得到feat_labels,feat_locations,feat_scores label = labels[i] bbox = bboxes[i] jaccard = jaccard_with_anchors(bbox) # Mask: check threshold + scores + no annotations + num_classes. + # mask.shape:(gc,gc,n_gc),bool型 mask = tf.greater(jaccard, feat_scores) # mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold)) mask = tf.logical_and(mask, feat_scores > -0.5) mask = tf.logical_and(mask, label < num_classes) + #这两个分别是干什么的???一个tf.int64,可以看做xijp,那么另一个tf.float32呢,也就是fmask是干什么的东东? + #int型shape:(gc,gc,n_gc) imask = tf.cast(mask, tf.int64) + #float型shape:(gc,gc,n_gc) fmask = tf.cast(mask, dtype) # Update values using mask. + #注意针对feat_labels或者feat_scores的更新,我们是不断迭代完成的!!! feat_labels = imask * label + (1 - imask) * feat_labels + #注意tf.where函数的用法,https://blog.csdn.net/qq_19332527/article/details/78671280 + #当x,y都没给定的时候,我们就根据condtion来选择输出,condition为True的时候输出对应的坐标! 
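    # (Here x and y are both given, so tf.where does element-wise selection.
    #  Illustrative example with made-up values:
    #  tf.where([True, False, True], [1., 2., 3.], [7., 8., 9.]) -> [1., 8., 3.])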
+ #when x,y are not None,we choose x's values or y's values to output according to condition, + #when condition is True,we choose x's values,else we choose y's value! feat_scores = tf.where(mask, jaccard, feat_scores) + #fx=t*b+(1-t)*fx + #这种设置的原因在于,右边的fx(feat_**)代表的是以往的信息 feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax @@ -142,11 +165,13 @@ def body(i, feat_labels, feat_scores, feat_ymin, feat_xmin, feat_ymax, feat_xmax]) # Transform to center / size. + #计算补偿后的中心 feat_cy = (feat_ymax + feat_ymin) / 2. feat_cx = (feat_xmax + feat_xmin) / 2. feat_h = feat_ymax - feat_ymin feat_w = feat_xmax - feat_xmin # Encode features. + feat_cy = (feat_cy - yref) / href / prior_scaling[0] feat_cx = (feat_cx - xref) / wref / prior_scaling[1] feat_h = tf.log(feat_h / href) / prior_scaling[2] @@ -252,6 +277,10 @@ def tf_ssd_bboxes_decode(feat_localizations, # =========================================================================== # # SSD boxes selection. # =========================================================================== # +#针对每一层的predictions_layer和localizations_layer进行挑选候选框和对应的位置信息! +#注意此时传入的predictions_layer和localizations_layer这些Tensor的维度是5维度, +#例如conv4的predictions_layer的shape为(batch,38,38,4,21),localizations_layer的shape为(batch,38,38,4,4) +#不用太在意维度了,因为在里面我们都会reshape的,然后进行转化! def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer, select_threshold=None, num_classes=21, @@ -274,9 +303,11 @@ def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer, [predictions_layer, localizations_layer]): # Reshape features: Batches x N x N_labels | 4 p_shape = tfe.get_shape(predictions_layer) + #reshape之后predictioN_layer的shape转变为(batch,n*n*num_layer_anchors,num_classes) predictions_layer = tf.reshape(predictions_layer, tf.stack([p_shape[0], -1, p_shape[-1]])) l_shape = tfe.get_shape(localizations_layer) + #reshape之后localizations_layer的shape转化为(batch,n*n*num_layer_anchors,4) localizations_layer = tf.reshape(localizations_layer, tf.stack([l_shape[0], -1, l_shape[-1]])) @@ -285,9 +316,13 @@ def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer, for c in range(0, num_classes): if c != ignore_class: # Remove boxes under the threshold. + # 拿到每个预测类的得分,shapes的shape为(batch,n*n*num_layer_anchors),predictions_layer的shape为(batch,n*n*num_layer_anchors,num_classes) scores = predictions_layer[:, :, c] + # 转化,根据该得分判断是否需要保留bboxes,小于select_threshold的候选框都被丢弃 fmask = tf.cast(tf.greater_equal(scores, select_threshold), scores.dtype) + #小于select_threshold的scores都设置为0, scores = scores * fmask + #小于select_threshold的bboxes都设置为0, bboxes = localizations_layer * tf.expand_dims(fmask, axis=-1) # Append to dictionary. d_scores[c] = scores diff --git a/nets/ssd_vgg_300.py b/nets/ssd_vgg_300.py index 46c09bf5f..0773e5ea7 100644 --- a/nets/ssd_vgg_300.py +++ b/nets/ssd_vgg_300.py @@ -91,6 +91,7 @@ class SSDNet(object): conv11 ==> 1 x 1 The default image size used to train this network is 300x300. 
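    With the default parameters below, the six feature layers contribute
    38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 = 8732 anchor boxes
    per image.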
""" + #注意每个参数多代表的含义,注意这儿的anchor_size_bounds和anchor_sizes不匹配,因为根据anchor_sizes的比例,我们可以得到Smin和Smax为[0.07,0.87] default_params = SSDParams( img_shape=(300, 300), num_classes=21, @@ -117,6 +118,7 @@ class SSDNet(object): [2, .5, 3, 1./3], [2, .5], [2, .5]], + #还有anchor_steps代表的每一个feature map中的grid_cell相对于原图的像素比例,例如conv4_3为(38,38),300/38.0=7.89~=8 anchor_steps=[8, 16, 32, 64, 100, 300], anchor_offset=0.5, normalizations=[20, -1, -1, -1, -1, -1], @@ -188,6 +190,7 @@ def anchors(self, img_shape, dtype=np.float32): self.params.anchor_offset, dtype) + #使用ssd的anchors编码ground truth的label和bbox,对所有的特征层编码box def bboxes_encode(self, labels, bboxes, anchors, scope=None): """Encode labels and bounding boxes. @@ -302,7 +305,10 @@ def ssd_feat_shapes_from_net(predictions, default_shapes=None): feat_shapes.append(shape) return feat_shapes - +#这个函数可以得到每层的feature map中的每个feature map cell(简称为fmc)相对于原图的中心点的坐标 +#和相对于当前feature map的宽和高!!!注意h和w都是相对于当前feature map的!!! +#可以得到(y/x).shape--->(g_c,g_c),(w/h).shape--->(n_g_c,),别担心,后面通过tensor的操作可以转化为(g_c,g_c,n_g_c),这里假定‘后feature map’ +#的w和h相同,都为g_c,n_g_c代表当前这层所设置的每个grid_cell所对应的anchor的数量! def ssd_anchor_one_layer(img_shape, feat_shape, sizes, @@ -357,7 +363,8 @@ def ssd_anchor_one_layer(img_shape, w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r) return y, x, h, w - +#通过list的append操作,我们可以得到所有层的anchors的中心点坐标和相对当前“后feature map”的宽高! +#layers_anchors=[(N*38*38*4),(N*19*19*6),(N*10*10*6),(N*5*5*6),(N*3*3*4),(N*1*1*4)],list中我们标示的是tensor所对应的shape! def ssd_anchors_all_layers(img_shape, layers_shape, anchor_sizes, @@ -397,7 +404,7 @@ def tensor_shape(x, rank=3): return [s if s is not None else d for s, d in zip(static_shape, dynamic_shape)] - +#对conv4_3,conv7,conv8,conv9,conv10,conv11层分别进行再次3*3的conv操作,可以将其转化为 def ssd_multibox_layer(inputs, num_classes, sizes, @@ -414,12 +421,18 @@ def ssd_multibox_layer(inputs, # Location. num_loc_pred = num_anchors * 4 + #对conv4_3,conv7,conv8,conv9,conv10,conv11中我们选定的某一层进行conv操作,注意filter的输出, + #这样我们可以转化为(N,g_c,g_c,nlp),注意nlp所代表的含义,即得到对应层的坐标预测输出!!! + #拿conv4_3举例,得到的feature map的shape为(N,38,38,256),这样转化之后可得为(N,38,38,4*4)~ loc_pred = slim.conv2d(net, num_loc_pred, [3, 3], activation_fn=None, scope='conv_loc') loc_pred = custom_layers.channel_to_last(loc_pred) loc_pred = tf.reshape(loc_pred, tensor_shape(loc_pred, 4)[:-1]+[num_anchors, 4]) # Class prediction. + #对conv4_3,conv7,conv8,conv9,conv10,conv11中我们选定的某一层进行conv操作, + #注意filter的输出,这样我们可以转化为(N,g_c,g_c,ncp),注意ncp所代表的含义,即得到对应曾的分类预测输出! + #拿conv4_3举例,得到的feature map的shape为(N,38,38,256),这样转化之后可得为(N,38,38,4*21)~ num_cls_pred = num_anchors * num_classes cls_pred = slim.conv2d(net, num_cls_pred, [3, 3], activation_fn=None, scope='conv_cls') @@ -504,6 +517,12 @@ def ssd_net(inputs, end_points[end_point] = net # Prediction and localisations layers. + #对conv4_3,conv7,conv8,conv9,conv10,conv11中我们选定的某一层进行conv操作, + #得到对应层的分类预测和回归预测! + #这样可以得到predictions/logits的为 + #[(n,38,38,4*21),(n,19,19,6*21),(n,10,10,6*21),(n,5,5,6*21),(n,3,3,4*21),(n,1,1,4*21)],list中的每个元素代表的该tensor的shape + #localisations输出为 + #[(n,38,38,4*4),(n,19,19,6*4),(n,10,10,6*4),(n,5,5,6*4),(n,3,3,4*4),(n,1,1,4*4)],list中的每个元素代表的该tensor的shape predictions = [] logits = [] localisations = [] @@ -594,16 +613,24 @@ def ssd_losses(logits, localisations, fgscores = [] flocalisations = [] fglocalisations = [] + #我们已经看过了上面的logits的输出,现在我们来看看loss中怎么进行处理的! + #因为logits/localisations这个list中有6个tensor,对应了6个不同层的预测/分类输出, + #这样没法处理,所以我们先进行flatten,而后concat,方便进行处理! 
for i in range(len(logits)): + #reshape之后,flogits中分别得到的shape为(N*5776,21),(N*1444,21),(N*600,21),(N*150,21),(N*36,21),(N*4,21) + #5776=38*38*4,即将logits[i] reshape成了shape[:-1],21 flogits.append(tf.reshape(logits[i], [-1, num_classes])) fgclasses.append(tf.reshape(gclasses[i], [-1])) fgscores.append(tf.reshape(gscores[i], [-1])) + #reshape之后,flocalisations中分别得到的shape为(N*5776,4),(N*1444,4),(N*600,4),(N*150,4),(N*36,4),(N*4,4) flocalisations.append(tf.reshape(localisations[i], [-1, 4])) fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4])) # And concat the crap! + #然后我们进行concat操作,这样就可以得到logits的shape为(8732*N,21) logits = tf.concat(flogits, axis=0) gclasses = tf.concat(fgclasses, axis=0) gscores = tf.concat(fgscores, axis=0) + #localisations的shape为(8732*N,4) localisations = tf.concat(flocalisations, axis=0) glocalisations = tf.concat(fglocalisations, axis=0) dtype = logits.dtype @@ -638,12 +665,15 @@ def ssd_losses(logits, localisations, with tf.name_scope('cross_entropy_pos'): loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=gclasses) + #注意我们求得的正负样本,然后就可以计算相应的损失了,注意losses*fpmask,这样就可以计算正样本的损失了!!! loss = tf.div(tf.reduce_sum(loss * fpmask), batch_size, name='value') + tf.losses.add_loss(loss) with tf.name_scope('cross_entropy_neg'): loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=no_classes) + #注意losses*fnmask,这样就可以计算负样本的损失了!!! loss = tf.div(tf.reduce_sum(loss * fnmask), batch_size, name='value') tf.losses.add_loss(loss) @@ -655,7 +685,8 @@ def ssd_losses(logits, localisations, loss = tf.div(tf.reduce_sum(loss * weights), batch_size, name='value') tf.losses.add_loss(loss) - +#这个函数更容易理解,因为根据iou值得到正负样本,然后再来训练ssd网络,loss函数的求解方法: +#针对分类损失,我们分为两个,分别是正样本的损失和负样本的损失,保持正负样本的比例为1:3,得到的效果最好,在这里我们有tf.losses.compute_weighted_loss体现 def ssd_losses_old(logits, localisations, gclasses, glocalisations, gscores, match_threshold=0.5, diff --git a/notebooks/eval_video.py b/notebooks/eval_video.py new file mode 100644 index 000000000..eba777db7 --- /dev/null +++ b/notebooks/eval_video.py @@ -0,0 +1,138 @@ +#coding=utf-8 + +import os +import math +import random + +import numpy as np +import tensorflow as tf +import cv2 + +slim = tf.contrib.slim + +import matplotlib.pyplot as plt +import matplotlib.image as mpimg +import sys +sys.path.append('../') + +from nets import ssd_vgg_300, ssd_common, np_methods +from preprocessing import ssd_vgg_preprocessing +#from notebooks import visualization +import visualization +# TensorFlow session: grow memory when needed. TF, DO NOT USE ALL MY GPU MEMORY!!! +gpu_options = tf.GPUOptions(allow_growth=True) +config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options) +isess = tf.InteractiveSession(config=config) +VOC_LABELS = { + 0:'none', + 1:'aeroplane', + 2:'bicycle', + 3:'bird' , + 4:'boat', + 5:'bottle', + 6:'bus', + 7:'car', + 8:'cat', + 9:'chair', + 10:'cow', + 11:'diningtable', + 12:'dog', + 13:'horse', + 14:'motorbike', + 15:'person', + 16:'pottedplant', + 17:'sheep', + 18:'sofa', + 19:'train', + 20:'tvmonitor', +} +# Input placeholder. +net_shape = (300, 300) +data_format = 'NHWC' +img_input = tf.placeholder(tf.uint8, shape=(None, None, 3)) +# Evaluation pre-processing: resize to SSD net shape. +image_pre, labels_pre, bboxes_pre, bbox_img = ssd_vgg_preprocessing.preprocess_for_eval( + img_input, None, None, net_shape, data_format, resize=ssd_vgg_preprocessing.Resize.WARP_RESIZE) +image_4d = tf.expand_dims(image_pre, 0) + +# Define the SSD model. 
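# Re-use the existing variables if the SSD graph was already built earlier in this
# interactive session (i.e. 'ssd_net' is already defined); otherwise create them fresh.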
+reuse = True if 'ssd_net' in locals() else None +ssd_net = ssd_vgg_300.SSDNet() +with slim.arg_scope(ssd_net.arg_scope(data_format=data_format)): + predictions, localisations, _, _ = ssd_net.net(image_4d, is_training=False, reuse=reuse) + +# Restore SSD model. +ckpt_filename = '../logs/model.ckpt-20154' #修改为你的模型路径 +#ckpt_filename = 'checkpoints/ssd_300_vgg.ckpt' +isess.run(tf.global_variables_initializer()) +saver = tf.train.Saver() +saver.restore(isess, ckpt_filename) + +# SSD default anchor boxes. +ssd_anchors = ssd_net.anchors(net_shape) + +# Main image processing routine. +def process_image(img, select_threshold=0.5, nms_threshold=.45, net_shape=(300, 300)): + # Run SSD network. + rimg, rpredictions, rlocalisations, rbbox_img = isess.run([image_4d, predictions, localisations, bbox_img], + feed_dict={img_input: img}) + + # Get classes and bboxes from the net outputs. + rclasses, rscores, rbboxes = np_methods.ssd_bboxes_select( + rpredictions, rlocalisations, ssd_anchors, + select_threshold=select_threshold, img_shape=net_shape, num_classes=21, decode=True) + + rbboxes = np_methods.bboxes_clip(rbbox_img, rbboxes) + rclasses, rscores, rbboxes = np_methods.bboxes_sort(rclasses, rscores, rbboxes, top_k=400) + rclasses, rscores, rbboxes = np_methods.bboxes_nms(rclasses, rscores, rbboxes, nms_threshold=nms_threshold) + # Resize bboxes to original image shape. Note: useless for Resize.WARP! + rbboxes = np_methods.bboxes_resize(rbbox_img, rbboxes) + return rclasses, rscores, rbboxes + +def bboxes_draw_on_img(img, classes, scores, bboxes, color=[255, 0, 0], thickness=2): + shape = img.shape + for i in range(bboxes.shape[0]): + bbox = bboxes[i] + #color = colors[classes[i]] + # Draw bounding box... + p1 = (int(bbox[0] * shape[0]), int(bbox[1] * shape[1])) + p2 = (int(bbox[2] * shape[0]), int(bbox[3] * shape[1])) + cv2.rectangle(img, p1[::-1], p2[::-1], color, thickness) + # Draw text... 
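        # Coordinate note: rbboxes come back as [ymin, xmin, ymax, xmax] in relative
        # [0, 1] units, so p1/p2 are built as (row, col) pixel pairs and reversed with
        # [::-1] into the (x, y) order that OpenCV drawing functions expect.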
+ s = '%s/%.3f' % (VOC_LABELS[int(classes[i])], scores[i]) + p1 = (p1[0]-5, p1[1]) + cv2.putText(img, s, p1[::-1], cv2.FONT_HERSHEY_DUPLEX, 0.4, color, 1) + +cap = cv2.VideoCapture("/media/hp/CCSA_X64FRE/2.MP4") #修改为你的路径 +#cap = cv2.VideoCapture(0) + +# Define the codec and create VideoWriter object +#fourcc = cv2.cv.FOURCC(*'XVID') +fourcc = cv2.VideoWriter_fourcc(*'XVID') +out = cv2.VideoWriter('/media/hp/CCSA_X64FRE/2_handle.MP4', fourcc, 20, (1280, 720)) + + + +num=0 + +while cap.isOpened(): + # get a frame + rval, frame = cap.read() + # save a frame + if rval==True: + # frame = cv2.flip(frame,0) + rclasses, rscores, rbboxes=process_image(frame) + bboxes_draw_on_img(frame,rclasses,rscores,rbboxes) + print(rclasses) + out.write(frame) + num=num+1 + print(num) + else: + break + # show a frame + cv2.imshow("capture", frame) + if cv2.waitKey(1) & 0xFF == ord('q'): + break +cap.release() +out.release() +cv2.destroyAllWindows() diff --git a/notebooks/label2xml.py b/notebooks/label2xml.py new file mode 100644 index 000000000..1f045a06b --- /dev/null +++ b/notebooks/label2xml.py @@ -0,0 +1,97 @@ +#coding=utf-8 +import glob +import os + +s1=""" + {0} + Unspecified + 0 + 0 + + {1} + {2} + {3} + {4} + + """ + +s2=""" + VOC2007 + {0} + + My Database + VOC2007 + flickr + NULL + + + NULL + J + + + 512 + 512 + 3 + + 0 + + {1} + Unspecified + 0 + 0 + + {2} + {3} + {4} + {5} + + {6} + +""" +# dict_={0:"micai_jianzhu",1:"yiban_jianzhu",2:"micai_youguan",3:"yiban_youguan",4:"micai_leida",5:"yiban_leida"} +# dict_={1:"car_1",2:"car_2",3:"car_3",4:"car_4"} +dict_={0:"camouflage_car",1:"non_camouflage_car"} +def convert2xml(label_dir,dst_dir_xml): + if os.path.exists(dst_dir_xml)==False: + os.mkdir(dst_dir_xml) + textlist=glob.glob(os.path.join(label_dir,'*.txt')) + # print(len(textlist)) + for text_ in textlist: + flabel = open(text_, 'r') + lb = flabel.readlines() + flabel.close() + lb=[line.strip() for line in lb] + ob2 = "" + x1=lb[0].split(' ')[0] + x1=dict_[int(x1)] + #注意这里如果给定的是中心点的坐标和宽高,需要转化为左上角和右下角的坐标,否则则不转化 + x3=lb[0].split(" ")[1:] + x3=[int(float(x3[0])-float(x3[2])/2.0),int(float(x3[1])-float(x3[3])/2.0),int(float(x3[0])+float(x3[2])/2.0),int(float(x3[1])+float(x3[3])/2.0)] + + if len(lb)>1: # extra annotation + for i in range(1,len(lb)): + cls_id=lb[i].split(' ')[0] + cls_id=dict_[int(cls_id)] + x3_tmp=lb[i].split(' ')[1:] + x3_tmp=[int(float(x3_tmp[0])-float(x3_tmp[2])/2.0),int(float(x3_tmp[1])-float(x3_tmp[3])/2.0),int(float(x3_tmp[0])+float(x3_tmp[2])/2.0),int(float(x3_tmp[1])+float(x3_tmp[3])/2.0)] + ob2+='\n' + s1.format(cls_id,x3_tmp[0],x3_tmp[1],x3_tmp[2],x3_tmp[3]) + # imgname=text_.split("/")[-1].split(".")[0]+'.jpg' + # savename=os.path.join(dst_dir_xml,text_.split("/")[-1].split(".")[0]+'.xml') + tmp_name=text_.split("/")[-1].split(".") + pre_name="" + for i in range(len(tmp_name)-1): + pre_name=pre_name+tmp_name[i]+"." 
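        # pre_name is rebuilt piece by piece so that label files whose names contain
        # extra dots keep everything except the final extension; it already ends with
        # '.', so 'jpg' / 'xml' can be appended to it directly below.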
+ # print(pre_name) + imgname = pre_name+'jpg' + savename = os.path.join(dst_dir_xml,pre_name+'xml') + f = open(savename, 'w') + ob1=s2.format(imgname, x1, x3[0],x3[1],x3[2],x3[3], ob2) + f.write(ob1) + f.close() + +if __name__=="__main__": + # label_dir="/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/labels_std" + # dst_dir_xml="/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/Annotations" + label_dir="/home/hp/Data/car_train_data/labels_initial_csv" + dst_dir_xml="/home/hp/Data/car_train_data/VOC2007/Annotations" + convert2xml(label_dir,dst_dir_xml) diff --git a/notebooks/ssd_notebook.py b/notebooks/ssd_notebook.py new file mode 100644 index 000000000..48ba63905 --- /dev/null +++ b/notebooks/ssd_notebook.py @@ -0,0 +1,130 @@ + +# coding: utf-8 + +# In[28]: + + +import os +import math +import random + +import numpy as np +import tensorflow as tf +import cv2 + +slim = tf.contrib.slim + + +# In[29]: + + +import matplotlib.pyplot as plt +import matplotlib.image as mpimg + + +# In[30]: + + +import sys +sys.path.append('../') + + +# In[31]: + + +from nets import ssd_vgg_300, ssd_common, np_methods,ssd_vgg_512 +from preprocessing import ssd_vgg_preprocessing +import visualization + + +# In[32]: + + +# TensorFlow session: grow memory when needed. TF, DO NOT USE ALL MY GPU MEMORY!!! +gpu_options = tf.GPUOptions(allow_growth=True) +config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options) +isess = tf.InteractiveSession(config=config) + + +# ## SSD 300 Model +# +# The SSD 300 network takes 300x300 image inputs. In order to feed any image, the latter is resize to this input shape (i.e.`Resize.WARP_RESIZE`). Note that even though it may change the ratio width / height, the SSD model performs well on resized images (and it is the default behaviour in the original Caffe implementation). +# +# SSD anchors correspond to the default bounding boxes encoded in the network. The SSD net output provides offset on the coordinates and dimensions of these anchors. + +# In[33]: + + +# Input placeholder. +net_shape = (300, 300) +data_format = 'NHWC' +img_input = tf.placeholder(tf.uint8, shape=(None, None, 3)) +# Evaluation pre-processing: resize to SSD net shape. +image_pre, labels_pre, bboxes_pre, bbox_img = ssd_vgg_preprocessing.preprocess_for_eval( + img_input, None, None, net_shape, data_format, resize=ssd_vgg_preprocessing.Resize.WARP_RESIZE) +image_4d = tf.expand_dims(image_pre, 0) + +# Define the SSD model. +reuse = True if 'ssd_net' in locals() else None +ssd_net = ssd_vgg_300.SSDNet() +with slim.arg_scope(ssd_net.arg_scope(data_format=data_format)): + predictions, localisations, _, _ = ssd_net.net(image_4d, is_training=False, reuse=reuse) + +# Restore SSD model. +##ckpt_filename = '../checkpoints_all/VGG_VOC0712_SSD_512x512_ft_iter_120000.ckpt/VGG_VOC0712_SSD_512x512_ft_iter_120000.ckpt' +#ckpt_filename="../checkpoints_all/VGG_VOC0712_SSD_300x300_iter_120000.ckpt/VGG_VOC0712_SSD_300x300_iter_120000.ckpt" +ckpt_filename="../logs/model.ckpt-20154" +isess.run(tf.global_variables_initializer()) +saver = tf.train.Saver() +saver.restore(isess, ckpt_filename) + +# SSD default anchor boxes. +ssd_anchors = ssd_net.anchors(net_shape) + + +# ## Post-processing pipeline +# +# The SSD outputs need to be post-processed to provide proper detections. 
Namely, we follow these common steps: +# +# * Select boxes above a classification threshold; +# * Clip boxes to the image shape; +# * Apply the Non-Maximum-Selection algorithm: fuse together boxes whose Jaccard score > threshold; +# * If necessary, resize bounding boxes to original image shape. + +# In[34]: + + +# Main image processing routine. +def process_image(img, select_threshold=0.5, nms_threshold=.45, net_shape=(300, 300)): + # Run SSD network. + rimg, rpredictions, rlocalisations, rbbox_img = isess.run([image_4d, predictions, localisations, bbox_img], + feed_dict={img_input: img}) + + # Get classes and bboxes from the net outputs. + rclasses, rscores, rbboxes = np_methods.ssd_bboxes_select( + rpredictions, rlocalisations, ssd_anchors, + select_threshold=select_threshold, img_shape=net_shape, num_classes=21, decode=True) + + rbboxes = np_methods.bboxes_clip(rbbox_img, rbboxes) + rclasses, rscores, rbboxes = np_methods.bboxes_sort(rclasses, rscores, rbboxes, top_k=400) + rclasses, rscores, rbboxes = np_methods.bboxes_nms(rclasses, rscores, rbboxes, nms_threshold=nms_threshold) + # Resize bboxes to original image shape. Note: useless for Resize.WARP! + rbboxes = np_methods.bboxes_resize(rbbox_img, rbboxes) + return rclasses, rscores, rbboxes + + +# In[21]: + + +# Test on some demo image and visualize output. +path = '../demo/' +image_names = sorted(os.listdir(path)) + +#img = mpimg.imread(path + image_names[-5]) +cur_path="/home/hp/Pictures/1.jpg" +path="/home/hp/Pictures/1_1.jpg" +img=mpimg.imread(cur_path) +rclasses, rscores, rbboxes = process_image(img) + +visualization.bboxes_draw_on_img(img, rclasses, rscores, rbboxes, visualization.colors_plasma,path) +#visualization.plt_bboxes(img, rclasses, rscores, rbboxes) diff --git a/notebooks/train_val_split.py b/notebooks/train_val_split.py new file mode 100644 index 000000000..1d7b1265b --- /dev/null +++ b/notebooks/train_val_split.py @@ -0,0 +1,39 @@ +#coding=utf-8 +import os +import random + +trainval_percent = 0.9 +train_percent = 0.8 +#路径修改为自己的路径 +dir_pre="/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/" +xmlfilepath = '/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/Annotations/' +txtsavepath = '/media/hp/tyw/COCW_DATA/DetectionPatches_256x256/VOC2007/Main/' +total_xml = os.listdir(xmlfilepath) + +num = len(total_xml) +list = range(num) +tv = int(num * trainval_percent) +tr = int(tv * train_percent) +trainval = random.sample(list, tv) +train = random.sample(trainval, tr) + +ftrainval = open(os.path.join(dir_pre,'ImageSets/Main/trainval.txt'), 'w') +ftest = open(os.path.join(dir_pre,'ImageSets/Main/test.txt'), 'w') +ftrain = open(os.path.join(dir_pre,'ImageSets/Main/train.txt'), 'w') +fval = open(os.path.join(dir_pre,'ImageSets/Main/val.txt'), 'w') + +for i in list: + name = total_xml[i][:-4] + '\n' + if i in trainval: + ftrainval.write(name) + if i in train: + ftrain.write(name) + else: + fval.write(name) + else: + ftest.write(name) + +ftrainval.close() +ftrain.close() +fval.close() +ftest.close() diff --git a/preprocessing/ssd_vgg_preprocessing.py b/preprocessing/ssd_vgg_preprocessing.py index 413ad3428..b74d5d6c8 100644 --- a/preprocessing/ssd_vgg_preprocessing.py +++ b/preprocessing/ssd_vgg_preprocessing.py @@ -119,7 +119,12 @@ def apply_with_random_selector(x, func, num_cases): func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case) for case in range(num_cases)])[0] +#通过下面的两种操作,程序可以通过一张训练图衍生出许多训练样本,通过将训练图像进行预处理,训练得到的神经网络模型可以识别不同大小,方位,色彩等方面的实体。 +#操作包括:distort_color和distorted_bounding_box_crop两个操作 
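# In short: random colour jitter (distort_color) plus bbox-constrained random cropping
# (distorted_bounding_box_crop) let a single labelled image yield many different-looking
# training samples, which helps when the labelled dataset is small.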
+#通过下面两种设置然后我们将可以得到很多的训练样本,要不真是的训练样本实际是不够的! +#distort_color用于随机的调整图像的色彩,调整亮度,对比度,饱和色相的顺序会影响最后的结果, +#定义多种不同的顺序,随机选择,可以进一步降低无关因素对模型的影响 def distort_color(image, color_ordering=0, fast_mode=True, scope=None): """Distort the color of a Tensor image. @@ -173,6 +178,9 @@ def distort_color(image, color_ordering=0, fast_mode=True, scope=None): return tf.clip_by_value(image, 0.0, 1.0) + +#对图片进行预处理,将图片转化成神经网络的输入层数据。 +#表示随机裁剪图片,仅仅生成一个cropped_image def distorted_bounding_box_crop(image, labels, bboxes, @@ -182,11 +190,14 @@ def distorted_bounding_box_crop(image, max_attempts=200, clip_bboxes=True, scope=None): - """Generates cropped_image using a one of the bboxes randomly distorted. + """ + #注意这个函数的解析表明的是使用其中一个随机扭曲的bbox生成cropped_image + Generates cropped_image using a one of the bboxes randomly distorted. See `tf.image.sample_distorted_bounding_box` for more documentation. Args: + #观察这些参数可以发现,这个函数实际是对一张图片的多个gt bbox中随机选择一个(暂时理解为ground truth bbox)进行随机扭曲,返回cropped_image和bbox等等 image: 3-D Tensor of image (it will be converted to floats in [0, 1]). bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] where each coordinate is [0, 1) and the coordinates are arranged @@ -203,12 +214,21 @@ def distorted_bounding_box_crop(image, region of the image of the specified constraints. After `max_attempts` failures, return the entire image. scope: Optional scope for name_scope. + #注意返回值是一个tuple,分别是cropped_image和distorted bbox,这个我们主要参考tf.image.sample_distorted_bounding_box的实现 Returns: A tuple, a 3-D Tensor cropped_image and the distorted bbox """ with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bboxes]): # Each bounding box has shape [1, num_boxes, box coords] and # the coordinates are ordered [ymin, xmin, ymax, xmax]. + # 为什么要用sample_distorted_bounding_box的原因在于可以随机的截取图片中一个块,减小需要关注的物体大小对图像识别算法的影响 + # tf.image.sample_distorted_bounding_box的讲解主要参考:https://blog.csdn.net/tz_zs/article/details/77920116 + # 需要注意的是,返回值的类型为: + # begin: 和 image_size 具有相同的类型。包含 [offset_height, offset_width, 0] 的一维数组。作为 tf.slice 的输入。 + # size: 和 image_size 具有相同的类型。包含 [target_height, target_width, -1] 的一维数组。作为 tf.slice 的输入。 + # 根据begin,size两个参数我们可以tf.slice出来我们所需要的裁剪出来的小图,而bboxes主要用于在图像上面的显示bbox工作!!! + # 那么为什么bboxes的shape为[1,1,4]呢?是不是因为tf.image.sample_distorted_bounding_box函数仅裁出来了一个bbox呢? + # bboxes:shape为 [1, 1, 4] 的三维矩阵,数据类型为float32,表示随机变形后的边界框。作为 tf.image.draw_bounding_boxes 的输入。 bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box( tf.shape(image), bounding_boxes=tf.expand_dims(bboxes, 0), @@ -220,6 +240,8 @@ def distorted_bounding_box_crop(image, distort_bbox = distort_bbox[0, 0] # Crop the image to the specified bounding box. + # 注意tf.slice中的begin参数和size参数,begin.shape[-1]=0,size.shape[-1]=-1,可以由tf.image.sample_distorted_bouning_box中确定, + # 然后我们就可以从图像中裁剪我们所期望的小图! cropped_image = tf.slice(image, bbox_begin, bbox_size) # Restore the shape since the dynamic slice loses 3rd dimension. cropped_image.set_shape([None, None, 3]) @@ -229,9 +251,14 @@ def distorted_bounding_box_crop(image, labels, bboxes = tfe.bboxes_filter_overlap(labels, bboxes, threshold=BBOX_CROP_OVERLAP, assign_negative=False) + #注意我们的返回值cropped_image的shape为[None,None,3],不用担心,在后面preprocess_for_train中我们会怎样呢,对了没错, + #resize到ssd所需要的大小,所以不用担心哈! return cropped_image, labels, bboxes, distort_bbox +#实际上preprocess_for_train的原因在与缺少训练样本,我们进行处理之后可以增加训练样本,用于训练! +#具体这一块可以参考:TensorFlow图像预处理完整样例 https://blog.8hfq.com/?p=455博客,有详细的记录! +#对训练集的预处理! 
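# Rough order of operations inside preprocess_for_train below: bbox-constrained random
# crop -> bilinear resize to out_shape -> random horizontal flip -> random colour
# distortion -> rescale to [0, 255] and subtract the VGG channel means -> optional
# NHWC->NCHW transpose.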
def preprocess_for_train(image, labels, bboxes, out_shape, data_format='NHWC', scope='ssd_preprocessing_train'): @@ -239,7 +266,7 @@ def preprocess_for_train(image, labels, bboxes, Note that the actual resizing scale is sampled from [`resize_size_min`, `resize_size_max`]. - + #注意底下所给的参数和上面提供的参数不一致,因此我们在程序中关注它的实际参数就可以了! Args: image: A `Tensor` representing an image of arbitrary size. output_height: The height of the image after preprocessing. @@ -273,15 +300,19 @@ def preprocess_for_train(image, labels, bboxes, min_object_covered=MIN_OBJECT_COVERED, aspect_ratio_range=CROP_RATIO_RANGE) # Resize image to output size. + #因为distorted_bounding_box_crop返回的图像我们都已经set_shape为了[None,None,3],我们需要将其调整为网络所需要的输入大小, + #所以统一resize为out_shape大小!!! dst_image = tf_image.resize_image(dst_image, out_shape, method=tf.image.ResizeMethod.BILINEAR, align_corners=False) tf_summary_image(dst_image, bboxes, 'image_shape_distorted') # Randomly flip the image horizontally. + #随机左右翻转图像 dst_image, bboxes = tf_image.random_flip_left_right(dst_image, bboxes) # Randomly distort the colors. There are 4 ways to do it. + # 使用一种随机的顺序调整图像的色彩!!! dst_image = apply_with_random_selector( dst_image, lambda x, ordering: distort_color(x, ordering, fast_mode), @@ -289,14 +320,17 @@ def preprocess_for_train(image, labels, bboxes, tf_summary_image(dst_image, bboxes, 'image_color_distorted') # Rescale to VGG input scale. + # 注意dst_image的输出为0~1.0之间,我们需要进行调整恢复为0~255.0作为VGG网络的输入! image = dst_image * 255. + #对图像进行白化操作! image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN]) # Image data format. if data_format == 'NCHW': image = tf.transpose(image, perm=(2, 0, 1)) return image, labels, bboxes - +#对验证集的预处理!可以看出image为任意大小的图片,但是呢,仅仅只有一张,an image而已,所以我们可以看出tf_image.resize_image_bboxes_with_crop_or_pad +#输入的图片只是一个而已,shape为[None,None,3] def preprocess_for_eval(image, labels, bboxes, out_shape=EVAL_SIZE, data_format='NHWC', difficults=None, resize=Resize.WARP_RESIZE, diff --git a/preprocessing/tf_image.py b/preprocessing/tf_image.py index a96262661..321679bd0 100644 --- a/preprocessing/tf_image.py +++ b/preprocessing/tf_image.py @@ -163,6 +163,7 @@ def bboxes_crop_or_pad(bboxes, return bboxes +#通过中心裁剪或者填充将图片的shape设置为[target_height,target_width,channel] def resize_image_bboxes_with_crop_or_pad(image, bboxes, target_height, target_width): """Crops and/or pads an image to a target width and height. @@ -231,8 +232,11 @@ def equal_(x, y): offset_pad_height = max_(height_diff // 2, 0) # Maybe crop if needed. + # 提供的图片大于target_height,target_width,因此我们进行裁剪 height_crop = min_(target_height, height) width_crop = min_(target_width, width) + #tf.image.crop_to_bounding_box中的offset_crop_height, offset_crop_width是相对于left-top处的偏移 + #height_crop,width_crop是要截取的高和宽,因为crop_to_bounding_box是height,width进行输出的,因此我们使用这种形式!!! cropped = tf.image.crop_to_bounding_box(image, offset_crop_height, offset_crop_width, height_crop, width_crop) bboxes = bboxes_crop_or_pad(bboxes, @@ -240,6 +244,11 @@ def equal_(x, y): -offset_crop_height, -offset_crop_width, height_crop, width_crop) # Maybe pad if needed. 
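        # Worked example (made-up sizes): with a 500x400 input and a 300x300 target,
        # the crop step above keeps a centred 300x300 window; with a 200x250 input,
        # the pad step below zero-pads the borders up to 300x300. In both cases the
        # bboxes are shifted/rescaled accordingly by bboxes_crop_or_pad.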
+ #如果提供的图片的宽高小于height,width的话,我们需要进行填充操作 + '''Adds `offset_height` rows of zeros on top, `offset_width` columns of + zeros on the left, and then pads the image on the bottom and right + with zeros until it has dimensions `target_height`, `target_width`.''' + resized = tf.image.pad_to_bounding_box(cropped, offset_pad_height, offset_pad_width, target_height, target_width) bboxes = bboxes_crop_or_pad(bboxes, diff --git a/preprocessing/vgg_preprocessing.py b/preprocessing/vgg_preprocessing.py index a2d0f8644..f7aa98e04 100644 --- a/preprocessing/vgg_preprocessing.py +++ b/preprocessing/vgg_preprocessing.py @@ -226,7 +226,8 @@ def _mean_image_subtraction(image, means): num_channels = image.get_shape().as_list()[-1] if len(means) != num_channels: raise ValueError('len(means) must match the number of channels') - + #tf.split函数的作用:将后面tensor按照第几个维度划分成几个tensor,注意划分后的tensor的维度可以不同,所num_channels如果是数字,那就都相同, + #为数组的时候一般是不同的,如原来shape[2]为4,现在num_channels=[1,2,1]那么就不同,如果为[1,1,1,1]则等价于num_channels=4. channels = tf.split(2, num_channels, image) for i in range(num_channels): channels[i] -= means[i] diff --git a/train_ssd_network.py b/train_ssd_network.py index c93e2b4a5..5438a7896 100644 --- a/train_ssd_network.py +++ b/train_ssd_network.py @@ -229,6 +229,8 @@ def main(_): common_queue_min=10 * FLAGS.batch_size, shuffle=True) # Get for SSD network: image, labels, bboxes. + #slim.dataset_data_provider.DatasetDataProvider解读,https://blog.csdn.net/weixin_35653315/article/details/71023596 + #一次只返回一个img,需要组成batch进行训练!!! [image, shape, glabels, gbboxes] = provider.get(['image', 'shape', 'object/label', 'object/bbox']) @@ -238,6 +240,7 @@ def main(_): out_shape=ssd_shape, data_format=DATA_FORMAT) # Encode groundtruth labels and bboxes. + #注意此时仅仅只送入了一张图片进去,only one!!! gclasses, glocalisations, gscores = \ ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors) batch_shape = [1] + [len(ssd_anchors)] * 3 @@ -264,10 +267,23 @@ def clone_fn(batch_queue): """Allows data parallelism by creating multiple clones of network_fn.""" # Dequeue batch. + + ''' + ssd的核心代码在这一块,我们可以看到 + 1)编码真实的标签,相当于y_label:gclasses, glocalisations, gscores = \ + ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors) + 2) b_image, b_gclasses, b_glocalisations, b_gscores = \ + tf_utils.reshape_list(batch_queue.dequeue(), batch_shape) + 3)得到输出,相当于得到y_pred: predictions, localisations, logits, end_points = \ + ssd_net.net(b_image, is_training=True) + 4)计算损失: predictions, localisations, logits, end_points = \ + ssd_net.net(b_image, is_training=True) + ''' b_image, b_gclasses, b_glocalisations, b_gscores = \ tf_utils.reshape_list(batch_queue.dequeue(), batch_shape) # Construct SSD network. + arg_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay, data_format=DATA_FORMAT) with slim.arg_scope(arg_scope):
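The comments above repeatedly refer to the anchor-offset encoding: `tf_ssd_bboxes_encode_layer` turns a ground-truth box into offsets relative to an anchor when building training targets, and step 2 of the prediction pipeline in the README inverts that transform at inference time. The NumPy sketch below illustrates the round trip for a single anchor. It is only an illustration, not the repository's tensor implementation; the `prior_scaling` values `[0.1, 0.1, 0.2, 0.2]` are assumed here to match `SSDParams.prior_scaling` in `ssd_vgg_300.py`, and the anchor/box numbers are made up.

```python
import numpy as np

prior_scaling = [0.1, 0.1, 0.2, 0.2]  # assumed default, see SSDParams in ssd_vgg_300.py

def encode(box, anchor):
    """box/anchor given as (cy, cx, h, w); returns the regression target."""
    cy, cx, h, w = box
    yref, xref, href, wref = anchor
    return np.array([
        (cy - yref) / href / prior_scaling[0],
        (cx - xref) / wref / prior_scaling[1],
        np.log(h / href) / prior_scaling[2],
        np.log(w / wref) / prior_scaling[3],
    ])

def decode(offsets, anchor):
    """Inverse of encode(): recover (cy, cx, h, w) from predicted offsets."""
    yref, xref, href, wref = anchor
    return np.array([
        offsets[0] * prior_scaling[0] * href + yref,
        offsets[1] * prior_scaling[1] * wref + xref,
        href * np.exp(offsets[2] * prior_scaling[2]),
        wref * np.exp(offsets[3] * prior_scaling[3]),
    ])

# Hypothetical anchor and ground-truth box, both in relative coordinates.
anchor = (0.50, 0.50, 0.20, 0.20)
gt_box = (0.55, 0.48, 0.30, 0.25)

target = encode(gt_box, anchor)
print(target)                  # what the localisation head is trained to regress
print(decode(target, anchor))  # ~ [0.55, 0.48, 0.30, 0.25], the original box
```

The repository applies the same arithmetic over whole feature-map tensors (all anchors at once) rather than one box at a time, but the per-anchor relationship between box, anchor and offset is the one shown here.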