From a5914b5a2de166903410da2b99b7a866307e39aa Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 9 Aug 2019 14:42:58 -0400 Subject: [PATCH 01/55] Add script to convert YC2-BB annotations --- datasets/scripts/gen_json_yc2bb.py | 90 ++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 datasets/scripts/gen_json_yc2bb.py diff --git a/datasets/scripts/gen_json_yc2bb.py b/datasets/scripts/gen_json_yc2bb.py new file mode 100644 index 0000000..337a156 --- /dev/null +++ b/datasets/scripts/gen_json_yc2bb.py @@ -0,0 +1,90 @@ +#Convert YC2-BB JSON annotation files to ViP JSON format + +import os +import json + +source_root = '$ANNOTATIONS_ROOT/annotations' #replace this value +target_root = '$JSON_TARGET_ROOT' #replace this value +#Link to videos sampled at 1 fps +frame_root = '$SAMPLED_FRAMES_ROOT' #replace this value +files = ['yc2_training_vid.json', 'yc2_bb_val_annotations.json', 'yc2_bb_public_test_annotations.json'] + +splits = ['train', 'val', 'test'] +ann_files = [os.path.join(source_root, f) for f in files] + +for split, ann_file in zip(splits, ann_files): + + + #YC2 split names, slightly different + if split == 'train': + split_name = 'training' + elif split == 'val': + split_name = 'validation' + else: + split_name = 'testing' + + with open(ann_file) as f: + ann_json_data = json.load(f) + + yc2_json_data = ann_json_data['database'] + json_data = [] + + for vid_name in yc2_json_data.keys(): + frm_height = yc2_json_data[vid_name]['rheight'] + frm_width = yc2_json_data[vid_name]['rwidth'] + recipe_type = yc2_json_data[vid_name]['recipe_type'] + yc2_segments = yc2_json_data[vid_name]['segments'] + + #Loop through segments, YC2 breaks down all each video into segment clips + for seg,item in sorted(yc2_segments.items()): + base_path = os.path.join(frame_root, split_name, recipe_type, vid_name, str(seg).zfill(2)) + frames = [] + if 'objects' in item: #validation or testing file + num_objs = len(item['objects']) + num_frames = len(item['objects'][0]['boxes']) + + #Loop through frames + for f in range(num_frames): + frame = {} + objs = [] + + #Loop through objects + for track_id in range(num_objs): + obj = item['objects'][track_id] + + cls_name = obj['label'] + box = obj['boxes'][f] + + if len(box) == 0: #No annotations + continue + + xmin = box['xtl'] + ymin = box['ytl'] + xmax = box['xbr'] + ymax = box['ybr'] + + outside = box['outside'] #outside or inside of frame + occluded = box['occluded'] + + objs.append({'trackid':track_id, 'c':cls_name, 'occ':occluded, 'outside':outside, 'bbox':[xmin, ymin, xmax, ymax]}) + + frame['img_path'] = os.path.join(base_path, str(seg).zfill(2), str(f).zfill(2)+'.jpg') + frame['objs'] = objs + frame['seg'] = seg + frames.append(frame) + else: #training annotation file + frame = {} + objs = [] + + frame['sentence'] = yc2_segments[seg]['sentence'] + frame['objs'] = objs + frame['seg'] = seg + frames.append(frame) + + json_data.append({'frames':frames, 'base_path':base_path, 'frame_size':[frm_width, frm_height], 'recipe_type':recipe_type}) + + target_file = os.path.join(target_root, split+'.json') + print('Writing out to: {}'.format(target_file)) + with open(target_file, 'w') as f: + json.dump(json_data, f) + From 551cdc22618ed9c0d61bf2fa480c463eff01f529 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sat, 10 Aug 2019 17:54:58 -0400 Subject: [PATCH 02/55] YC2BB dataset configured for validation and testing splits --- datasets/YC2BB.py | 230 +++++++++++++++++++++++++++++ datasets/scripts/gen_json_yc2bb.py | 8 +- 2 files changed, 232 
insertions(+), 6 deletions(-) create mode 100644 datasets/YC2BB.py diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py new file mode 100644 index 0000000..6a54ae0 --- /dev/null +++ b/datasets/YC2BB.py @@ -0,0 +1,230 @@ +#Adapted from: https://github.com/MichiganCOG/Video-Grounding-from-Text +import torch +from .abstract_datasets import DetectionDataset +from PIL import Image +import cv2 +import os +import csv +import numpy as np + +import torchtext + +class YC2BB(DetectionDataset): + ''' + YouCook2-Bounding Boxes dataset. Used in weakly-supervised video object grounding task + Paper: https://arxiv.org/pdf/1805.02834.pdf + ''' + def __init__(self, *args, **kwargs): + super(YC2BB, self).__init__(*args, **kwargs) + + #Define the following configuration parameters in your config_*.yaml file + #Or as a system arg + class_file = kwargs['yc2bb_class_file'] + num_proposals = kwargs['yc2bb_num_proposals'] + rpn_proposal_root = kwargs['yc2bb_rpn_proposal_root'] + roi_pooled_feat_root = kwargs['yc2bb_roi_pooled_feat_root'] + + self.load_type = kwargs['load_type'] + + self.max_objects = 15 + self.class_dict = _get_class_labels(class_file) + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + else: + self.transforms = kwargs['model_obj'].test_transforms + + sentences_proc, segments_tuple = _get_segments_and_sentences(self.samples, self.load_type) + + assert(len(sentences_proc) == len(segments_tuple)) + + #YC2 split names, slightly different + split_to_split = {'train':'training','val':'validation','test':'testing'} + self.yc2_split = split_to_split[self.load_type] + + # read rpn object proposals + self.rpn_dict = {} + self.rpn_chunk = [] + + total_num_proposals = 100 # always load all the proposals we have + rpn_lst_file = os.path.join(rpn_proposal_root, self.yc2_split+'-box-'+str(total_num_proposals)+'.txt') + rpn_chunk_file = os.path.join(rpn_proposal_root, self.yc2_split+'-box-'+str(total_num_proposals)+'.pth') + key_counter = len(self.rpn_dict) + with open(rpn_lst_file) as f: + rpn_lst = f.readline().split(',') + self.rpn_dict.update({r.strip():(i+key_counter) for i,r in enumerate(rpn_lst)}) + + self.rpn_chunk.append(torch.load(rpn_chunk_file)) + + self.rpn_chunk = torch.cat(self.rpn_chunk).cpu() + assert(self.rpn_chunk.size(0) == len(self.rpn_dict)) + assert(self.rpn_chunk.size(2) == 4) + + self.num_proposals = num_proposals + self.roi_pooled_feat_root = roi_pooled_feat_root + + ''' + with open(self.gt_box_file, 'r') as f: + self.data_all = json.load(f) + + # read gt bounding boxes O x T/25 x (id, ytl, xtl, ybr, xbr) + # coordinates are 0-indexed + for i, t in enumerate(segments_tuple): + vid = t[2] + seg = str(int(t[3])) + + # if video has no annotations, continue + if not vid in self.data_all['database']: + continue + + # check if ground truth bounding box exists for segment + if seg in self.data_all['database'][vid]['segments'].keys(): + s = sentences_proc[i] + inc_flag = 0 + obj_label = [] + for w in s: + if self.class_dict.get(w, -1) >= 0: + obj_label.append(self.class_dict[w]) + inc_flag = 1 + + if inc_flag: + self.sample_lst.append((t, obj_label)) + + print('# of segments for {}: {}, percentage in the raw data: {:.2f}'.format( + split_lst, len(self.sample_lst), len(self.sample_lst)/len(sentences_proc))) + ''' + + def __getitem__(self, idx): + vid_info = self.samples[idx] + + base_path = vid_info['base_path'] + width, height = vid_info['frame_size'] + num_frames_1fps = len(vid_info['frames']) + rec = base_path.split('/')[-3] + vid = base_path.split('/')[-2] 
+ seg = base_path.split('/')[-1] + + bbox_data = np.zeros((self.clip_length, num_frames_1fps, 5))-1 #[cls_label, xmin, ymin, xmax ymax] + labels = np.zeros((self.clip_length, self.max_objects))-1 + + for frame_ind in range(num_frames_1fps): + frame = vid_info['frames'][frame_ind] + #frame_path = frame['img_path'] + num_objs = len(frame['objs']) + obj_label = np.zeros((num_objs))-1 #List all unique class ids in entire segment + + # Extract bbox and label data from video info + for obj_ind, obj in enumerate(frame['objs']): + label = self.class_dict[obj['c']] + obj_label[obj_ind] = label + + if self.load_type == 'test' or self.load_type == 'train': #Annotations for test set not publicly available, train not annotated + bbox_data[frame_ind, trackid] = [label, -1, -1, -1, -1] + else: + trackid = obj['trackid'] + obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] + difficult = obj['difficult'] + + bbox_data[frame_ind, trackid, :] = [label] + obj_bbox + labels[frame_ind, trackid] = label + diff_labels[frame_ind, trackid] = difficult + + #input_data.append(cv2.imread(os.path.join(base_path, frame_path), cv2.IMREAD_COLOR)[:,:,(2,1,0)]) + + obj_label = torch.from_numpy(obj_label) + num_frames = num_frames_1fps * 25 #video sampled at 25 fps + + ''' + if self.vis_output: + image_path = os.path.join(self.image_root, split, rec, vid, seg) + img_notrans = [] + for i in range(num_frames): + img_notrans.append(self.spatial_transform_notrans(self.loader(os.path.join(image_path, '{:04d}.jpg'.format(i+1))))) + img_notrans = torch.stack(img_notrans, dim=1) # 3, T, H, W + else: + # no need to load raw images + img_notrans = torch.zeros(3, num_frames, 1, 1) # dummy + ''' + + # rpn object propoals + rpn = [] + x_rpn = [] + frm=1 + + feat_name = vid+'_'+seg+'.pth' + img_name = vid+'_'+seg+'_'+str(frm).zfill(4)+'.jpg' + x_rpn = torch.load(os.path.join(self.roi_pooled_feat_root, self.yc2_split, feat_name)) + while self.rpn_dict.get(img_name, -1) > -1: + ind = self.rpn_dict[img_name] + rpn.append(self.rpn_chunk[ind]) + frm+=1 + img_name = vid+'_'+seg+'_'+str(frm).zfill(4)+'.jpg' + + rpn = torch.stack(rpn) # number of frames x number of proposals per frame x 4 + rpn = rpn[:, :self.num_proposals, :] + + x_rpn = x_rpn.permute(2,0,1).contiguous() # encoding size x number of frames x number of proposals + x_rpn = x_rpn[:, :, :self.num_proposals] + + rpn_original = rpn-1 # convert to 1-indexed + + # normalize coordidates to 0-1 + # coordinates are 1-indexed: (x_tl, y_tl, x_br, y_br) + rpn[:, :, 0] = (rpn[:, :, 0]-0.5)/width + rpn[:, :, 2] = (rpn[:, :, 2]-0.5)/width + rpn[:, :, 1] = (rpn[:, :, 1]-0.5)/height + rpn[:, :, 3] = (rpn[:, :, 3]-0.5)/height + + assert(torch.max(rpn) <= 1) + + vis_name = '_-_'.join((self.yc2_split, rec, vid, seg)) + + ret_dict = dict() + ret_dict['data'] = (x_rpn, obj_label) + + annot_dict = dict() + annot_dict['box'] = bbox_data + annot_dict['box_label'] = labels + annot_dict['rpn'] = rpn + annot_dict['rpn_original'] = rpn_original + annot_dict['vis_name'] = vis_name + ret_dict['annots'] = annot_dict + + return ret_dict + +def _get_segments_and_sentences(data, split): + # build vocab and tokenized sentences + text_proc = torchtext.data.Field(sequential=True, tokenize='spacy', + lower=True, batch_first=True) + split_sentences = [] + split_segments = [] + + for dat in data: + rec = dat['base_path'].split('/')[-3] + vid = dat['base_path'].split('/')[-2] + seg = dat['base_path'].split('/')[-1] + frame = dat['frames'][0] + segment_labels = [] + if 'sentence' in frame: # for now, training json file only 
contains full sentence + segment_labels = frame['sentence'] + else: + for obj in frame['objs']: + segment_labels.append(obj['c']) + split_sentences.append(segment_labels) + split_segments.append((split, rec, vid, str(seg).zfill(2))) #tuple of id (split, vid, seg) + + sentences_proc = list(map(text_proc.preprocess, split_sentences)) # build vocab on train and val + + print('{} sentences in {} split'.format(len(sentences_proc), split)) + + return sentences_proc, split_segments + +def _get_class_labels(class_file): + class_dict = {} # both singular form & plural form are associated with the same label + with open(class_file) as f: + cls = csv.reader(f, delimiter=',') + for i, row in enumerate(cls): + for r in range(1, len(row)): + if row[r]: + class_dict[row[r]] = int(row[0]) + + return class_dict diff --git a/datasets/scripts/gen_json_yc2bb.py b/datasets/scripts/gen_json_yc2bb.py index 337a156..2d0baf6 100644 --- a/datasets/scripts/gen_json_yc2bb.py +++ b/datasets/scripts/gen_json_yc2bb.py @@ -16,12 +16,8 @@ #YC2 split names, slightly different - if split == 'train': - split_name = 'training' - elif split == 'val': - split_name = 'validation' - else: - split_name = 'testing' + split_to_split = {'train':'training','val':'validation','test':'testing'} + split_name = split_to_split[split] with open(ann_file) as f: ann_json_data = json.load(f) From 3745172ad40facb5825a0705c62c389c4ed6e5a6 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Tue, 13 Aug 2019 09:57:28 -0400 Subject: [PATCH 03/55] Update json template frame size, fix num_clip=0 bug --- datasets/abstract_datasets.py | 8 +++++++- datasets/templates/action_recognition_template.json | 4 ++-- datasets/templates/detection_template.json | 6 +++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/datasets/abstract_datasets.py b/datasets/abstract_datasets.py index 2d216f0..6e7fe2e 100644 --- a/datasets/abstract_datasets.py +++ b/datasets/abstract_datasets.py @@ -71,6 +71,7 @@ def _extractClips(self, video): if self.num_clips < 0: if len(video) >= self.clip_length: final_video = [video[_idx] for _idx in np.linspace(0, len(video)-1, self.clip_length, dtype='int32')] + final_video = [final_video] else: # Loop if insufficient elements @@ -80,6 +81,7 @@ def _extractClips(self, video): indices = indices[np.linspace(0, len(indices)-1, self.clip_length, dtype='int32')] final_video = [video[_idx] for _idx in indices] + final_video = [final_video] # END IF @@ -103,6 +105,7 @@ def _extractClips(self, video): indices = indices[:self.clip_length] final_video = [video[_idx] for _idx in indices] + final_video = [final_video] # END IF @@ -114,6 +117,7 @@ def _extractClips(self, video): indices = np.arange(indices, indices + self.clip_length).astype('int32') final_video = [video[_idx] for _idx in indices] + final_video = [final_video] else: indices = np.ceil(self.clip_length/float(len(video))) @@ -125,17 +129,19 @@ def _extractClips(self, video): indices = indices[index:index + self.clip_length] final_video = [video[_idx] for _idx in indices] + final_video = [final_video] # END IF else: final_video = video[:self.clip_length] + final_video = [final_video] # END IF # END IF - return [final_video] + return final_video diff --git a/datasets/templates/action_recognition_template.json b/datasets/templates/action_recognition_template.json index dc8acf9..6696368 100644 --- a/datasets/templates/action_recognition_template.json +++ b/datasets/templates/action_recognition_template.json @@ -2,7 +2,6 @@ { "frames (list)": [ { - "frame_size (int, int)": 
"(WIDTH,HEIGHT)", "img_path (str)": "FRAME_PATH", "actions (list)": [ { @@ -11,6 +10,7 @@ ] } ], + "frame_size (int, int)": "(WIDTH,HEIGHT)", "base_path (str)": "BASE_VID_PATH" } -] \ No newline at end of file +] diff --git a/datasets/templates/detection_template.json b/datasets/templates/detection_template.json index f4b8a42..b8ed90a 100644 --- a/datasets/templates/detection_template.json +++ b/datasets/templates/detection_template.json @@ -2,7 +2,6 @@ { "frames (list)": [ { - "frame_size (int, int)": "(WIDTH,HEIGHT)", "img_path (str)": "FRAME_PATH", "objs (list)": [ { @@ -14,6 +13,7 @@ ] } ], - "base_path (str)": "BASE_VID_PATH" + "base_path (str)": "BASE_VID_PATH", + "frame_size (int, int)": "(WIDTH,HEIGHT)" } -] \ No newline at end of file +] From a5b90e2b4d7e7ad47fd7a8c011b42cf124db5026 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Tue, 13 Aug 2019 11:30:28 -0400 Subject: [PATCH 04/55] Update README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 81c910f..f1d518a 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ A platform for quick and easy development of deep learning networks for recognit * [Development](#development) * [Add a Model](#add-a-model) * [Add a Dataset](#add-a-dataset) -* [Version History](#version-history) +* [FAQ](#faq) ## Configured Datasets | Dataset | Task(s) | @@ -114,3 +114,7 @@ To add a new dataset: * Complete `__init__` and `__getitem__` functions * Example skeleton dataset can be found [here](https://github.com/MichiganCOG/ViP/blob/master/datasets/templates/dataset_template.py) + +### FAQ + +A detailed FAQ can be found on our [wiki](https://github.com/MichiganCOG/ViP/wiki/FAQ). From 9d4bccf43d7f795570437a877f35fd934185f182 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Tue, 13 Aug 2019 11:32:31 -0400 Subject: [PATCH 05/55] Update README.md --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index f1d518a..3231468 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ A platform for quick and easy development of deep learning networks for recognition and detection in videos. Includes popular models like C3D and SSD. +Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) + ## Implemented Models and their performance ### Recognition @@ -88,6 +90,9 @@ Ex: From the root directory of ViP, train the action recognition network C3D on ``` python train.py --cfg_file models/c3d/config_train.yaml ``` + +Additional examples can be found on our [wiki.](https://github.com/MichiganCOG/ViP/wiki) + ## Development New models and datasets can be added without needing to rewrite any training, evaluation, or data loading code. @@ -103,6 +108,8 @@ To add a new model: Examples of previously implemented models can be found [here](https://github.com/MichiganCOG/ViP/tree/master/models). 
+Additional information can be found on our [wiki.](https://github.com/MichiganCOG/ViP/wiki) + ### Add a Dataset To add a new dataset: @@ -114,6 +121,7 @@ To add a new dataset: * Complete `__init__` and `__getitem__` functions * Example skeleton dataset can be found [here](https://github.com/MichiganCOG/ViP/blob/master/datasets/templates/dataset_template.py) +Additional information can be found on our [wiki.](https://github.com/MichiganCOG/ViP/wiki) ### FAQ From fbd0a83601589abf3795fc6a7514eae645164c95 Mon Sep 17 00:00:00 2001 From: Madan Date: Tue, 13 Aug 2019 16:01:14 -0400 Subject: [PATCH 06/55] feature extraction feature trial --- datasets/KTH.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++ eval.py | 27 +++++++++++++++-- models/c3d/c3d.py | 5 ++- 3 files changed, 106 insertions(+), 3 deletions(-) create mode 100644 datasets/KTH.py diff --git a/datasets/KTH.py b/datasets/KTH.py new file mode 100644 index 0000000..95995ef --- /dev/null +++ b/datasets/KTH.py @@ -0,0 +1,77 @@ +import torch +from .abstract_datasets import RecognitionDataset +from PIL import Image +import cv2 +import os +import numpy as np +from torchvision import transforms + +class KTH(RecognitionDataset): + def __init__(self, *args, **kwargs): + """ + Initialize KTH class + Args: + load_type (String): Select training or testing set + resize_shape (Int): [Int, Int] Array indicating desired height and width to resize input + crop_shape (Int): [Int, Int] Array indicating desired height and width to crop input + final_shape (Int): [Int, Int] Array indicating desired height and width of input to deep network + preprocess (String): Keyword to select different preprocessing types + + Return: + None + """ + super(KTH, self).__init__(*args, **kwargs) + + self.load_type = kwargs['load_type'] + self.resize_shape = kwargs['resize_shape'] + self.crop_shape = kwargs['crop_shape'] + self.final_shape = kwargs['final_shape'] + self.preprocess = kwargs['preprocess'] + + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + + else: + self.transforms = kwargs['model_obj'].test_transforms + + + def __getitem__(self, idx): + vid_info = self.samples[idx] + base_path = vid_info['base_path'] + + input_data = [] + vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 + labels = np.zeros((self.clip_length))-1 + input_data = [] + + for frame_ind in range(len(vid_info['frames'])): + frame_path = os.path.join(base_path, vid_info['frames'][frame_ind]['img_path']) + + for frame_labels in vid_info['frames'][frame_ind]['actions']: + labels[frame_ind] = frame_labels['action_class'] + + # Load frame image data and preprocess image accordingly + input_data.append(cv2.imread(frame_path)[...,::-1]/1.) 
+ + + # Preprocess data + vid_data = self.transforms(input_data) + labels = torch.from_numpy(labels).float() + + # Permute the PIL dimensions (Frame, Height, Width, Chan) to pytorch (Chan, frame, height, width) + vid_data = vid_data.permute(3, 0, 1, 2) + + ret_dict = dict() + ret_dict['data'] = vid_data + + annot_dict = dict() + annot_dict['labels'] = labels + + ret_dict['annots'] = annot_dict + + return ret_dict + + +#dataset = HMDB51(json_path='/z/dat/HMDB51', dataset_type='train', clip_length=100, num_clips=0) +#dat = dataset.__getitem__(0) +#import pdb; pdb.set_trace() diff --git a/eval.py b/eval.py index 2d5ed2e..41ab417 100644 --- a/eval.py +++ b/eval.py @@ -67,6 +67,9 @@ def eval(**args): if args['load_type'] == 'train_val': eval_loader = loader['valid'] + elif args['load_type'] == 'train': + eval_loader = loader['train'] + elif args['load_type'] == 'test': eval_loader = loader['test'] @@ -88,14 +91,28 @@ def eval(**args): # Setup Model To Evaluate model.eval() + ret_data = None + ret_labels = None + with torch.no_grad(): for step, data in enumerate(eval_loader): x_input = data['data'].to(device) annotations = data['annots'] - outputs = model(x_input) + outputs = model(x_input, features=True) + + if ret_data is None: + ret_data = outputs.cpu().numpy() + ret_labels = annotations['labels'].cpu().numpy()[:, 0] + + else: + ret_data = np.vstack((ret_data, outputs.cpu().numpy())) + ret_labels = np.hstack((ret_labels, annotations['labels'].cpu().numpy()[:, 0])) - acc = acc_metric.get_accuracy(outputs, annotations) + # END IF + + + #acc = acc_metric.get_accuracy(outputs, annotations) if step % 100 == 0: print('Step: {}/{} | {} acc: {:.4f}'.format(step, len(eval_loader), args['load_type'], acc)) @@ -103,6 +120,12 @@ def eval(**args): print('Accuracy of the network on the {} set: {:.3f} %\n'.format(args['load_type'], 100.*acc)) if not args['debug']: + ret_dict = {} + ret_dict['data'] = ret_data + ret_dict['labels'] = ret_labels + import scipy.io as sio + sio.savemat(args['load_type']+'_'+args['dataset']+'.mat', ret_dict) + writer.add_scalar(args['dataset']+'/'+args['model']+'/'+args['load_type']+'_accuracy', 100.*acc) # Close Tensorboard Element writer.close() diff --git a/models/c3d/c3d.py b/models/c3d/c3d.py index 69e4a74..387ffd7 100644 --- a/models/c3d/c3d.py +++ b/models/c3d/c3d.py @@ -55,7 +55,7 @@ def __init__(self, **kwargs): if isinstance(kwargs['pretrained'], int) and kwargs['pretrained']: self.__load_pretrained_weights() - def forward(self, x, labels=False): + def forward(self, x, labels=False, features=False): x = self.relu(self.conv1(x)) x = self.pool1(x) @@ -78,6 +78,9 @@ def forward(self, x, labels=False): x = self.relu(self.fc6(x)) + if features: + return x + x = self.dropout(x) x = self.relu(self.fc7(x)) x = self.dropout(x) From 252021ce6f9ab13c1ddd9746647df6a709031a82 Mon Sep 17 00:00:00 2001 From: Stephan Lemmer Date: Thu, 15 Aug 2019 09:43:09 -0400 Subject: [PATCH 07/55] Adding gradient clipping (issue #5) --- config_default_example.yaml | 1 + train.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/config_default_example.yaml b/config_default_example.yaml index 6094268..52a0f28 100644 --- a/config_default_example.yaml +++ b/config_default_example.yaml @@ -37,3 +37,4 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay +grad_max_norm: 0.1 # Norm for gradient clipping diff --git a/train.py b/train.py index 1876c5c..e4845bb 100644 --- a/train.py 
+++ b/train.py @@ -180,6 +180,10 @@ def train(**args): # Apply large mini-batch normalization for param in model.parameters(): param.grad *= 1./float(args['pseudo_batch_loop']*args['batch_size']) + + # Apply gradient clipping + if ("grad_max_norm" in args) and float(args['grad_max_norm'] > 0): + nn.utils.clip_grad_norm_(model.parameters(),float(args['grad_max_norm'])) optimizer.step() From 1638e50ab8e2de4f763d10644c9f39923bb0d720 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 15 Aug 2019 14:51:28 -0400 Subject: [PATCH 08/55] Do not scale non-existent gradients --- train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/train.py b/train.py index e4845bb..eab0b2c 100644 --- a/train.py +++ b/train.py @@ -179,7 +179,8 @@ def train(**args): if (epoch * len(train_loader) + (step+1)) % args['pseudo_batch_loop'] == 0 and step > 0: # Apply large mini-batch normalization for param in model.parameters(): - param.grad *= 1./float(args['pseudo_batch_loop']*args['batch_size']) + if param.requires_grad: + param.grad *= 1./float(args['pseudo_batch_loop']*args['batch_size']) # Apply gradient clipping if ("grad_max_norm" in args) and float(args['grad_max_norm'] > 0): From 2860dd39d1ad7ec6bd84786e7f08b00f7ea64009 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 15 Aug 2019 14:53:13 -0400 Subject: [PATCH 09/55] set default grad_max_norm to 0 --- config_default_example.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config_default_example.yaml b/config_default_example.yaml index 52a0f28..4dad7e4 100644 --- a/config_default_example.yaml +++ b/config_default_example.yaml @@ -20,6 +20,7 @@ debug: 0 # If True, do not plot, save, or create epoch: 30 # Total number of epochs exp: exp # Experiment name gamma: 0.1 # Multiplier with which to change learning rate +grad_max_norm: 0 # Norm for gradient clipping json_path: /z/dat/HMDB51/ # Path to the json file for the given dataset labels: 51 # Number of total classes in the dataset load_type: train # Environment selection, to include only training/training and validation/testing dataset @@ -37,4 +38,3 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay -grad_max_norm: 0.1 # Norm for gradient clipping From a54b374c8e88cd717c328274bc7f54c97643a2ac Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 16 Aug 2019 18:04:28 -0400 Subject: [PATCH 10/55] Tweak preprocessing functions to operate on point coordinates & add hand keypoint dataset --- datasets/Manual_Hands.py | 125 +++++++++++++++++++++++++++ datasets/preprocessing_transforms.py | 113 ++++++++++++++++++++---- 2 files changed, 220 insertions(+), 18 deletions(-) create mode 100644 datasets/Manual_Hands.py diff --git a/datasets/Manual_Hands.py b/datasets/Manual_Hands.py new file mode 100644 index 0000000..d5c1a00 --- /dev/null +++ b/datasets/Manual_Hands.py @@ -0,0 +1,125 @@ +import torch +import torchvision +from .abstract_datasets import DetectionDataset +import cv2 +import os +import numpy as np +import json + +class Manual_Hands(DetectionDataset): + """ + Manually-annotated keypoints on hands for pose estimation. 
+ Includes images from The MPII Human Pose and New Zealand Sign Language (NZSL) datasets + + Source: https://arxiv.org/1704.07809 + """ + def __init__(self, *args, **kwargs): + super(Manual_Hands, self).__init__(*args, **kwargs) + + self.load_type = kwargs['load_type'] + self.json_path = kwargs['json_path'] + + # Maximum number of annotated object present in a single frame in entire dataset + # Dictates the return size of annotations in __getitem__ + self.max_objects = 1 + self.sigma = 3.0 + self.stride = 8 #effective stride of the entire network + + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + + else: + self.transforms = kwargs['model_obj'].test_transforms + + #Adapted from: https://github.com/namedBen/Convolutional-Pose-Machines-Pytorch + def gaussian_kernel(self, size_w, size_h, center_x, center_y, sigma): + #Outputs a gaussian heat map on defined point + gridy, gridx = torch.meshgrid(torch.arange(0,size_h,dtype=torch.float), torch.arange(0,size_w,dtype=torch.float)) + D2 = (gridx - center_x)**2 + (gridy - center_y)**2 + + return torch.exp(-0.5 * D2 / sigma**2) + + def __getitem__(self, idx): + vid_info = self.samples[idx] + + base_path = vid_info['base_path'] + vid_size = vid_info['frame_size'] + + input_data = [] + vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1 + hand_pts_data = np.zeros((self.clip_length, self.max_objects, 21, 3))-1 + labels = np.zeros((self.clip_length, self.max_objects))-1 + occlusions = np.zeros((self.clip_length, self.max_objects, 22), dtype=np.int32)-1 #21 keypoints + background = 22 points + + for frame_ind in range(len(vid_info['frames'])): + frame = vid_info['frames'][frame_ind] + width, height = vid_info['frame_size'] + frame_path = frame['img_path'] + + # Extract bbox and label data from video info + for obj in frame['objs']: + #trackid = obj['trackid'] #Let's ignore trackid for now, only one annotation per image + trackid = 0 + label = 1 if obj['c'] == 'left' else 0 #1: left hand, 0: right hand + occluded = obj['occ'] + obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] + body_pts = obj['body_pts'] #16 points (x,y,valid) + hand_pts = obj['hand_pts'] #21 points (x,y,valid) + head_box = obj['head_box'] + head_size = obj['head_size'] #max dim of tightest box around head + hand_ctr = obj['hand_ctr'] + mpii = obj['mpii'] + + #During training square patch is 2.2*B where B is max(obj_bbox) + if self.load_type == 'train': + B = max(obj_bbox[2]-obj_bbox[0], obj_bbox[3]-obj_bbox[1]) + else: #During testing B is 0.7*head_size + B = 0.7*head_size + + hand_size = 2.2*B + xtl = np.clip(int(hand_ctr[0]-hand_size/2), 0, width) + ytl = np.clip(int(hand_ctr[1]-hand_size/2), 0, height) + xbr = np.clip(int(hand_ctr[0]+hand_size/2), 0, width) + ybr = np.clip(int(hand_ctr[1]+hand_size/2), 0, height) + + hand_crop = [xtl, ytl, xbr, ybr] + bbox_data[frame_ind, trackid, :] = obj_bbox + labels[frame_ind, trackid] = label + hand_pts_data[frame_ind, trackid, :] = hand_pts + occlusions[frame_ind, trackid] = occluded + [0] #Add element for background + + # Load frame, convert to RGB from BGR and normalize from 0 to 1 + input_data.append(cv2.imread(os.path.join(base_path, frame_path))[...,::-1]) + + #Crop hand and resize, perform same transforms to ground truth keypoints + vid_data, hand_pts_coords = self.transforms(input_data, hand_pts_data[:,:,:,:2], hand_crop, labels) + + h_width = int(self.final_shape[1]/self.stride) + h_height = 
int(self.final_shape[0]/self.stride) + heatmaps = torch.zeros((22, h_width, h_height), dtype=torch.float) #heatmaps for 21 keypoints + background + for i,pts in enumerate(hand_pts_coords[0][0]): + x = pts[0] / self.stride + y = pts[1] / self.stride + heatmaps[i,:,:] = self.gaussian_kernel(h_width, h_height, x, y, self.sigma) + + heatmaps[-1,:,:] = 1 - torch.max(heatmaps[:-1,:,:], dim=0)[0] #Last layer is background + + vid_data = vid_data/255 + # Permute the PIL dimensions (Frame, Height, Width, Chan) to pytorch (Chan, frame, height, width) + vid_data = vid_data.permute(3, 0, 1, 2) + vid_data = vid_data.squeeze(1) #Remove frame dimension, b/c this is an image dataset + + ret_dict = dict() + ret_dict['data'] = vid_data + annot_dict = dict() + annot_dict['head_size'] = head_size + annot_dict['hand_pts'] = hand_pts_coords + annot_dict['heatmaps'] = heatmaps + annot_dict['labels'] = labels + annot_dict['occ'] = occlusions + annot_dict['frame_path'] = frame_path + annot_dict['frame_size'] = vid_size #width, height + ret_dict['annots'] = annot_dict + + return ret_dict diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 3f5cc10..34513c5 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -123,6 +123,22 @@ def resize_bbox(self, xmin, ymin, xmax, ymax, img_shape, resize_shape): return xmin_new, ymin_new, xmax_new, ymax_new + def resize_pt_coords(self, x, y, img_shape, resize_shape): + # Get relative position for point coords within a frame, after it's resized + + img_h = img_shape[0] + img_w = img_shape[1] + + res_h = resize_shape[0] + res_w = resize_shape[1] + + frac_h = res_h/float(img_h) + frac_w = res_w/float(img_w) + + x_new = (x * frac_w).astype(int) + y_new = (y * frac_h).astype(int) + + return x_new, y_new def __call__(self, clip, bbox=[]): @@ -136,11 +152,16 @@ def __call__(self, clip, bbox=[]): out_clip.append(proc_frame) if bbox!=[]: temp_bbox = np.zeros(bbox[frame_ind].shape)-1 - for class_ind in range(len(bbox[frame_ind])): - if np.array_equal(bbox[frame_ind,class_ind],-1*np.ones(4)): #only annotated objects + for class_ind, box in enumerate(bbox[frame_ind]): + if np.array_equal(box,-1*np.ones(box.shape)): #only annotated objects continue - xmin, ymin, xmax, ymax = bbox[frame_ind, class_ind] - proc_bbox = self.resize_bbox(xmin, ymin, xmax, ymax, frame.shape, (self.size_h, self.size_w)) + + if box.shape[-1] == 2: #Operate on point coordinates + proc_bbox = np.stack(self.resize_pt_coords(box[:,0], box[:,1], frame.shape, (self.size_h, self.size_w)),1) + else: #Operate on bounding box + xmin, ymin, xmax, ymax = box + proc_bbox = self.resize_bbox(xmin, ymin, xmax, ymax, frame.shape, (self.size_h, self.size_w)) + temp_bbox[class_ind,:] = proc_bbox out_bbox.append(temp_bbox) @@ -155,7 +176,7 @@ def __call__(self, clip, bbox=[]): class CropClip(PreprocTransform): - def __init__(self, xmin, xmax, ymin, ymax, *args, **kwargs): + def __init__(self, xmin=None, xmax=None, ymin=None, ymax=None, *args, **kwargs): super(CropClip, self).__init__(*args, **kwargs) self.crop_xmin = xmin self.crop_xmax = xmax @@ -165,12 +186,26 @@ def __init__(self, xmin, xmax, ymin, ymax, *args, **kwargs): self.crop_h, self.crop_w = kwargs['crop_shape'] - def _update_bbox(self, xmin, xmax, ymin, ymax): + def _update_bbox(self, xmin, xmax, ymin, ymax, update_crop_shape=False): + ''' + Args: + xmin (Float, shape []): + xmax (Float, shape []): + ymin (Float, shape []): + ymax (Float, shape []): + update_crop_shape (Boolean): Update 
expected crop shape along with bbox update call + ''' self.crop_xmin = xmin self.crop_xmax = xmax self.crop_ymin = ymin self.crop_ymax = ymax + if update_crop_shape: + self.crop_h = ymax - ymin + self.crop_w = xmax - xmin + + def update_crop_shape(self, crop_h, crop_w): + pass def crop_bbox(self, xmin, ymin, xmax, ymax, crop_xmin, crop_ymin, crop_xmax, crop_ymax): if (xmin >= crop_xmax) or (xmax <= crop_xmin) or (ymin >= crop_ymax) or (ymax <= crop_ymin): @@ -198,8 +233,15 @@ def crop_bbox(self, xmin, ymin, xmax, ymax, crop_xmin, crop_ymin, crop_xmax, cro return xmin_new-crop_xmin, ymin_new-crop_ymin, xmax_new-crop_xmin, ymax_new-crop_ymin + def crop_coords(self, x, y, crop_xmin, crop_ymin, crop_xmax, crop_ymax): + if np.any(x >= crop_xmax) or np.any(x <= crop_xmin) or np.any(y >= crop_ymax) or np.any(y <= crop_ymin): + return -1*np.ones(x.shape), -1*np.ones(y.shape) - + x_new = np.clip(x, crop_xmin, crop_xmax) + y_new = np.clip(x, crop_xmin, crop_xmax) + + return x_new-crop_xmin, y_new-crop_xmin + def __call__(self, clip, bbox=[]): out_clip = [] out_bbox = [] @@ -213,11 +255,15 @@ def __call__(self, clip, bbox=[]): if bbox!=[]: temp_bbox = np.zeros(bbox[frame_ind].shape)-1 - for class_ind in range(len(bbox)): - if np.array_equal(bbox[frame_ind,class_ind],-1*np.ones(4)): #only annotated objects + for class_ind, box in enumerate(bbox[frame_ind]): + if np.array_equal(box,-1*np.ones(box.shape)): #only annotated objects continue - xmin, ymin, xmax, ymax = bbox[frame_ind, class_ind] - proc_bbox = self.crop_bbox(xmin, ymin, xmax, ymax, self.crop_xmin, self.crop_ymin, self.crop_xmax, self.crop_ymax) + + if box.shape[-1] == 2: #Operate on point coordinates + proc_bbox = np.stack(self.crop_coords(box[:,0], box[:,1], self.crop_xmin, self.crop_ymin, self.crop_xmax, self.crop_ymax), 1) + else: #Operate on bounding box + xmin, ymin, xmax, ymax = box + proc_bbox = self.crop_bbox(xmin, ymin, xmax, ymax, self.crop_xmin, self.crop_ymin, self.crop_xmax, self.crop_ymax) temp_bbox[class_ind,:] = proc_bbox out_bbox.append(temp_bbox) @@ -305,6 +351,9 @@ def __init__(self, direction='h', p=0.5, *args, **kwargs): super(RandomFlipClip, self).__init__(*args, **kwargs) self.direction = direction self.p = p + + def _update_p(self, p): + self.p = p def _random_flip(self): flip_prob = np.random.random() @@ -314,17 +363,45 @@ def _random_flip(self): return 1 def _h_flip(self, bbox, frame_size): + width = frame_size[1] bbox_shape = bbox.shape output_bbox = np.zeros(bbox_shape)-1 - for bbox_ind in range(bbox_shape[0]): - xmin, ymin, xmax, ymax = bbox[bbox_ind] - width = frame_size[1] - xmax_new = width - xmin - xmin_new = width - xmax - output_bbox[bbox_ind] = xmin_new, ymin, xmax_new, ymax - return output_bbox + for bbox_ind, box in enumerate(bbox): + if np.array_equal(box,-1*np.ones(box.shape)): #only annotated objects + continue + + if box.shape[-1] == 2: #Operate on point coordinates + x = box[:,0] + x_new = width - x + + output_bbox[bbox_ind] = np.stack((x_new,box[:,1]),1) + else: #Operate on bounding box + xmin, ymin, xmax, ymax = box + xmax_new = width - xmin + xmin_new = width - xmax + output_bbox[bbox_ind] = xmin_new, ymin, xmax_new, ymax + return output_bbox def _v_flip(self, bbox, frame_size): + height = frame_size[0] + bbox_shape = bbox.shape + output_bbox = np.zeros(bbox_shape)-1 + for bbox_ind, box in enumerate(bbox): + if np.array_equal(box,-1*np.ones(box.shape)): #only annotated objects + continue + + if box.shape[-1] == 2: #Operate on point coordinates + y = box[:,1] + y_new = height - y + + 
output_bbox[bbox_ind] = np.stack((box[:,0],y_new),1) + else: #Operate on bounding box + xmin, ymin, xmax, ymax = box + ymax_new = height - ymin + ymin_new = height - ymax + output_bbox[bbox_ind] = xmin, ymin_new, xmax, ymax_new + return output_bbox + bbox_shape = bbox.shape output_bbox = np.zeros(bbox_shape)-1 for bbox_ind in range(bbox_shape[0]): From e3744aba7fecb021eb6150c18f1df961b7913869 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 16 Aug 2019 19:24:56 -0400 Subject: [PATCH 11/55] Add preprocessing test for point coords --- datasets/preprocessing_transforms.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 34513c5..7d674a7 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -738,6 +738,14 @@ def resize_test(self): exp_bbox = np.array([[[0,0,1,2]]]) assert (False not in np.isclose(bbox_out, exp_bbox)) + coord_pts = np.array([[[[1,1], [7,5], [9,6]]]]).astype(float) + _, bbox_out = self.resize(inp, coord_pts) + exp_bbox = np.array([[[[0., 0.], + [3., 3.], + [4., 4.]]]]) + assert (False not in np.isclose(bbox_out, exp_bbox)) + + def crop_test(self): inp = np.array([[[.1,.2,.3],[.4,.5,.6],[.7,.8,.9]]]).astype(float) self.crop._update_bbox(1, 3, 1, 3) From 81045eeef10fa3e975a41612bbd8fc7205b44b0b Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sat, 17 Aug 2019 18:18:49 -0400 Subject: [PATCH 12/55] Add rotations for coordinate points + one small fix --- datasets/preprocessing_transforms.py | 44 ++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 7d674a7..f80a7b6 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -204,9 +204,6 @@ def _update_bbox(self, xmin, xmax, ymin, ymax, update_crop_shape=False): self.crop_h = ymax - ymin self.crop_w = xmax - xmin - def update_crop_shape(self, crop_h, crop_w): - pass - def crop_bbox(self, xmin, ymin, xmax, ymax, crop_xmin, crop_ymin, crop_xmax, crop_ymax): if (xmin >= crop_xmax) or (xmax <= crop_xmin) or (ymin >= crop_ymax) or (ymax <= crop_ymin): return -1, -1, -1, -1 @@ -238,9 +235,9 @@ def crop_coords(self, x, y, crop_xmin, crop_ymin, crop_xmax, crop_ymax): return -1*np.ones(x.shape), -1*np.ones(y.shape) x_new = np.clip(x, crop_xmin, crop_xmax) - y_new = np.clip(x, crop_xmin, crop_xmax) + y_new = np.clip(y, crop_ymin, crop_ymax) - return x_new-crop_xmin, y_new-crop_xmin + return x_new-crop_xmin, y_new-crop_ymin def __call__(self, clip, bbox=[]): out_clip = [] @@ -547,6 +544,38 @@ def _rotate_bbox(self, bboxes, frame_shape, angle): return output_bboxes + def _rotate_coords(self, bboxes, frame_shape, angle): + angle = np.deg2rad(angle) + bboxes_shape = bboxes.shape + output_bboxes = np.zeros(bboxes_shape)-1 + frame_h, frame_w = frame_shape[0], frame_shape[1] + half_h = frame_h/2. + half_w = frame_w/2. 
+ + for bbox_ind in range(bboxes_shape[0]): + x, y = bboxes[bbox_ind].transpose() + ''' + import pdb; pdb.set_trace() + x = np.array([1]) + y = np.array([0]) + half_w = 1 + half_h = 1 + ''' + + pts = (x-half_w, y-half_h) + + pts = self._cart2pol(pts) + + pts = (pts[0], pts[1]-angle) + + pts = self._pol2cart(pts) + + pts = (pts[0]+half_w, pts[1]+half_h) + + output_bboxes[bbox_ind,:,0] = (np.clip(pts[0], 0, frame_w-1)) + output_bboxes[bbox_ind,:,1] = (np.clip(pts[1], 0, frame_h-1)) + + return output_bboxes def __call__(self, clip, bbox=[]): angle = np.random.choice(self.angles) @@ -559,7 +588,10 @@ def __call__(self, clip, bbox=[]): bbox = np.array(bbox) output_bboxes = np.zeros(bbox.shape)-1 for bbox_ind in range(bbox.shape[0]): - output_bboxes[bbox_ind] = self._rotate_bbox(bbox[bbox_ind], clip[0].shape, angle) + if bbox.shape[-1] == 2: + output_bboxes[bbox_ind] = self._rotate_coords(bbox[bbox_ind], clip[0].shape, angle) + else: + output_bboxes[bbox_ind] = self._rotate_bbox(bbox[bbox_ind], clip[0].shape, angle) return output_clip, output_bboxes From 07fb961c96126211a4a1bab7701199299f9ed42a Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 21 Aug 2019 22:17:54 -0400 Subject: [PATCH 13/55] Clean up comments --- datasets/preprocessing_transforms.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index f80a7b6..e98a403 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -554,13 +554,6 @@ def _rotate_coords(self, bboxes, frame_shape, angle): for bbox_ind in range(bboxes_shape[0]): x, y = bboxes[bbox_ind].transpose() - ''' - import pdb; pdb.set_trace() - x = np.array([1]) - y = np.array([0]) - half_w = 1 - half_h = 1 - ''' pts = (x-half_w, y-half_h) From 4a495e9f4f3f44b21c2b06dd6e826ef625981e77 Mon Sep 17 00:00:00 2001 From: Madan Date: Fri, 23 Aug 2019 14:09:15 -0400 Subject: [PATCH 14/55] Added functionality of resume option after loading model, adjusted batch size to match true batch size, numpy seed ajustment and learning rate decay --- config_default_example.yaml | 3 ++- parse_args.py | 1 + train.py | 39 ++++++++++++++++++++++++------------- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/config_default_example.yaml b/config_default_example.yaml index 6094268..49dc68b 100644 --- a/config_default_example.yaml +++ b/config_default_example.yaml @@ -14,7 +14,7 @@ subtract_mean: '' # Subtract mean (R,G,B) from all frames # Experiment Setup acc_metric: Accuracy # Accuracy metric -batch_size: 3 # Numbers of videos in a mini-batch +batch_size: 15 # Numbers of videos in a mini-batch dataset: HMDB51 # Name of dataset debug: 0 # If True, do not plot, save, or create data files epoch: 30 # Total number of epochs @@ -37,3 +37,4 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay +resume: 0 # Flag to resume training or switch to alternate objective after loading diff --git a/parse_args.py b/parse_args.py index 9601473..4501622 100644 --- a/parse_args.py +++ b/parse_args.py @@ -50,6 +50,7 @@ def __init__(self): parser.add_argument('--debug', type=int, help='Run an experiment but do not save any data or create any folders') parser.add_argument('--seed', type=int, help='Seed for reproducibility') + parser.add_argument('--resume', type=int, help='Flag to resume training or switch to alternate objective after loading') # Default dict, anything 
not present is required to exist as an argument or in yaml file self.defaults = dict( diff --git a/train.py b/train.py index 1876c5c..9082c71 100644 --- a/train.py +++ b/train.py @@ -106,15 +106,16 @@ def train(**args): scheduler = MultiStepLR(optimizer, milestones=args['milestones'], gamma=args['gamma']) if isinstance(args['pretrained'], str): - ckpt = load_checkpoint(args['pretrained']) + ckpt = load_checkpoint(args['pretrained']) model.load_state_dict(ckpt) - start_epoch = load_checkpoint(args['pretrained'], key_name='epoch') + 1 - optimizer.load_state_dict(load_checkpoint(args['pretrained'], key_name='optimizer')) - for quick_looper in range(start_epoch): - scheduler.step() + if args['resume']: + start_epoch = load_checkpoint(args['pretrained'], key_name='epoch') + 1 - # END FOR + optimizer.load_state_dict(load_checkpoint(args['pretrained'], key_name='optimizer')) + scheduler.step(epoch=start_epoch) + + # END IF else: start_epoch = 0 @@ -139,6 +140,7 @@ def train(**args): for step, data in enumerate(train_loader): if step% args['pseudo_batch_loop'] == 0: loss = 0.0 + running_batch = 0 optimizer.zero_grad() # END IF @@ -149,10 +151,11 @@ def train(**args): assert args['final_shape']==list(x_input.size()[-2:]), "Input to model does not match final_shape argument" outputs = model(x_input) loss = model_loss.loss(outputs, annotations) - loss = loss * args['batch_size'] + loss = loss * outputs.shape[0] loss.backward() - running_loss += loss.item() + running_loss += loss.item() + running_batch += outputs.shape[0] if np.isnan(running_loss): import pdb; pdb.set_trace() @@ -167,21 +170,24 @@ def train(**args): # END FOR # Add Loss Element - writer.add_scalar(args['dataset']+'/'+args['model']+'/minibatch_loss', loss.item()/args['batch_size'], epoch*len(train_loader) + step) + writer.add_scalar(args['dataset']+'/'+args['model']+'/minibatch_loss', loss.item()/outputs.shape[0], epoch*len(train_loader) + step) # END IF if ((epoch*len(train_loader) + step+1) % 100 == 0): - print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.format(epoch, args['epoch'], step+1, len(train_loader), running_loss/float(step+1)/args['batch_size'])) + print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.format(epoch, args['epoch'], step+1, len(train_loader), running_loss/float(step+1)/outputs.shape[0])) # END IF if (epoch * len(train_loader) + (step+1)) % args['pseudo_batch_loop'] == 0 and step > 0: # Apply large mini-batch normalization for param in model.parameters(): - param.grad *= 1./float(args['pseudo_batch_loop']*args['batch_size']) - optimizer.step() + param.grad *= 1./float(running_batch) + + # END FOR + optimizer.step() + running_batch = 0 # END IF @@ -201,8 +207,10 @@ def train(**args): ## START FOR: Validation Accuracy running_acc = [] running_acc = valid(valid_loader, running_acc, model, device, acc_metric) + if not args['debug']: - writer.add_scalar(args['dataset']+'/'+args['model']+'/validation_accuracy', 100.*running_acc[-1], epoch*len(valid_loader) + step) + writer.add_scalar(args['dataset']+'/'+args['model']+'/validation_accuracy', 100.*running_acc[-1], epoch*len(train_loader) + step) + print('Accuracy of the network on the validation set: %f %%\n' % (100.*running_acc[-1])) # Save Best Validation Accuracy Model Separately @@ -246,10 +254,13 @@ def valid(valid_loader, running_acc, model, device, acc_metric): parse = Parse() args = parse.get_args() + import pdb; pdb.set_trace() # For reproducibility torch.backends.cudnn.deterministic = True torch.manual_seed(args['seed']) - 
#np.random.seed(args['seed']+1) + + if not args['resume']: + np.random.seed(args['seed']) train(**args) From 95bf52e1332429c6243a7e493344516420ff284d Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 23 Aug 2019 19:54:39 -0400 Subject: [PATCH 15/55] Added affine translation transform on images, bbox, and coordinate points --- datasets/preprocessing_transforms.py | 77 ++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index e98a403..7dd7414 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -590,7 +590,84 @@ def __call__(self, clip, bbox=[]): return output_clip +class RandomTranslateClip(PreprocTransform): + """ + Random horizontal and/or vertical shift on frames in a clip + Shift will be bounded by object bounding box (if given). Meaning, object will always be in view + + Args: + - translate (Tuple) + - max_x (float): maximum absolute fraction for horizontal shift + - max_y (float): maximum absolute fraction for vertical shift + """ + def __init__(self, translate, **kwargs): + super(RandomTranslateClip, self).__init__(**kwargs) + + self.max_x, self.max_y = translate + + assert(self.max_x >= 0.0 and self.max_y >= 0.0) + assert(self.max_x < 1.0 and self.max_y < 1.0) #Cannot shift pass image bounds + + def _shift_frame(self, bbox, frame, tx, ty): + M = np.array([[1, 0, tx],[0, 1, ty]], dtype=np.float) # 2 x 3 transformation matrix + out_frame = cv2.warpAffine(frame, M, (frame.shape[1], frame.shape[0])) + + if bbox is not None: + if box.shape[-1] == 2: #Operate on point coordinates + bbox_h = np.concatenate((box, np.ones((box.shape[0],1))), axis=1).transpose() #homography coords + out_box = M @ bbox_h + else: #Operate on bounding box + bbox_h = np.reshape(bbox, (-1,2)) #x-y coords + bbox_h = np.concatenate((bbox_h, np.ones((bbox_h.shape[0],1))), axis=1).transpose() #homography coords + + out_box = M @ bbox_h + out_box = np.reshape(out_box.transpose(), (-1,4)) + + return out_frame, out_box + else: + return out_frame + + def __call__(self, clip, bbox=[]): + out_clip = [] + clip = self._to_numpy(clip) + + frac_x = np.random.rand()*(2*self.max_x)-self.max_x + frac_y = np.random.rand()*(2*self.max_y)-self.max_y + + if bbox != []: + out_bbox = [] + + for frame, box in zip(clip,bbox): + mask = box[:,0] != -1 + img_h, img_w, _ = frame.shape + tx = int(img_w * frac_x) + ty = int(img_h * frac_y) + + #Bound translation amount so all objects remain in scene + if box.shape[-1] == 2: #Operate on point coordinates + tx = np.clip(tx, np.max(-1*box[mask,0]), np.min(img_w-box[mask,0])) + ty = np.clip(ty, np.max(-1*box[mask,1]), np.min(img_h-box[mask,1])) + out_frame, out_box = self._shift_frame(box, frame, tx, ty) + out_box[~mask] = -1*np.ones(2) + + else: #Operate on bounding box + #bbox is bounding box object + tx = np.clip(tx, np.max(-1*box[mask,0]), np.min(img_w-box[mask,2])) + ty = np.clip(ty, np.max(-1*box[mask,1]), np.min(img_h-box[mask,3])) + out_frame, out_box = self._shift_frame(box, frame, tx, ty) + out_box[~mask] = -1*np.ones(4) + + out_clip.append(out_frame) + out_bbox.append(out_box) + + return out_clip, out_bbox + else: + for frame in clip: + img_h, img_w, _ = frame.shape + tx = int(img_w * frac_x) + ty = int(img_h * frac_y) + out_clip.append(self._shift_frame(None, frame, tx, ty)) class SubtractMeanClip(PreprocTransform): def __init__(self, **kwargs): From 5a2c8024e9eb4ed9eda9e47e63e6c92808f7de5b Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: 
Mon, 26 Aug 2019 19:31:23 -0400 Subject: [PATCH 16/55] Tested on coordinate points --- datasets/preprocessing_transforms.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 7dd7414..6eff344 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -613,14 +613,14 @@ def _shift_frame(self, bbox, frame, tx, ty): out_frame = cv2.warpAffine(frame, M, (frame.shape[1], frame.shape[0])) if bbox is not None: - if box.shape[-1] == 2: #Operate on point coordinates - bbox_h = np.concatenate((box, np.ones((box.shape[0],1))), axis=1).transpose() #homography coords - out_box = M @ bbox_h - else: #Operate on bounding box - bbox_h = np.reshape(bbox, (-1,2)) #x-y coords - bbox_h = np.concatenate((bbox_h, np.ones((bbox_h.shape[0],1))), axis=1).transpose() #homography coords + bbox_h = np.reshape(bbox, (-1,2)) #x-y coords + bbox_h = np.concatenate((bbox_h, np.ones((bbox_h.shape[0],1))), axis=1).transpose() #homography coords - out_box = M @ bbox_h + out_box = M @ bbox_h + + if bbox.shape[-1] == 2: #Operate on point coordinates + out_box = np.reshape(out_box.transpose(), (bbox.shape[0], bbox.shape[1],2)) + else: #Operate on bounding box out_box = np.reshape(out_box.transpose(), (-1,4)) return out_frame, out_box @@ -638,13 +638,13 @@ def __call__(self, clip, bbox=[]): out_bbox = [] for frame, box in zip(clip,bbox): - mask = box[:,0] != -1 img_h, img_w, _ = frame.shape tx = int(img_w * frac_x) ty = int(img_h * frac_y) #Bound translation amount so all objects remain in scene if box.shape[-1] == 2: #Operate on point coordinates + mask = box[:,:,0] != -1 tx = np.clip(tx, np.max(-1*box[mask,0]), np.min(img_w-box[mask,0])) ty = np.clip(ty, np.max(-1*box[mask,1]), np.min(img_h-box[mask,1])) out_frame, out_box = self._shift_frame(box, frame, tx, ty) @@ -652,6 +652,7 @@ def __call__(self, clip, bbox=[]): else: #Operate on bounding box #bbox is bounding box object + mask = box[:,0] != -1 tx = np.clip(tx, np.max(-1*box[mask,0]), np.min(img_w-box[mask,2])) ty = np.clip(ty, np.max(-1*box[mask,1]), np.min(img_h-box[mask,3])) out_frame, out_box = self._shift_frame(box, frame, tx, ty) From 7547e26967b3b25f1dc4c4adbc7e13ba9260e97a Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Mon, 26 Aug 2019 19:35:44 -0400 Subject: [PATCH 17/55] Convert objects back to numpy after ApplyToPIL --- datasets/preprocessing_transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 6eff344..23c5196 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -739,7 +739,7 @@ def __call__(self, clip, bbox=[]): clip = self._to_pil(clip) output_clip = [] for frame in clip: - output_clip.append(self.transform(frame)) + output_clip.append(np.array(self.transform(frame))) #Apply transform and convert back to Numpy if bbox!=[]: return output_clip, bbox From 1b62a43e3253e76801b9bc12491dfdbaa1e72cf0 Mon Sep 17 00:00:00 2001 From: Madan Date: Thu, 29 Aug 2019 17:31:10 -0400 Subject: [PATCH 18/55] Added extra case for start epoch under resume condition --- train.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/train.py b/train.py index 9082c71..9089435 100644 --- a/train.py +++ b/train.py @@ -115,6 +115,9 @@ def train(**args): optimizer.load_state_dict(load_checkpoint(args['pretrained'], key_name='optimizer')) 
scheduler.step(epoch=start_epoch) + else: + start_epoch = 0 + # END IF else: @@ -194,6 +197,9 @@ def train(**args): # END FOR: Epoch + scheduler.step(epoch=epoch) + print('Schedulers lr: %f', scheduler.get_lr()[0]) + if not args['debug']: # Save Current Model save_path = os.path.join(save_dir, args['dataset']+'_epoch'+str(epoch)+'.pkl') @@ -201,9 +207,6 @@ def train(**args): # END IF: Debug - scheduler.step(epoch=epoch) - print('Schedulers lr: %f', scheduler.get_lr()[0]) - ## START FOR: Validation Accuracy running_acc = [] running_acc = valid(valid_loader, running_acc, model, device, acc_metric) @@ -254,7 +257,6 @@ def valid(valid_loader, running_acc, model, device, acc_metric): parse = Parse() args = parse.get_args() - import pdb; pdb.set_trace() # For reproducibility torch.backends.cudnn.deterministic = True From e3207b0b6c3205e9d63dda10d4643eb964b1b804 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 30 Aug 2019 09:23:56 -0400 Subject: [PATCH 19/55] Corrected a few things for loading testing & validation data --- datasets/YC2BB.py | 37 ++++++++++++++++-------------- datasets/scripts/gen_json_yc2bb.py | 1 + 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py index 6a54ae0..fb195a3 100644 --- a/datasets/YC2BB.py +++ b/datasets/YC2BB.py @@ -103,33 +103,36 @@ def __getitem__(self, idx): vid = base_path.split('/')[-2] seg = base_path.split('/')[-1] - bbox_data = np.zeros((self.clip_length, num_frames_1fps, 5))-1 #[cls_label, xmin, ymin, xmax ymax] - labels = np.zeros((self.clip_length, self.max_objects))-1 + bbox_data = np.zeros((self.clip_length, self.max_objects, 5))-1 #[cls_label, xmin, ymin, xmax ymax] + labels = np.zeros(self.max_objects)-1 - for frame_ind in range(num_frames_1fps): + for frame_ind in range(self.clip_length): frame = vid_info['frames'][frame_ind] #frame_path = frame['img_path'] - num_objs = len(frame['objs']) + num_objs = len(frame['objs']) obj_label = np.zeros((num_objs))-1 #List all unique class ids in entire segment # Extract bbox and label data from video info for obj_ind, obj in enumerate(frame['objs']): - label = self.class_dict[obj['c']] - obj_label[obj_ind] = label + label = self.class_dict[obj['c']] + trackid = obj['trackid'] if self.load_type == 'test' or self.load_type == 'train': #Annotations for test set not publicly available, train not annotated - bbox_data[frame_ind, trackid] = [label, -1, -1, -1, -1] + bbox_data[frame_ind, trackid] = -1*np.ones(5) else: - trackid = obj['trackid'] - obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] - difficult = obj['difficult'] - - bbox_data[frame_ind, trackid, :] = [label] + obj_bbox - labels[frame_ind, trackid] = label - diff_labels[frame_ind, trackid] = difficult - - #input_data.append(cv2.imread(os.path.join(base_path, frame_path), cv2.IMREAD_COLOR)[:,:,(2,1,0)]) - + if obj['occ'] or obj['outside']: + bbox_data[frame_ind, trackid] = -1*np.ones(5) + else: + obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] + bbox_data[frame_ind, trackid, :] = [label] + obj_bbox + + obj_label[obj_ind] = label + labels[trackid] = label + + #Only keep annotations for valid objects + bbox_data = bbox_data[:, :num_objs] + labels = labels[:num_objs] + obj_label = torch.from_numpy(obj_label) num_frames = num_frames_1fps * 25 #video sampled at 25 fps diff --git a/datasets/scripts/gen_json_yc2bb.py b/datasets/scripts/gen_json_yc2bb.py index 2d0baf6..4f4b8f2 100644 --- a/datasets/scripts/gen_json_yc2bb.py +++ b/datasets/scripts/gen_json_yc2bb.py @@ -52,6 +52,7 @@ box = obj['boxes'][f] if 
len(box) == 0: #No annotations + objs.append({'trackid':track_id, 'c':cls_name}) continue xmin = box['xtl'] From 26eb766239b392477f32be083ebd34d98ed90c87 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 30 Aug 2019 17:34:26 -0400 Subject: [PATCH 20/55] Reproduce exact results as paper --- datasets/YC2BB.py | 38 ++++-- metrics.py | 122 +++++++++++++++++ models/dvsa/dvsa.py | 147 ++++++++++++++++++++ models/dvsa/dvsa_utils/transformer.py | 190 ++++++++++++++++++++++++++ 4 files changed, 484 insertions(+), 13 deletions(-) create mode 100644 models/dvsa/dvsa.py create mode 100644 models/dvsa/dvsa_utils/transformer.py diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py index fb195a3..0f511f8 100644 --- a/datasets/YC2BB.py +++ b/datasets/YC2BB.py @@ -28,10 +28,12 @@ def __init__(self, *args, **kwargs): self.max_objects = 15 self.class_dict = _get_class_labels(class_file) + ''' if self.load_type=='train': self.transforms = kwargs['model_obj'].train_transforms else: self.transforms = kwargs['model_obj'].test_transforms + ''' sentences_proc, segments_tuple = _get_segments_and_sentences(self.samples, self.load_type) @@ -93,6 +95,10 @@ def __init__(self, *args, **kwargs): split_lst, len(self.sample_lst), len(self.sample_lst)/len(sentences_proc))) ''' + #Reverse-mapping between class index to canonical label name + def _get_class_labels_reverse(self): + return {v:k for k,v in self.class_dict.items()} + def __getitem__(self, idx): vid_info = self.samples[idx] @@ -103,10 +109,10 @@ def __getitem__(self, idx): vid = base_path.split('/')[-2] seg = base_path.split('/')[-1] - bbox_data = np.zeros((self.clip_length, self.max_objects, 5))-1 #[cls_label, xmin, ymin, xmax ymax] + bbox_data = np.zeros((self.max_objects, num_frames_1fps, 5))-1 #[cls_label, xmin, ymin, xmax ymax] labels = np.zeros(self.max_objects)-1 - for frame_ind in range(self.clip_length): + for frame_ind in range(num_frames_1fps): frame = vid_info['frames'][frame_ind] #frame_path = frame['img_path'] num_objs = len(frame['objs']) @@ -118,22 +124,26 @@ def __getitem__(self, idx): trackid = obj['trackid'] if self.load_type == 'test' or self.load_type == 'train': #Annotations for test set not publicly available, train not annotated - bbox_data[frame_ind, trackid] = -1*np.ones(5) + bbox_data[trackid, frame_ind] = -1*np.ones(5) else: if obj['occ'] or obj['outside']: - bbox_data[frame_ind, trackid] = -1*np.ones(5) + bbox_data[trackid, frame_ind] = -1*np.ones(5) else: obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] - bbox_data[frame_ind, trackid, :] = [label] + obj_bbox + + #re-order to [ymin, xmin, ymax, xmax], rpn proposals are this way I believe + new_order = [1,0,3,2] + obj_bbox = [obj_bbox[i] for i in new_order] + bbox_data[trackid, frame_ind, :] = [label] + obj_bbox obj_label[obj_ind] = label labels[trackid] = label #Only keep annotations for valid objects - bbox_data = bbox_data[:, :num_objs] + bbox_data = bbox_data[:num_objs, :] labels = labels[:num_objs] - obj_label = torch.from_numpy(obj_label) + obj_label = torch.from_numpy(obj_label).long() num_frames = num_frames_1fps * 25 #video sampled at 25 fps ''' @@ -182,14 +192,15 @@ def __getitem__(self, idx): vis_name = '_-_'.join((self.yc2_split, rec, vid, seg)) ret_dict = dict() - ret_dict['data'] = (x_rpn, obj_label) + ret_dict['data'] = [x_rpn, obj_label, self.load_type] annot_dict = dict() - annot_dict['box'] = bbox_data - annot_dict['box_label'] = labels - annot_dict['rpn'] = rpn - annot_dict['rpn_original'] = rpn_original - annot_dict['vis_name'] = vis_name + annot_dict['box'] = 
bbox_data + annot_dict['box_label'] = obj_label + annot_dict['rpn'] = rpn + annot_dict['rpn_original'] = rpn_original + annot_dict['vis_name'] = vis_name + annot_dict['class_labels_dict'] = self._get_class_labels_reverse() ret_dict['annots'] = annot_dict return ret_dict @@ -231,3 +242,4 @@ def _get_class_labels(class_file): class_dict[row[r]] = int(row[0]) return class_dict + diff --git a/metrics.py b/metrics.py index f4e22b3..0b6b6bd 100644 --- a/metrics.py +++ b/metrics.py @@ -21,6 +21,8 @@ def __init__(self, *args, **kwargs): self.metric_object = MAP(*args, **kwargs) elif self.metric_type == 'SSD_AP': self.metric_object = SSD_AP(*args, **kwargs) + elif self.metric_type == 'Box_Accuracy': + self.metric_object = Box_Accuracy(*args, **kwargs) else: self.metric_type = None @@ -513,3 +515,123 @@ def get_accuracy(self, detections, data): return self.get_AP(self.predictions, self.targets) +class Box_Accuracy(): + """ + Box accuracy computation + + """ + def __init__(self, *args, **kwargs): + from collections import defaultdict + + self.thresh = kwargs['accu_thresh'] + self.fps = kwargs['fps'] + self.test_mode = 1 if kwargs['load_type'] == 'test' else 0 + self.IOU = IOU() + self.ba_score = defaultdict(list) #box accuracy metric + + self.ndata = kwargs['ndata'] + self.count = 0 + + def get_accuracy(self, predictions, data): + attn_weights = predictions + + N = attn_weights.shape[0] + + rpn_batch = data['rpn_original'] + box_batch = data['box'] + obj_batch = data['box_label'] + box_label_batch = obj_batch + vis_name = data['vis_name'] + class_labels_dict = data['class_labels_dict'] + + # fps is the frame rate of the attention map + # both rpn_batch and box_batch have fps=1 + _, T_rp, num_proposals, _ = rpn_batch.size() + _, O, T_gt, _ = box_batch.size() + T_attn = attn_weights.size(2) + + assert(T_rp == T_gt) # both sampled at 1fps + #print('# of frames in gt: {}, # of frames in resampled attn. 
map: {}'.format(T_gt, np.rint(T_attn/self.fps))) + + hits, misses = [0 for o in range(O)], [0 for o in range(O)] + + results = [] + pos_counter = 0 + neg_counter = 0 + segment_dict = {} #segment dictionary - to output results to JSON file + all_objects = [] + + for o in range(O): + object_dict = {} + if box_label_batch[0, o] not in obj_batch[0, :]: + print('object {} is not grounded!'.format(box_label_batch[0, o])) + continue # don't compute score if the object is not grounded + obj_ind_in_attn = (obj_batch[0, :] == box_label_batch[0, o]).nonzero().squeeze() + if obj_ind_in_attn.numel() > 1: + obj_ind_in_attn = obj_ind_in_attn[0] + else: + obj_ind_in_attn = obj_ind_in_attn.item() + + new_attn_weights = attn_weights[0, obj_ind_in_attn] + _, max_attn_ind = torch.max(new_attn_weights, dim=1) + + # uncomment this for the random baseline + # max_attn_ind = torch.floor(torch.rand(T_attn)*num_proposals).long() + label = class_labels_dict[box_label_batch[0,o].item()] + object_dict = {'label':label} + + boxes = [] + for t in range(T_gt): + if box_batch[0,o,t,0] == -1: # object is outside/non-exist/occlusion + boxes.append({'xtl':-1, 'ytl':-1, 'xbr':-1, 'ybr':-1, 'outside':1, 'occluded':1}) #object is either occluded or outside of frame + neg_counter += 1 + continue + pos_counter += 1 + box_ind = max_attn_ind[int(min(np.rint(t*self.fps), T_attn-1))] + box_coord = rpn_batch[0, t, box_ind, :].view(4) # x_tl, y_tl, x_br, y_br + gt_box = box_batch[0,o,t][torch.Tensor([2,1,4,3]).type(box_batch.type()).long()].view(1,4) # inverse x and y + + if self.IOU.get_accuracy(box_coord, gt_box.float())[0].item() > self.thresh: + hits[o] += 1 + else: + misses[o] += 1 + + xtl = box_coord[0].item() + ytl = box_coord[1].item() + xbr = box_coord[2].item() + ybr = box_coord[3].item() + boxes.append({'xtl':xtl, 'ytl':ytl, 'xbr':xbr, 'ybr':ybr, 'outside':0, 'occluded':0}) + + object_dict['boxes'] = boxes + all_objects.append(object_dict) + + results.append((box_label_batch[0, o].item(), hits[o], misses[o])) + + segment_dict['objects'] = all_objects + #print('percentage of frames with box: {}'.format(pos_counter/(pos_counter+neg_counter))) + + for (i,h,m) in results: + self.ba_score[i].append((h,m)) + + + self.count += N + if self.count < self.ndata: + return -1 + + if self.test_mode: #Annotations for the testing split are not publicly available + return -1 + + ba_final = [] + for k, r in self.ba_score.items(): + cur_hit = 0 + cur_miss = 0 + for v in r: + cur_hit += v[0] + cur_miss += v[1] + + if cur_hit+cur_miss != 0: + #print('BA for {}(...): {:.4f}'.format(k, cur_hit/(cur_hit+cur_miss))) + ba_final.append(cur_hit/(cur_hit+cur_miss)) + + #print('The overall BA is: {:.4f}'.format(np.mean(ba_final))) + return np.mean(ba_final) diff --git a/models/dvsa/dvsa.py b/models/dvsa/dvsa.py new file mode 100644 index 0000000..0543036 --- /dev/null +++ b/models/dvsa/dvsa.py @@ -0,0 +1,147 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +import math +import numpy as np +from functools import partial +import os + +from models.dvsa.dvsa_utils.transformer import Transformer + +class DVSA(nn.Module): + +# def __init__(self, num_class, input_size=2048, enc_size=128, dropout=0.2, hidden_size=256, n_layers=1, n_heads=4, attn_drop=0.2, num_frm=5, has_loss_weighting=False): + def __init__(self, **kwargs): + super().__init__() + num_class = kwargs['num_class'] + input_size = kwargs['input_size'] + enc_size = kwargs['enc_size'] + dropout = kwargs['dropout'] + hidden_size = 
kwargs['hidden_size'] + n_layers = kwargs['n_layers'] + n_heads = kwargs['n_heads'] + attn_drop = kwargs['attn_drop'] + num_frm = kwargs['num_frm'] + has_loss_weighting = kwargs['has_loss_weighting'] + + # encode the region feature + self.feat_enc = nn.Sequential( + nn.Linear(input_size, enc_size), + nn.Dropout(p=dropout), + nn.ReLU() + ) + + self.sigmoid = nn.Sigmoid() + + # lookup table for object label embedding + self.obj_emb = nn.Embedding(num_class+1, enc_size) # +1 for the dummy paddings + self.num_class = num_class + + self.obj_interact = Transformer(enc_size, 0, 0, + d_hidden=hidden_size, + n_layers=n_layers, + n_heads=n_heads, + drop_ratio=attn_drop) + + self.obj_interact_fc = nn.Sequential( + nn.Linear(enc_size*2, int(enc_size/2)), + nn.ReLU(), + nn.Linear(int(enc_size/2), 5), # object interaction guidance (always 5 snippets) + nn.Sigmoid() + ) + + self.num_frm = num_frm + self.has_loss_weighting = has_loss_weighting + + if isinstance(kwargs['pretrained'], int) and kwargs['pretrained']: + self._load_pretrained_weights() + + def forward(self, x_o, obj, load_type): + is_evaluate = 1 if load_type[0] == 'test' or load_type[0] == 'val' else 0 + if is_evaluate: + return self.output_attn(x_o, obj) + + x_o = self.feat_enc(x_o.permute(0,2,3,1).contiguous()).permute(0,3,1,2).contiguous() + + x_o = torch.stack([x_o[0], x_o[1], x_o[0]]) + obj = torch.stack([obj[0], obj[0], obj[1]]) + + N, C_out, T, num_proposals = x_o.size() + assert(N == 3) # two pos samples and one neg sample + + # attention + O = obj.size(1) + attn_key = self.obj_emb(obj) + + num_pos_obj = torch.sum(obj[0]0: + tmp.append(obj_attn_emb[:, :, i:(i+1)].expand(1, num_pos_obj, l)) + obj_attn_emb = torch.cat(tmp, 2).squeeze(0) + assert(obj_attn_emb.size(1) == self.num_frm) + + loss_weigh = torch.mean(obj_attn_emb, dim=0) + loss_weigh = torch.cat((loss_weigh, loss_weigh)).unsqueeze(1) + + if self.has_loss_weighting: + # dot-product attention + x_o = x_o.view(N, 1, C_out, T, num_proposals) + attn_weights = self.sigmoid((x_o*attn_key.view(N, O, C_out, 1, 1)).sum(2)/math.sqrt(C_out)) + + pos_weights = attn_weights[0, :num_pos_obj, :, :] + neg1_weights = attn_weights[1, :num_pos_obj, :, :] + neg2_weights = attn_weights[2, :num_neg_obj, :, :] + + return torch.cat((torch.stack((torch.mean(torch.max(pos_weights, dim=2)[0], dim=0), torch.mean(torch.max(neg1_weights, dim=2)[0], dim=0)), dim=1), + torch.stack((torch.mean(torch.max(pos_weights, dim=2)[0], dim=0), torch.mean(torch.max(neg2_weights, dim=2)[0], dim=0)), dim=1))), loss_weigh + else: + # dot-product attention + x_o = x_o.view(N, 1, C_out, T*num_proposals) + attn_weights = self.sigmoid((x_o*attn_key.view(N, O, C_out, 1)).sum(2)/math.sqrt(C_out)) + + pos_weights = attn_weights[0, :num_pos_obj, :] + neg1_weights = attn_weights[1, :num_pos_obj, :] + neg2_weights = attn_weights[2, :num_neg_obj, :] + + return torch.stack((torch.stack((torch.mean(torch.max(pos_weights, dim=1)[0]), torch.mean(torch.max(neg1_weights, dim=1)[0]))), + torch.stack((torch.mean(torch.max(pos_weights, dim=1)[0]), torch.mean(torch.max(neg2_weights, dim=1)[0]))))), loss_weigh + + def output_attn(self, x_o, obj): + x_o = self.feat_enc(x_o.permute(0,2,3,1).contiguous()).permute(0,3,1,2).contiguous() + + N, C_out, T, num_proposals = x_o.size() + assert(N == 1) # two pos samples and one neg sample + + # attention + O = obj.size(1) + attn_key = self.obj_emb(obj) + + # dot-product attention + x_o = x_o.view(N, 1, C_out, T*num_proposals) + attn_weights = self.sigmoid((x_o*attn_key.view(N, O, C_out, 
1)).sum(2)/math.sqrt(C_out)) + # attn_weights = self.sigmoid((x_e*attn_key.view(N, O, C_out, 1).expand(N, O, C_out, T*num_proposals)).sum(2)) # N, O, T, H*W + + # additive attention + # x_e = x_o.view(N, 1, C_out, T, H*W).contiguous().expand(N, O, C_out, T, H*W) + # attn_e = attn_key.view(N, O, C_out, 1, 1).expand(N, O, C_out, T, H*W) + # attn_weights = self.attn_mlp(torch.cat((x_e, attn_e), dim=2).permute(0,1,3,4,2).contiguous()).squeeze(4) # N, O, T, H*W + + return attn_weights.view(N, O, T, num_proposals) + + def _load_pretrained_weights(self): + state_dict = torch.load('weights/yc2bb_full-model.pth', map_location=lambda storage, location: storage) + + self.load_state_dict(state_dict) + diff --git a/models/dvsa/dvsa_utils/transformer.py b/models/dvsa/dvsa_utils/transformer.py new file mode 100644 index 0000000..38f9e3a --- /dev/null +++ b/models/dvsa/dvsa_utils/transformer.py @@ -0,0 +1,190 @@ +# Originally from https://github.com/salesforce/densecap +""" + Copyright (c) 2018, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" +# Last modified by Luowei Zhou on 07/01/2018 + +import torch +from torch import nn +from torch.nn import functional as F +from torch.autograd import Variable + +import random +import string +import sys +import math +import uuid +import numpy as np + +INF = 1e10 + +def positional_encodings_like(x, t=None): + if t is None: + positions = torch.arange(0, x.size(1)) + if x.is_cuda: + positions = positions.cuda(x.get_device()) + else: + positions = t + encodings = x.new(*x.size()[1:]).fill_(0) + if x.is_cuda: + encodings = encodings.cuda(x.get_device()) + + + for channel in range(x.size(-1)): + if channel % 2 == 0: + encodings[:, channel] = torch.sin( + positions.float() / 10000 ** (channel / x.size(2))) + else: + encodings[:, channel] = torch.cos( + positions.float() / 10000 ** ((channel - 1) / x.size(2))) + return Variable(encodings) + +class Linear(nn.Linear): + + def forward(self, x): + size = x.size() + return super().forward( + x.contiguous().view(-1, size[-1])).view(*size[:-1], -1) + +# F.softmax has strange default behavior, normalizing over dim 0 for 3D inputs +# deprecated since PyTorch 0.3 +# def softmax(x): +# if x.dim() == 3: +# return F.softmax(x.transpose(0, 2)).transpose(0, 2) +# return F.softmax(x) + +# torch.matmul can't do (4, 3, 2) @ (4, 2) -> (4, 3) +def matmul(x, y): + if x.dim() == y.dim(): + return x @ y + if x.dim() == y.dim() - 1: + return (x.unsqueeze(-2) @ y).squeeze(-2) + return (x @ y.unsqueeze(-2)).squeeze(-2) + +class LayerNorm(nn.Module): + + def __init__(self, d_model, eps=1e-6): + super().__init__() + self.gamma = nn.Parameter(torch.ones(d_model)) + self.beta = nn.Parameter(torch.zeros(d_model)) + self.eps = eps + + def forward(self, x): + mean = x.mean(-1, keepdim=True) + std = x.std(-1, keepdim=True) + return self.gamma * (x - mean) / (std + self.eps) + self.beta + +class ResidualBlock(nn.Module): + + def __init__(self, layer, d_model, drop_ratio): + super().__init__() + self.layer = layer + self.dropout = nn.Dropout(drop_ratio) + self.layernorm = LayerNorm(d_model) + + def forward(self, *x): + return self.layernorm(x[0] + self.dropout(self.layer(*x))) + +class Attention(nn.Module): + + def __init__(self, d_key, drop_ratio, causal): + super().__init__() + self.scale = math.sqrt(d_key) + self.dropout = nn.Dropout(drop_ratio) + self.causal = causal + + def forward(self, query, key, 
value): + dot_products = matmul(query, key.transpose(1, 2)) + if query.dim() == 3 and (self is None or self.causal): + tri = torch.ones(key.size(1), key.size(1)).triu(1) * INF + if key.is_cuda: + tri = tri.cuda(key.get_device()) + dot_products.data.sub_(tri.unsqueeze(0)) + return matmul(self.dropout(F.softmax(dot_products / self.scale, dim=2)), value) + +class MultiHead(nn.Module): + + def __init__(self, d_key, d_value, n_heads, drop_ratio, causal=False): + super().__init__() + self.attention = Attention(d_key, drop_ratio, causal=causal) + self.wq = Linear(d_key, d_key, bias=False) + self.wk = Linear(d_key, d_key, bias=False) + self.wv = Linear(d_value, d_value, bias=False) + self.wo = Linear(d_value, d_key, bias=False) + self.n_heads = n_heads + + def forward(self, query, key, value): + query, key, value = self.wq(query), self.wk(key), self.wv(value) + query, key, value = ( + x.chunk(self.n_heads, -1) for x in (query, key, value)) + return self.wo(torch.cat([self.attention(q, k, v) + for q, k, v in zip(query, key, value)], -1)) + +class FeedForward(nn.Module): + + def __init__(self, d_model, d_hidden): + super().__init__() + self.linear1 = Linear(d_model, d_hidden) + self.linear2 = Linear(d_hidden, d_model) + + def forward(self, x): + return self.linear2(F.relu(self.linear1(x))) + +class EncoderLayer(nn.Module): + + def __init__(self, d_model, d_hidden, n_heads, drop_ratio): + super().__init__() + self.selfattn = ResidualBlock( + MultiHead(d_model, d_model, n_heads, drop_ratio), + d_model, drop_ratio) + self.feedforward = ResidualBlock(FeedForward(d_model, d_hidden), + d_model, drop_ratio) + + def forward(self, x): + return self.feedforward(self.selfattn(x, x, x)) + +class Encoder(nn.Module): + + def __init__(self, d_model, d_hidden, n_vocab, n_layers, n_heads, + drop_ratio): + super().__init__() + # self.linear = nn.Linear(d_model*2, d_model) + self.layers = nn.ModuleList( + [EncoderLayer(d_model, d_hidden, n_heads, drop_ratio) + for i in range(n_layers)]) + self.dropout = nn.Dropout(drop_ratio) + + def forward(self, x, mask=None): + # x = self.linear(x) + x = x+positional_encodings_like(x) + x = self.dropout(x) + if mask is not None: + x = x*mask + encoding = [] + for layer in self.layers: + x = layer(x) + if mask is not None: + x = x*mask + encoding.append(x) + return encoding + +class Transformer(nn.Module): + + def __init__(self, d_model, n_vocab_src, vocab_trg, d_hidden=2048, + n_layers=6, n_heads=8, drop_ratio=0.1): + super().__init__() + self.encoder = Encoder(d_model, d_hidden, n_vocab_src, n_layers, + n_heads, drop_ratio) + + def denum(self, data): + return ' '.join(self.decoder.vocab.itos[i] for i in data).replace( + ' ', '#').replace(' ', '') + + def forward(self, x): + encoding = self.encoder(x) + + return encoding[-1], encoding + From 7d9c2e2a8fbd1996079cf408a1fef686a13a38d9 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sat, 31 Aug 2019 14:52:42 -0400 Subject: [PATCH 21/55] Big change in eval, plus some comments --- eval.py | 8 +++++--- metrics.py | 43 ++++++++++++++++++++++++++++++++----------- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/eval.py b/eval.py index 2d5ed2e..4ea504b 100644 --- a/eval.py +++ b/eval.py @@ -90,10 +90,12 @@ def eval(**args): with torch.no_grad(): for step, data in enumerate(eval_loader): - x_input = data['data'].to(device) + x_input = data['data'] + for i, item in enumerate(x_input): + if isinstance(item, torch.Tensor): + x_input[i] = item.to(device) annotations = data['annots'] - - outputs = model(x_input) + outputs = 
model(*x_input) acc = acc_metric.get_accuracy(outputs, annotations) diff --git a/metrics.py b/metrics.py index 0b6b6bd..1f4daf9 100644 --- a/metrics.py +++ b/metrics.py @@ -161,8 +161,8 @@ def __init__(self, threshold=0.5, num_points=101, *args, **kwargs): """ Compute Average Precision (AP) Args: - threshold (scalar): iou threshold - num_points (scalar): number of points to average for the interpolated AP calculation + threshold (float): iou threshold + num_points (int): number of points to average for the interpolated AP calculation Return: None @@ -188,10 +188,10 @@ def compute_class_ap(self, tp, fp, npos): Args: tp (Tensor, shape [N*D]): cumulative sum of true positive detections fp (Tensor, shape [N*D]): cumulative sum of false positive detections - npos (Tensor, scalar): actual positives (from ground truth) + npos (Tensor, int): actual positives (from ground truth) Return: - ap (Tensor, scalar): average precision calculation + ap (Tensor, float): average precision calculation """ #Values for precision-recall curve @@ -220,7 +220,7 @@ def get_AP(self, predictions, targets): D_: ground truth detections Return: - avg_ap (Tensor, scalar): mean ap across all classes + avg_ap (Tensor, float): mean ap across all classes """ N,C,D,_ = predictions.shape @@ -331,7 +331,7 @@ def __init__(self, threshold=torch.linspace(0.5,0.95,10), num_points=101, *args, Args: threshold (Tensor, shape[10]): Calculate AP at each of these threshold values - num_points (scalar): number of points to average for the interpolated AP calculation + num_points (float): number of points to average for the interpolated AP calculation """ self.threshold = threshold @@ -413,7 +413,7 @@ def __init__(self, threshold=0.5, det=None, *args, **kwargs): Compute Average Recall (AR) Args: - threshold: (scalar) + threshold: (float) det: max number of detections per image (optional) """ @@ -459,11 +459,11 @@ def __init__(self, threshold=0.5, num_points=11, *args, **kwargs): """ Compute Average Precision (AP) Args: - threshold (scalar): iou threshold - num_points (scalar): number of points to average for the interpolated AP calculation + threshold (float): iou threshold + num_points (int): number of points to average for the interpolated AP calculation final_shape (list) : [height, width] of input given to CNN result_dir (String): save detections to this location - ndata (scalar): total number of datapoints in dataset + ndata (int): total number of datapoints in dataset Return: None @@ -517,7 +517,14 @@ def get_accuracy(self, detections, data): class Box_Accuracy(): """ - Box accuracy computation + Box accuracy computation for YC2-BB model. 
+ Adapted from: https://github.com/MichiganCOG/Video-Grounding-from-Text/blob/master/tools/test_util.py + + Args: + accu_thres: (float) iou threshold + fps: (int) frames per second video annotations were sampled at + load_type: (String) data split, only validation has publicly available annotations + ndata (int): total number of datapoints in dataset """ def __init__(self, *args, **kwargs): @@ -533,6 +540,20 @@ def __init__(self, *args, **kwargs): self.count = 0 def get_accuracy(self, predictions, data): + """ + Args: + predictions: (Tensor, shape [N,W,T,D]), attention weight output from model + data: (dictionary) + - rpn_original (Tensor, shape [N,T,D,4]) + - box (Tensor, shape [N,T,D,5]), [cls_label, ytl, xtl, ybr, xbr] (note order in coordinates is different) + - box_label (Tensor, shape [N,W]) + - vis_name (List, shape [N]), unique segment identifier + - class_labels_dict (dict, length 67) class index to class label mapping + + W: unique word in segment (from YC2BB class dictionary) + Return: + Box accuracy score + """ attn_weights = predictions N = attn_weights.shape[0] From e71ed17283e1d28240c37dbb95dff4582b223447 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Wed, 4 Sep 2019 14:08:00 -0400 Subject: [PATCH 22/55] Updated ApplyToPIL and added TranslateClip tests --- datasets/preprocessing_transforms.py | 52 +++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 23c5196..6ec22b2 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -594,6 +594,7 @@ class RandomTranslateClip(PreprocTransform): """ Random horizontal and/or vertical shift on frames in a clip Shift will be bounded by object bounding box (if given). 
Meaning, object will always be in view + Input numpy array must be of type np.uint8 Args: - translate (Tuple) @@ -606,7 +607,7 @@ def __init__(self, translate, **kwargs): self.max_x, self.max_y = translate assert(self.max_x >= 0.0 and self.max_y >= 0.0) - assert(self.max_x < 1.0 and self.max_y < 1.0) #Cannot shift pass image bounds + assert(self.max_x < 1.0 and self.max_y < 1.0) #Cannot shift past image bounds def _shift_frame(self, bbox, frame, tx, ty): M = np.array([[1, 0, tx],[0, 1, ty]], dtype=np.float) # 2 x 3 transformation matrix @@ -638,7 +639,7 @@ def __call__(self, clip, bbox=[]): out_bbox = [] for frame, box in zip(clip,bbox): - img_h, img_w, _ = frame.shape + img_h, img_w = frame.shape[:2] tx = int(img_w * frac_x) ty = int(img_h * frac_y) @@ -664,11 +665,12 @@ def __call__(self, clip, bbox=[]): return out_clip, out_bbox else: for frame in clip: - img_h, img_w, _ = frame.shape + img_h, img_w = frame.shape[:2] tx = int(img_w * frac_x) ty = int(img_h * frac_y) out_clip.append(self._shift_frame(None, frame, tx, ty)) + return out_clip class SubtractMeanClip(PreprocTransform): def __init__(self, **kwargs): @@ -735,11 +737,31 @@ def __init__(self, **kwargs): self.transform = kwargs['transform'](**self.class_kwargs) def __call__(self, clip, bbox=[]): + input_pil = True + output_clip = [] + if not isinstance(clip[0], Image.Image): clip = self._to_pil(clip) - output_clip = [] - for frame in clip: - output_clip.append(np.array(self.transform(frame))) #Apply transform and convert back to Numpy + clip = [frame.convert('RGB') for frame in clip] + input_pil = False + + if input_pil: + for frame in clip: + transformed_frame = self.transform(frame) + if isinstance(transformed_frame, tuple) or isinstance(transformed_frame, list): + for tf in transformed_frame: + output_clip.append(tf) + else: + output_clip.append(self.transform(frame)) #Apply transform and convert back to Numpy + + else: + for frame in clip: + transformed_frame = self.transform(frame) + if isinstance(transformed_frame, tuple) or isinstance(transformed_frame, list): + for tf in transformed_frame: + output_clip.append(np.array(tf)) + else: + output_clip.append(np.array(self.transform(frame))) #Apply transform and convert back to Numpy if bbox!=[]: return output_clip, bbox @@ -820,6 +842,7 @@ def __init__(self): self.rand_flip_h = RandomFlipClip(direction='h', p=1.0) self.rand_flip_v = RandomFlipClip(direction='v', p=1.0) self.rand_rot = RandomRotateClip(angles=[90]) + self.rand_trans = RandomTranslateClip(translate=(0.5,0.5)) self.sub_mean = SubtractMeanClip(clip_mean=np.zeros(1)) self.applypil = ApplyToPIL(transform=torchvision.transforms.ColorJitter, class_kwargs=dict(brightness=1)) self.applypil2 = ApplyToPIL(transform=torchvision.transforms.FiveCrop, class_kwargs=dict(size=(64,64))) @@ -916,6 +939,14 @@ def rand_rot_test(self): out_bbox = self.rand_rot([inp2], np.array([bbox]))[1][0].tolist() assert (False not in np.isclose(out, exp_out)) and (False not in np.isclose(exp_bbox, out_bbox)) + + def rand_trans_test(self): + x = np.arange(112*112).reshape(112,112).astype(np.uint8) + out = self.rand_trans([x]) + out2 = self.rand_trans([x], bbox=[np.array([[32,32,96,96]])]) + + assert (out2[1][0].min() >= 0) and (out[0].shape==(112,112)) and (out2[0][0].shape==(112,112)) + def rand_rot_vis(self): import matplotlib.pyplot as plt self.rand_rot._update_angles([20]) @@ -942,11 +973,13 @@ def rand_rot_vis(self): def applypil_test(self): inp = np.arange(112*112).reshape(112,112) + np_inp = [inp, inp] inp = self.applypil._to_pil([inp, inp]) 
inp = [inp[0].convert('RGB'), inp[1].convert('RGB')] - out1 = self.applypil(inp) - out = self.applypil2(out1) - assert (len(out)==2) and (len(out[0])==5) and (out[0][0].size==(64,64)) and (isinstance(out[0][0], Image.Image)) + out = self.applypil(inp) + out2 = self.applypil2(out) + out3 = self.applypil(np_inp) + assert (len(out2)==2*5) and (out2[0].size==(64,64)) and (isinstance(out2[0], Image.Image)) and (isinstance(out3[0], np.ndarray)) def applytensor_test(self): inp = np.arange(112*112*3).reshape(3,112,112).astype('float32') @@ -996,6 +1029,7 @@ def run_tests(self): self.rand_crop_test() self.rand_flip_test() self.rand_rot_test() + self.rand_trans_test() self.applypil_test() self.applytensor_test() self.applycv_test() From b3e943538328812b807328268ca651ff847130d9 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 5 Sep 2019 10:53:25 -0400 Subject: [PATCH 23/55] Add scaling on clips - need to select anchor for scaling --- datasets/preprocessing_transforms.py | 95 +++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 23c5196..b85557c 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -592,7 +592,7 @@ def __call__(self, clip, bbox=[]): class RandomTranslateClip(PreprocTransform): """ - Random horizontal and/or vertical shift on frames in a clip + Random horizontal and/or vertical shift on frames in a clip. All frames receive same shifting Shift will be bounded by object bounding box (if given). Meaning, object will always be in view Args: @@ -669,6 +669,99 @@ def __call__(self, clip, bbox=[]): ty = int(img_h * frac_y) out_clip.append(self._shift_frame(None, frame, tx, ty)) + return out_clip + +class RandomScaleClip(PreprocTransform): + """ + Random scaling on all frames in a clip. All frames receive same scaling + Shift will be bounded by object bounding box (if given). 
Meaning, object will always be in view + + Args: + - scale (Tuple) + - min_scale (float): minimum scaling on frame + - max_scale (float): maximum scaling on frame + """ + def __init__(self, scale, **kwargs): + super(RandomScaleClip, self).__init__(**kwargs) + + self.min_scale, self.max_scale = scale + + assert(self.min_scale <= self.max_scale) + + def _scale_frame(self, bbox, frame, sx, sy): + M = np.array([[sx, 0, 1],[0, sy, 1]], dtype=np.float) # 2 x 3 transformation matrix + out_frame = cv2.warpAffine(frame, M, (frame.shape[1], frame.shape[0])) + + if bbox is not None: + bbox_h = np.reshape(bbox, (-1,2)) #x-y coords + bbox_h = np.concatenate((bbox_h, np.ones((bbox_h.shape[0],1))), axis=1).transpose() #homography coords + + out_box = M @ bbox_h + + if bbox.shape[-1] == 2: #Operate on point coordinates + out_box = np.reshape(out_box.transpose(), (bbox.shape[0], bbox.shape[1],2)) + else: #Operate on bounding box + out_box = np.reshape(out_box.transpose(), (-1,4)) + + return out_frame, out_box + else: + return out_frame + + def __call__(self, clip, bbox=[]): + out_clip = [] + clip = self._to_numpy(clip) + + sc = np.random.uniform(self.min_scale, self.max_scale) + print('Randomly selected scale: {}'.format(sc)) + + if bbox != []: + out_bbox = [] + + for frame, box in zip(clip,bbox): + img_h, img_w, _ = frame.shape + sx = np.ceil(img_w * sc) + sy = np.ceil(img_h * sc) + + #Bound scaling so all objects remain in scene + if box.shape[-1] == 2: #Operate on point coordinates + mask = box[:,:,0] != -1 + sx = min(img_w, np.max(sc*box[mask,0])) + sy = min(img_h, np.max(sc*box[mask,1])) + + if sx == img_w or sy == img_h: + sc = min(sx/np.max(box[mask,0]), sy/np.max(box[mask,1])) + else: + sc = min(sx/np.min(box[mask,0]), sy/np.min(box[mask,1])) + + out_frame, out_box = self._scale_frame(box, frame, sc, sc) + out_box[~mask] = -1*np.ones(2) + + else: #Operate on bounding box + #bbox is bounding box object + mask = box[:,0] != -1 + sx = min(img_w, np.max(sc*box[mask,2])) + sy = min(img_h, np.max(sc*box[mask,3])) + + if sx == img_w or sy == img_h: + sc = min(sx/np.max(box[mask,2]), sy/np.max(box[mask,3])) + else: + sc = min(sx/np.min(box[mask,2]), sy/np.min(box[mask,3])) + + out_frame, out_box = self._scale_frame(box, frame, sc, sc) + out_box[~mask] = -1*np.ones(4) + + out_clip.append(out_frame) + out_bbox.append(out_box) + + return out_clip, out_bbox + else: + for frame in clip: + img_h, img_w, _ = frame.shape + sx = int(img_w * sc) + sy = int(img_h * sc) + + out_clip.append(self._scale_frame(None, frame, sc, sc)) + return out_clip class SubtractMeanClip(PreprocTransform): def __init__(self, **kwargs): From d5b01037857d04ac545056197d3341ec8d868cc0 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 5 Sep 2019 17:39:14 -0400 Subject: [PATCH 24/55] Anchor scaling to center of image, changed name to RandomZoomClip --- datasets/preprocessing_transforms.py | 71 ++++++++++++++++------------ 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index e76e76f..44d8542 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -673,14 +673,14 @@ def __call__(self, clip, bbox=[]): return out_clip -class RandomScaleClip(PreprocTransform): +class RandomZoomClip(PreprocTransform): """ - Random scaling on all frames in a clip. All frames receive same scaling - Shift will be bounded by object bounding box (if given). Meaning, object will always be in view + Random zoom on all frames in a clip. 
All frames receive same scaling + Scale will be bounded by object bounding box (if given). Meaning, object will always be in view Args: - scale (Tuple) - - min_scale (float): minimum scaling on frame + - min_scale (float): minimum scaling on frame - max_scale (float): maximum scaling on frame """ def __init__(self, scale, **kwargs): @@ -688,10 +688,10 @@ def __init__(self, scale, **kwargs): self.min_scale, self.max_scale = scale - assert(self.min_scale <= self.max_scale) + assert(self.min_scale > 0 and self.min_scale <= self.max_scale) - def _scale_frame(self, bbox, frame, sx, sy): - M = np.array([[sx, 0, 1],[0, sy, 1]], dtype=np.float) # 2 x 3 transformation matrix + def _scale_frame(self, bbox, frame, sc): + M = cv2.getRotationMatrix2D((frame.shape[1]/2, frame.shape[0]/2), 0, sc) # 2 x 3 rotation matrix out_frame = cv2.warpAffine(frame, M, (frame.shape[1], frame.shape[0])) if bbox is not None: @@ -714,42 +714,51 @@ def __call__(self, clip, bbox=[]): clip = self._to_numpy(clip) sc = np.random.uniform(self.min_scale, self.max_scale) - print('Randomly selected scale: {}'.format(sc)) if bbox != []: out_bbox = [] for frame, box in zip(clip,bbox): - img_h, img_w, _ = frame.shape - sx = np.ceil(img_w * sc) - sy = np.ceil(img_h * sc) + img_h, img_w = frame.shape[:2] + cx, cy = (img_w/2, img_h/2) #Bound scaling so all objects remain in scene if box.shape[-1] == 2: #Operate on point coordinates mask = box[:,:,0] != -1 - sx = min(img_w, np.max(sc*box[mask,0])) - sy = min(img_h, np.max(sc*box[mask,1])) - - if sx == img_w or sy == img_h: - sc = min(sx/np.max(box[mask,0]), sy/np.max(box[mask,1])) - else: - sc = min(sx/np.min(box[mask,0]), sy/np.min(box[mask,1])) - out_frame, out_box = self._scale_frame(box, frame, sc, sc) + max_x = min(img_w, np.max(cx + sc * (box[mask,0] - cx))) + min_x = max(0, np.min(cx + sc * (box[mask,0] - cx))) + sx = (max_x - cx) / np.max(box[mask,0] - cx) + if min_x == 0: + sx = min(sx, (min_x - cx) / np.min(box[mask,0] - cx)) + + max_y = min(img_h, np.max(cy + sc * (box[mask,1] - cy))) + min_y = max(0, np.min(cy + sc * (box[mask,1] - cy))) + sy = (max_y - cy) / np.max(box[mask,1] - cy) + if min_y == 0: + sy = min(sy, (min_y - cy) / np.min(box[mask,1] - cy)) + + sc = min(sx, sy) + out_frame, out_box = self._scale_frame(box, frame, sc) out_box[~mask] = -1*np.ones(2) else: #Operate on bounding box - #bbox is bounding box object mask = box[:,0] != -1 - sx = min(img_w, np.max(sc*box[mask,2])) - sy = min(img_h, np.max(sc*box[mask,3])) - - if sx == img_w or sy == img_h: - sc = min(sx/np.max(box[mask,2]), sy/np.max(box[mask,3])) - else: - sc = min(sx/np.min(box[mask,2]), sy/np.min(box[mask,3])) - - out_frame, out_box = self._scale_frame(box, frame, sc, sc) + + max_x = min(img_w, np.max(cx + sc * (box[mask,2] - cx))) + min_x = max(0, np.min(cx + sc * (box[mask,0] - cx))) + sx = (max_x - cx) / np.max(box[mask,2] - cx) + if min_x == 0: + sx = min(sx, (min_x - cx) / np.min(box[mask,0] - cx)) + + max_y = min(img_h, np.max(cy + sc * (box[mask,3] - cy))) + min_y = max(0, np.min(cy + sc * (box[mask,1] - cy))) + sy = (max_y - cy) / np.max(box[mask,3] - cy) + if min_y == 0: + sy = min(sy, (min_y - cy) / np.min(box[mask,1] - cy)) + + sc = min(sx, sy) + out_frame, out_box = self._scale_frame(box, frame, sc) out_box[~mask] = -1*np.ones(4) out_clip.append(out_frame) @@ -758,11 +767,11 @@ def __call__(self, clip, bbox=[]): return out_clip, out_bbox else: for frame in clip: - img_h, img_w, _ = frame.shape + img_h, img_w = frame.shape[:2] sx = int(img_w * sc) sy = int(img_h * sc) - 
out_clip.append(self._scale_frame(None, frame, sc, sc)) + out_clip.append(self._scale_frame(None, frame, sc)) return out_clip class SubtractMeanClip(PreprocTransform): From c24d3eeb26d42eb9c998fcdf303768acab9450a3 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 11 Sep 2019 09:51:04 -0400 Subject: [PATCH 25/55] small fix --- datasets/preprocessing_transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 44d8542..b780073 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -684,7 +684,7 @@ class RandomZoomClip(PreprocTransform): - max_scale (float): maximum scaling on frame """ def __init__(self, scale, **kwargs): - super(RandomScaleClip, self).__init__(**kwargs) + super(RandomZoomClip, self).__init__(**kwargs) self.min_scale, self.max_scale = scale From 3dccbfb91c0d7deb788165f479d6bb72dcf356a9 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sat, 14 Sep 2019 16:27:41 -0400 Subject: [PATCH 26/55] Add default arg for resume, save predictions in result folder, remove features arg from eval.py --- eval.py | 6 +++--- models/c3d/c3d.py | 6 ++++-- models/c3d/config_test.yaml | 2 ++ models/c3d/config_train.yaml | 2 ++ parse_args.py | 3 ++- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/eval.py b/eval.py index 41ab417..8b1a57f 100644 --- a/eval.py +++ b/eval.py @@ -99,7 +99,7 @@ def eval(**args): x_input = data['data'].to(device) annotations = data['annots'] - outputs = model(x_input, features=True) + outputs = model(x_input) if ret_data is None: ret_data = outputs.cpu().numpy() @@ -112,7 +112,7 @@ def eval(**args): # END IF - #acc = acc_metric.get_accuracy(outputs, annotations) + acc = acc_metric.get_accuracy(outputs, annotations) if step % 100 == 0: print('Step: {}/{} | {} acc: {:.4f}'.format(step, len(eval_loader), args['load_type'], acc)) @@ -124,7 +124,7 @@ def eval(**args): ret_dict['data'] = ret_data ret_dict['labels'] = ret_labels import scipy.io as sio - sio.savemat(args['load_type']+'_'+args['dataset']+'.mat', ret_dict) + sio.savemat(os.path.join(result_dir,args['load_type']+'_'+args['dataset']+'.mat'), ret_dict) writer.add_scalar(args['dataset']+'/'+args['model']+'/'+args['load_type']+'_accuracy', 100.*acc) # Close Tensorboard Element diff --git a/models/c3d/c3d.py b/models/c3d/c3d.py index 387ffd7..dfd78fc 100644 --- a/models/c3d/c3d.py +++ b/models/c3d/c3d.py @@ -55,7 +55,9 @@ def __init__(self, **kwargs): if isinstance(kwargs['pretrained'], int) and kwargs['pretrained']: self.__load_pretrained_weights() - def forward(self, x, labels=False, features=False): + self.features = kwargs['model_features'] + + def forward(self, x, labels=False): x = self.relu(self.conv1(x)) x = self.pool1(x) @@ -78,7 +80,7 @@ def forward(self, x, labels=False, features=False): x = self.relu(self.fc6(x)) - if features: + if self.features: return x x = self.dropout(x) diff --git a/models/c3d/config_test.yaml b/models/c3d/config_test.yaml index cf69c79..1d6d39d 100644 --- a/models/c3d/config_test.yaml +++ b/models/c3d/config_test.yaml @@ -36,3 +36,5 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay + +model_features: 0 # 1 - return model features (before prediction), 0 - return model prediction output diff --git a/models/c3d/config_train.yaml b/models/c3d/config_train.yaml index 969982d..cf6f6e6 100644 --- 
a/models/c3d/config_train.yaml +++ b/models/c3d/config_train.yaml @@ -36,3 +36,5 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay + +model_features: 0 # 1 - return model features (before prediction), 0 - return model prediction output diff --git a/parse_args.py b/parse_args.py index 76b7ad4..7ac4994 100644 --- a/parse_args.py +++ b/parse_args.py @@ -77,7 +77,8 @@ def __init__(self): crop_type = None, num_clips = 1, debug = 0, - seed = 0) + seed = 0, + resume = 0) From 830850107f63356abc1988ce31c13d8c8d5e5ae6 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 26 Sep 2019 16:57:39 -0400 Subject: [PATCH 27/55] Fix MSCOCO loading and gen_json script now works with Python3 --- datasets/MSCOCO.py | 4 ++-- datasets/VOC2007.py | 1 - datasets/scripts/gen_json_mscoco.py | 19 ++++++++++--------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/datasets/MSCOCO.py b/datasets/MSCOCO.py index 5d9868f..5805a47 100644 --- a/datasets/MSCOCO.py +++ b/datasets/MSCOCO.py @@ -1,6 +1,6 @@ import torch from .abstract_datasets import DetectionDataset -from PIL import Image +import cv2 import os import numpy as np import datasets.preprocessing_transforms as pt @@ -62,7 +62,7 @@ def __getitem__(self, idx): iscrowds[frame_ind, trackid] = iscrowd - input_data.append(Image.open(os.path.join(base_path, frame_path))) + input_data.append(cv2.imread(os.path.join(base_path, frame_path))[...,::-1]) vid_data, bbox_data = self.transforms(input_data, bbox_data) diff --git a/datasets/VOC2007.py b/datasets/VOC2007.py index d4764d8..b78c3c8 100644 --- a/datasets/VOC2007.py +++ b/datasets/VOC2007.py @@ -1,6 +1,5 @@ import torch from .abstract_datasets import DetectionDataset -from PIL import Image import cv2 import os import numpy as np diff --git a/datasets/scripts/gen_json_mscoco.py b/datasets/scripts/gen_json_mscoco.py index 1a6409a..c7db705 100755 --- a/datasets/scripts/gen_json_mscoco.py +++ b/datasets/scripts/gen_json_mscoco.py @@ -2,22 +2,25 @@ import os -def save_json(load_type): +year = '2014' +def save_json(load_type): + # Define path to mscoco images data base_img_path = '/path/to/mscoco/images/' ###### REPLACE with path to dataset base_annot_path = '/path/to/mscoco/annotations/'###### REPLACE with path to dataset - f = open(os.path.join(base_annot_path,'instances_'+load_type+'2014.json'),'r') - x = json.load(f) - f.close() + save_location = '/path/to/save/location' ######### REPLACE with save path + + with open(os.path.join(base_annot_path,'instances_'+load_type+year+'.json'),'r') as f: + x = json.load(f) imgids = [[idx['id'], idx['file_name'], idx['width'], idx['height']] for idx in x['images']] dd = {} for idx in imgids: frame_dict = dict(objs=[], img_path=idx[1]) - dd[idx[0]] = dict(frames=[frame_dict], base_path=os.path.join(base_img_path,load_type+'2014'), frame_size=[idx[2],idx[3]]) + dd[idx[0]] = dict(frames=[frame_dict], base_path=os.path.join(base_img_path,load_type+year), frame_size=[idx[2],idx[3]]) print('finished imgids') @@ -36,10 +39,8 @@ def save_json(load_type): if count%1000==0: print(count) - writef = open('mscoco_'+load_type+'.json', 'w') - json.dump(dd.values(), writef) - writef.close() - + with open(os.path.join(save_location,load_type+'.json'), 'w') as f: + json.dump(list(dd.values()), f) save_json('train') From 53421ae01e9a9ffc22c952f1093ecfdc87503ad9 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 26 Sep 2019 23:11:27 -0400 Subject: [PATCH 28/55] 
Add preprocessing test for zoom clip. Add/edit visualizations --- datasets/preprocessing_transforms.py | 120 +++++++++++++++++++++------ 1 file changed, 93 insertions(+), 27 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index b780073..63e1820 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -678,6 +678,10 @@ class RandomZoomClip(PreprocTransform): Random zoom on all frames in a clip. All frames receive same scaling Scale will be bounded by object bounding box (if given). Meaning, object will always be in view + >1: Zoom in + <1: Zoom out + =1: Same size + Args: - scale (Tuple) - min_scale (float): minimum scaling on frame @@ -945,6 +949,7 @@ def __init__(self): self.rand_flip_v = RandomFlipClip(direction='v', p=1.0) self.rand_rot = RandomRotateClip(angles=[90]) self.rand_trans = RandomTranslateClip(translate=(0.5,0.5)) + self.rand_zoom = RandomZoomClip(scale=(1.25,1.25)) self.sub_mean = SubtractMeanClip(clip_mean=np.zeros(1)) self.applypil = ApplyToPIL(transform=torchvision.transforms.ColorJitter, class_kwargs=dict(brightness=1)) self.applypil2 = ApplyToPIL(transform=torchvision.transforms.FiveCrop, class_kwargs=dict(size=(64,64))) @@ -1020,12 +1025,14 @@ def rand_flip_vis(self): x[:, 50] = 5000 x[10, :] = 5000 x[50, :] = 10000 - plt.imshow(x); plt.show() + + plt.subplot(1,3,1); plt.imshow(x); plt.title('Original image') h = self.rand_flip_h([x]) - plt.imshow(h[0]); plt.show() + plt.subplot(1,3,2); plt.imshow(h[0]); plt.title('Flip Horizontal') v = self.rand_flip_v([x]) - plt.imshow(v[0]); plt.show() - + plt.subplot(1,3,3); plt.imshow(v[0]); plt.title('Flip Vertical') + + plt.show() def rand_rot_test(self): inp = np.array([[[.1,.2,.3],[.4,.5,.6],[.7,.8,.9]]]).astype(float) @@ -1051,27 +1058,86 @@ def rand_trans_test(self): def rand_rot_vis(self): import matplotlib.pyplot as plt - self.rand_rot._update_angles([20]) + import matplotlib.patches as patches + self.rand_rot._update_angles([45]) x = np.arange(112*112).reshape(112,112) - #x = np.arange(6*6).reshape(6,6) - #bbox = [51,51,61,61] + bbox = [30,40,50,100] - bbox = [30,40,50,110] - #bbox = [2,2,4,4] - plt1 = x[:] - plt1[bbox[1]:bbox[3], bbox[0]] = 0 - plt1[bbox[1]:bbox[3], bbox[2]-1] = 0 - plt1[bbox[1], bbox[0]:bbox[2]] = 0 - plt1[bbox[3]-1, bbox[0]:bbox[2]] = 0 - plt.imshow(plt1); plt.show() + pts = np.array([[30,40],[30,80]]) + fig = plt.figure() + ax1 = fig.add_subplot(121) + x[bbox[1]:bbox[3], bbox[0]] = 0 + x[bbox[1]:bbox[3], bbox[2]-1] = 0 + x[bbox[1], bbox[0]:bbox[2]] = 0 + x[bbox[3]-1, bbox[0]:bbox[2]] = 0 + + ax1.imshow(x); ax1.set_title('Original image') + rect = patches.Rectangle((bbox[0],bbox[1]), bbox[2]-bbox[0],\ + bbox[3]-bbox[1], linewidth=1, edgecolor='k', facecolor='none') + #ax1.add_patch(rect) + ax1.scatter(pts[:,0], pts[:,1], c='r') + out2 = self.rand_rot([x], np.array([[bbox]])) - plt2 = out2[0][0] - bbox = out2[1][0][0].astype(int) - plt2[bbox[1]:bbox[3], bbox[0]] = 0 - plt2[bbox[1]:bbox[3], bbox[2]] = 0 - plt2[bbox[1], bbox[0]:bbox[2]] = 0 - plt2[bbox[3], bbox[0]:bbox[2]] = 0 - plt.imshow(plt2); plt.show() + x_rot = out2[0][0] + bbox_rot = out2[1][0,0] + + out2 = self.rand_rot([x], np.array([[pts]])) + pts_rot = out2[1][0,0] + + ax2 = fig.add_subplot(122) + rect = patches.Rectangle((bbox_rot[0],bbox_rot[1]), bbox_rot[2]-bbox_rot[0],\ + bbox_rot[3]-bbox_rot[1], linewidth=1, edgecolor='k', facecolor='none') + ax2.add_patch(rect) + ax2.imshow(x_rot); ax2.set_title('Rotation') + 
ax2.scatter(pts_rot[:,0],pts_rot[:,1], c='r') + plt.show() + + def rand_zoom_test(self): + inp = np.array([[[.1,.2,.3],[.4,.5,.6],[.7,.8,.9]]]).astype(float) + exp_out = np.array([[0.225 , 0.303125, 0.384375], + [0.459375, 0.5375 , 0.61875 ], + [0.703125, 0.78125 , 0.8625 ]]).astype(float) + out = self.rand_zoom(inp) + + inp2 = np.arange(6*6, dtype=np.uint8).reshape(6,6) + bbox = [[2,2,4,4]] + exp_bbox = [1.75,1.75,4.25,4.25] + _,out_bbox = self.rand_zoom([inp2], np.array([bbox])) + + assert (False not in np.isclose(out, exp_out)) and (False not in np.isclose(exp_bbox, out_bbox)) + + def rand_zoom_vis(self): + import matplotlib.pyplot as plt + import matplotlib.patches as patches + x = np.arange(112*112, dtype=np.uint8).reshape(112,112) + + bbox = [30,40,50,100] + pts = np.array([[30,40],[30,80]]) + fig = plt.figure() + ax1 = fig.add_subplot(121) + + x[bbox[1]:bbox[3], bbox[0]] = 0 + x[bbox[1]:bbox[3], bbox[2]-1] = 0 + x[bbox[1], bbox[0]:bbox[2]] = 0 + x[bbox[3]-1, bbox[0]:bbox[2]] = 0 + ax1.imshow(x); ax1.set_title('Original image') + ax1.scatter(pts[:,0], pts[:,1], c='r') + + out = self.rand_zoom([x], np.array([[pts]])) + pts_zoom = out[1][0][0] + + out = self.rand_zoom([x], np.array([[bbox]])) + x_zoom = out[0][0] + bbox_zoom = out[1][0][0] + + ax2 = fig.add_subplot(122) + rect = patches.Rectangle((bbox_zoom[0],bbox_zoom[1]), bbox_zoom[2]-bbox_zoom[0],\ + bbox_zoom[3]-bbox_zoom[1], linewidth=1, edgecolor='k', facecolor='none') + ax2.add_patch(rect) + ax2.imshow(x_zoom); ax2.set_title('Zoomed image') + ax2.scatter(pts_zoom[:,0],pts_zoom[:,1], c='r') + + plt.show() def applypil_test(self): inp = np.arange(112*112).reshape(112,112) @@ -1132,6 +1198,7 @@ def run_tests(self): self.rand_flip_test() self.rand_rot_test() self.rand_trans_test() + self.rand_zoom_test() self.applypil_test() self.applytensor_test() self.applycv_test() @@ -1139,12 +1206,11 @@ def run_tests(self): self.to_pil_test() self.to_numpy_test() print("Tests passed") - #self.rand_flip_vis() - #self.rand_rot_vis() - - - + self.rand_flip_vis() + self.rand_rot_vis() + self.rand_zoom_vis() + if __name__=='__main__': test = TestPreproc() test.run_tests() From 1fae0424f36531e6c477fadc8305f981afa44761 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 4 Oct 2019 10:13:32 -0400 Subject: [PATCH 29/55] Add numclips>0, clip stride, and clip_length=-1 options to extract clips. Also update all current datasets to allow for variable length video loading. 
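
As a rough guide to how these options interact, here is a simplified sketch, assuming clip_stride counts the frames between successive clip start indices; the function name sample_clips is illustrative and this is not the exact _extractClips code added below (clip_length=-1, which returns the whole video, and random_offset are handled separately in the patch and omitted here):

import numpy as np

def sample_clips(video, num_clips, clip_length, clip_stride):
    # Frames needed for num_clips clips of clip_length frames, clip_stride apart
    required = (num_clips - 1) * clip_stride + clip_length
    frame_idx = np.arange(len(video))
    if len(video) < required:
        # Loop the frame indices until one full set of clips fits
        reps = int(np.ceil(required / float(len(video))))
        frame_idx = np.tile(frame_idx, reps)
    starts = np.arange(0, len(frame_idx), clip_stride)[:num_clips]
    return [[video[i] for i in frame_idx[s:s + clip_length]] for s in starts]

For example, sample_clips(list(range(20)), num_clips=3, clip_length=8, clip_stride=4) yields clips starting at frames 0, 4, and 8, while a 10-frame video with the same settings loops its frame indices so that three full 8-frame clips can still be formed.
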
--- config_default_example.yaml | 4 +-- datasets/HMDB51.py | 5 +-- datasets/ImageNetVID.py | 10 +++--- datasets/KTH.py | 6 ++-- datasets/MSCOCO.py | 9 ++--- datasets/Manual_Hands.py | 12 ++++--- datasets/UCF101.py | 5 +-- datasets/VOC2007.py | 9 ++--- datasets/abstract_datasets.py | 63 ++++++++++++++++++++++------------- models/c3d/config_test.yaml | 2 +- models/c3d/config_train.yaml | 2 +- models/ssd/config_test.yaml | 2 +- parse_args.py | 8 +++++ 13 files changed, 85 insertions(+), 52 deletions(-) diff --git a/config_default_example.yaml b/config_default_example.yaml index 30b6ee9..a523504 100644 --- a/config_default_example.yaml +++ b/config_default_example.yaml @@ -1,15 +1,13 @@ # Preprocessing clip_length: 16 # Number of frames within a clip clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) -clip_stride: 0 # Frame offset between successive frames +clip_stride: 1 # Frame offset between successive clips, must be >= 1 crop_shape: [112,112] # (Height, Width) of frame crop_type: Random # Type of cropping operation (Random, Central and None) final_shape: [112,112] # (Height, Width) of input to be given to CNN num_clips: -1 # Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips) random_offset: 0 # Boolean switch to generate a clip length sized clip from a video resize_shape: [128,171] # (Height, Width) to resize original data -sample_duration: 16 # Temporal size of video to be provided as input to the model -sample_size: 112 # Height of frame to be provided as input to the model subtract_mean: '' # Subtract mean (R,G,B) from all frames during preprocessing # Experiment Setup diff --git a/datasets/HMDB51.py b/datasets/HMDB51.py index 3d93bc3..6eb83a9 100644 --- a/datasets/HMDB51.py +++ b/datasets/HMDB51.py @@ -40,8 +40,9 @@ def __getitem__(self, idx): base_path = vid_info['base_path'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - labels = np.zeros((self.clip_length))-1 + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + labels = np.zeros((vid_length))-1 input_data = [] for frame_ind in range(len(vid_info['frames'])): diff --git a/datasets/ImageNetVID.py b/datasets/ImageNetVID.py index 1965c8c..ea180dc 100644 --- a/datasets/ImageNetVID.py +++ b/datasets/ImageNetVID.py @@ -42,10 +42,12 @@ def __getitem__(self, idx): vid_size = vid_info['frame_size'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1 - labels = np.zeros((self.clip_length, self.max_objects))-1 - occlusions = np.zeros((self.clip_length, self.max_objects))-1 + + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((vid_length, self.max_objects, 4))-1 + labels = np.zeros((vid_length, self.max_objects))-1 + occlusions = np.zeros((vid_length, self.max_objects))-1 diff --git a/datasets/KTH.py b/datasets/KTH.py index 95995ef..41c14a0 100644 --- a/datasets/KTH.py +++ b/datasets/KTH.py @@ -40,8 +40,10 @@ def __getitem__(self, idx): base_path = vid_info['base_path'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - labels = np.zeros((self.clip_length))-1 + + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, 
self.final_shape[0], self.final_shape[1], 3))-1 + labels = np.zeros((vid_length))-1 input_data = [] for frame_ind in range(len(vid_info['frames'])): diff --git a/datasets/MSCOCO.py b/datasets/MSCOCO.py index 5d9868f..5b80cbf 100644 --- a/datasets/MSCOCO.py +++ b/datasets/MSCOCO.py @@ -34,10 +34,11 @@ def __getitem__(self, idx): vid_size = vid_info['frame_size'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1 - labels = np.zeros((self.clip_length, self.max_objects))-1 - iscrowds = np.zeros((self.clip_length, self.max_objects))-1 + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((vid_length, self.max_objects, 4))-1 + labels = np.zeros((vid_length, self.max_objects))-1 + iscrowds = np.zeros((vid_length, self.max_objects))-1 diff --git a/datasets/Manual_Hands.py b/datasets/Manual_Hands.py index d5c1a00..21916ef 100644 --- a/datasets/Manual_Hands.py +++ b/datasets/Manual_Hands.py @@ -46,11 +46,13 @@ def __getitem__(self, idx): vid_size = vid_info['frame_size'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1 - hand_pts_data = np.zeros((self.clip_length, self.max_objects, 21, 3))-1 - labels = np.zeros((self.clip_length, self.max_objects))-1 - occlusions = np.zeros((self.clip_length, self.max_objects, 22), dtype=np.int32)-1 #21 keypoints + background = 22 points + + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((vid_length, self.max_objects, 4))-1 + hand_pts_data = np.zeros((vid_length, self.max_objects, 21, 3))-1 + labels = np.zeros((vid_length, self.max_objects))-1 + occlusions = np.zeros((vid_length, self.max_objects, 22), dtype=np.int32)-1 #21 keypoints + background = 22 points for frame_ind in range(len(vid_info['frames'])): frame = vid_info['frames'][frame_ind] diff --git a/datasets/UCF101.py b/datasets/UCF101.py index 28ef78d..40c8b58 100644 --- a/datasets/UCF101.py +++ b/datasets/UCF101.py @@ -41,8 +41,9 @@ def __getitem__(self, idx): base_path = vid_info['base_path'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - labels = np.zeros((self.clip_length))-1 + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + labels = np.zeros((vid_length))-1 input_data = [] for frame_ind in range(len(vid_info['frames'])): diff --git a/datasets/VOC2007.py b/datasets/VOC2007.py index d4764d8..7d90145 100644 --- a/datasets/VOC2007.py +++ b/datasets/VOC2007.py @@ -53,10 +53,11 @@ def __getitem__(self, idx): vid_size = vid_info['frame_size'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1 - labels = np.zeros((self.clip_length, self.max_objects))-1 - diff_labels = np.zeros((self.clip_length, self.max_objects)) #difficult object labels + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((vid_length, self.max_objects, 4))-1 + labels = np.zeros((vid_length, self.max_objects))-1 + diff_labels = np.zeros((vid_length, self.max_objects)) #difficult 
object labels for frame_ind in range(len(vid_info['frames'])): frame = vid_info['frames'][frame_ind] diff --git a/datasets/abstract_datasets.py b/datasets/abstract_datasets.py index 6e7fe2e..c34fc7f 100644 --- a/datasets/abstract_datasets.py +++ b/datasets/abstract_datasets.py @@ -68,17 +68,22 @@ def _extractClips(self, video): self.clip_stride: Number of frames between clips when extracting them from videos self.random_offset: Randomly select a clip_length sized clip from a video """ + if self.clip_offset > 0: + if len(video)-self.clip_offset >= self.clip_length: + video = video[self.clip_offset:] + if self.num_clips < 0: if len(video) >= self.clip_length: + # Uniformly sample one clip from the video final_video = [video[_idx] for _idx in np.linspace(0, len(video)-1, self.clip_length, dtype='int32')] final_video = [final_video] else: # Loop if insufficient elements - indices = np.ceil(self.clip_length/float(len(video))) + indices = np.ceil(self.clip_length/float(len(video))) # Number of times to repeat the video to exceed one clip_length indices = indices.astype('int32') - indices = np.tile(np.arange(0, len(video), 1, dtype='int32'), indices) - indices = indices[np.linspace(0, len(indices)-1, self.clip_length, dtype='int32')] + indices = np.tile(np.arange(0, len(video), 1, dtype='int32'), indices) # Repeat the video indices until it exceeds a clip_length + indices = indices[np.linspace(0, len(indices)-1, self.clip_length, dtype='int32')] # Uniformly sample clip_length frames from the looped video final_video = [video[_idx] for _idx in indices] final_video = [final_video] @@ -87,8 +92,9 @@ def _extractClips(self, video): # END IF elif self.num_clips == 0: + # Divide entire video into the max number of clip_length segments if len(video) >= self.clip_length: - indices = np.arange(start=0, stop=len(video), step=self.clip_length) + indices = np.arange(start=0, stop=len(video)-self.clip_length+1, step=self.clip_stride) final_video = [] for _idx in indices: @@ -110,33 +116,44 @@ def _extractClips(self, video): # END IF else: - if self.random_offset: - if len(video) >= self.clip_length: - indices = np.random.choice(np.arange(len(video) - self.clip_length + 1), 1) - indices = indices.astype('int32') - indices = np.arange(indices, indices + self.clip_length).astype('int32') + # num_clips > 0, select exactly num_clips from a video - final_video = [video[_idx] for _idx in indices] - final_video = [final_video] + if self.clip_length == -1: + # This is a special case where we will return the entire video + # This setting can only be used when the batch size is set to 1 + return [video] - else: - indices = np.ceil(self.clip_length/float(len(video))) - indices = indices.astype('int32') - indices = np.tile(np.arange(0, len(video), 1, dtype='int32'), indices) - index = np.random.choice(np.arange(len(indices) - self.clip_length + 1), 1)[0] - index = index.astype('int32') - indices = indices[index:index + self.clip_length] + required_length = (self.num_clips-1)*(self.clip_stride)+self.clip_length + + + if self.random_offset: + if len(video) >= required_length: + vid_start = np.random.choice(np.arange(len(video) - required_length + 1), 1) + video = video[int(vid_start):] - final_video = [video[_idx] for _idx in indices] - final_video = [final_video] + if len(video) >= required_length: + # Get indices of sequential clips overlapped by a clip_stride number of frames + indices = np.arange(0, len(video), self.clip_stride) - # END IF + # Select only the first num clips + indices = 
indices.astype('int32')[:self.num_clips] + + video = np.array(video) + final_video = [video[np.arange(_idx, _idx+self.clip_length).astype('int32')].tolist() for _idx in indices] else: - final_video = video[:self.clip_length] - final_video = [final_video] + # If the video is too small to get num_clips given the clip_length and clip_stride, loop it until you can + indices = np.ceil(required_length /float(len(video))) + indices = indices.astype('int32') + indices = np.tile(np.arange(0, len(video), 1, dtype='int32'), indices) + + # Starting index of each clip + clip_starts = np.arange(0, len(indices), self.clip_stride).astype('int32')[:self.num_clips] + video = np.array(video) + final_video = [video[indices[_idx:_idx+self.clip_length]].tolist() for _idx in clip_starts] + # END IF # END IF diff --git a/models/c3d/config_test.yaml b/models/c3d/config_test.yaml index 1d6d39d..08f4e1c 100644 --- a/models/c3d/config_test.yaml +++ b/models/c3d/config_test.yaml @@ -1,7 +1,7 @@ # Preprocessing clip_length: 16 # Number of frames within a clip clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) -clip_stride: 0 # Frame offset between successive frames +clip_stride: 1 # Frame offset between successive frames crop_shape: [112,112] # (Height, Width) of frame crop_type: Random # Type of cropping operation (Random, Central and None) final_shape: [112,112] # (Height, Width) of input to be given to CNN diff --git a/models/c3d/config_train.yaml b/models/c3d/config_train.yaml index cf6f6e6..65a0cb2 100644 --- a/models/c3d/config_train.yaml +++ b/models/c3d/config_train.yaml @@ -1,7 +1,7 @@ # Preprocessing clip_length: 16 # Number of frames within a clip clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) -clip_stride: 0 # Frame offset between successive frames +clip_stride: 1 # Frame offset between successive frames crop_shape: [112,112] # (Height, Width) of frame crop_type: Random # Type of cropping operation (Random, Central and None) final_shape: [112,112] # (Height, Width) of input to be given to CNN diff --git a/models/ssd/config_test.yaml b/models/ssd/config_test.yaml index a521b6d..2b630e2 100644 --- a/models/ssd/config_test.yaml +++ b/models/ssd/config_test.yaml @@ -1,7 +1,7 @@ # Preprocessing clip_length: 1 # Number of frames within a clip clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) -clip_stride: 0 # Frame offset between successive frames +clip_stride: 1 # Frame offset between successive frames crop_shape: [112,112] # (Height, Width) of frame crop_type: None # Type of cropping operation (Random, Central and None) final_shape: [300,300] # (Height, Width) of input to be given to CNN diff --git a/parse_args.py b/parse_args.py index 7ac4994..9a2500d 100644 --- a/parse_args.py +++ b/parse_args.py @@ -111,4 +111,12 @@ def get_args(self): if k not in yaml_keys: self.cfg_args[k] = self.defaults[k] + + # Force clip_stride to be >= 1 when extracting clips from a video + # This represents the # of frames between successive clips + if self.cfg_args['clip_stride'] < 1: + self.cfg_args['clip_stride'] = 1 + + + return self.cfg_args From fb7262a2a32e60aa56cae9de27ac28162ebd6e0a Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 4 Oct 2019 17:18:44 -0400 Subject: [PATCH 30/55] Add note of rotation angle --- datasets/preprocessing_transforms.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 63e1820..78d0f4a 100644 --- 
a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -1059,7 +1059,8 @@ def rand_trans_test(self): def rand_rot_vis(self): import matplotlib.pyplot as plt import matplotlib.patches as patches - self.rand_rot._update_angles([45]) + angle = 45 + self.rand_rot._update_angles([angle]) x = np.arange(112*112).reshape(112,112) bbox = [30,40,50,100] @@ -1088,7 +1089,7 @@ def rand_rot_vis(self): rect = patches.Rectangle((bbox_rot[0],bbox_rot[1]), bbox_rot[2]-bbox_rot[0],\ bbox_rot[3]-bbox_rot[1], linewidth=1, edgecolor='k', facecolor='none') ax2.add_patch(rect) - ax2.imshow(x_rot); ax2.set_title('Rotation') + ax2.imshow(x_rot); ax2.set_title('Rotation: {} degrees'.format(angle)) ax2.scatter(pts_rot[:,0],pts_rot[:,1], c='r') plt.show() From 4dd4ea9c3061adbf13b0f31a4a7c4758d5d4afe0 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sat, 5 Oct 2019 20:31:33 -0400 Subject: [PATCH 31/55] most general change for multiple inputs to model forward functions --- eval.py | 12 ++++++++---- metrics.py | 5 ++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/eval.py b/eval.py index 4ea504b..d575799 100644 --- a/eval.py +++ b/eval.py @@ -91,11 +91,15 @@ def eval(**args): with torch.no_grad(): for step, data in enumerate(eval_loader): x_input = data['data'] - for i, item in enumerate(x_input): - if isinstance(item, torch.Tensor): - x_input[i] = item.to(device) annotations = data['annots'] - outputs = model(*x_input) + + if isinstance(x_input, torch.Tensor): + outputs = model(x_input.to(device)) + else: + for i, item in enumerate(x_input): + if isinstance(item, torch.Tensor): + x_input[i] = item.to(device) + outputs = model(*x_input) acc = acc_metric.get_accuracy(outputs, annotations) diff --git a/metrics.py b/metrics.py index 1f4daf9..0c137c4 100644 --- a/metrics.py +++ b/metrics.py @@ -545,11 +545,14 @@ def get_accuracy(self, predictions, data): predictions: (Tensor, shape [N,W,T,D]), attention weight output from model data: (dictionary) - rpn_original (Tensor, shape [N,T,D,4]) - - box (Tensor, shape [N,T,D,5]), [cls_label, ytl, xtl, ybr, xbr] (note order in coordinates is different) + - box (Tensor, shape [N,O,T,5]), [cls_label, ytl, xtl, ybr, xbr] (note order in coordinates is different) - box_label (Tensor, shape [N,W]) - vis_name (List, shape [N]), unique segment identifier - class_labels_dict (dict, length 67) class index to class label mapping + T: number of frames + D: dimension of features + O: number of objects to ground W: unique word in segment (from YC2BB class dictionary) Return: Box accuracy score From 7a87f4da20087ed6412425a27db8d8bf3e57e7d8 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sun, 6 Oct 2019 21:15:23 -0400 Subject: [PATCH 32/55] Added assertion for batch size of 1 and clip length -1 --- datasets/abstract_datasets.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/datasets/abstract_datasets.py b/datasets/abstract_datasets.py index c34fc7f..a159b25 100644 --- a/datasets/abstract_datasets.py +++ b/datasets/abstract_datasets.py @@ -40,6 +40,9 @@ def __init__(self, *args, **kwargs): self.crop_type = kwargs['crop_type'] self.final_shape = kwargs['final_shape'] + #Experiment arguments + self.batch_size = kwargs['batch_size'] + # Creates the self.samples list which will be indexed by each __getitem__ call self._getClips() @@ -120,7 +123,10 @@ def _extractClips(self, video): if self.clip_length == -1: # This is a special case where we will return the entire video # This setting can only be used when the batch size is
set to 1 + + # Batch size must equal one or dataloader items may have varying lengths + # and can't be stacked i.e. throws an error + assert(self.batch_size == 1) return [video] From 3b25314ba7343bfe44636e13f1ad26f44fd0c7dc Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sun, 6 Oct 2019 21:45:02 -0400 Subject: [PATCH 33/55] Computes accuracy on validation, needs to write to JSON file for testing --- eval.py | 14 -------------- metrics.py | 6 ------ 2 files changed, 20 deletions(-) diff --git a/eval.py b/eval.py index ed9e8bb..0df0b8b 100644 --- a/eval.py +++ b/eval.py @@ -107,14 +107,6 @@ def eval(**args): x_input[i] = item.to(device) outputs = model(*x_input) - if ret_data is None: - ret_data = outputs.cpu().numpy() - ret_labels = annotations['labels'].cpu().numpy()[:, 0] - - else: - ret_data = np.vstack((ret_data, outputs.cpu().numpy())) - ret_labels = np.hstack((ret_labels, annotations['labels'].cpu().numpy()[:, 0])) - # END IF @@ -126,12 +118,6 @@ def eval(**args): print('Accuracy of the network on the {} set: {:.3f} %\n'.format(args['load_type'], 100.*acc)) if not args['debug']: - ret_dict = {} - ret_dict['data'] = ret_data - ret_dict['labels'] = ret_labels - import scipy.io as sio - sio.savemat(os.path.join(result_dir,args['load_type']+'_'+args['dataset']+'.mat'), ret_dict) - writer.add_scalar(args['dataset']+'/'+args['model']+'/'+args['load_type']+'_accuracy', 100.*acc) # Close Tensorboard Element writer.close() diff --git a/metrics.py b/metrics.py index 0c137c4..3b56bd9 100644 --- a/metrics.py +++ b/metrics.py @@ -637,11 +637,6 @@ def get_accuracy(self, predictions, data): for (i,h,m) in results: self.ba_score[i].append((h,m)) - - self.count += N - if self.count < self.ndata: - return -1 - if self.test_mode: #Annotations for the testing split are not publicly available return -1 @@ -657,5 +652,4 @@ def get_accuracy(self, predictions, data): #print('BA for {}(...): {:.4f}'.format(k, cur_hit/(cur_hit+cur_miss))) ba_final.append(cur_hit/(cur_hit+cur_miss)) - #print('The overall BA is: {:.4f}'.format(np.mean(ba_final))) return np.mean(ba_final) From ba2edb33566a62bebe91d225ec83520cf5cf46dd Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Tue, 8 Oct 2019 17:42:25 -0400 Subject: [PATCH 34/55] Generate JSON submissions file for test split to submit to eval server --- datasets/YC2BB.py | 6 +++--- metrics.py | 52 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py index 0f511f8..0123a0e 100644 --- a/datasets/YC2BB.py +++ b/datasets/YC2BB.py @@ -26,7 +26,7 @@ def __init__(self, *args, **kwargs): self.load_type = kwargs['load_type'] - self.max_objects = 15 + self.max_objects = 20 self.class_dict = _get_class_labels(class_file) ''' if self.load_type=='train': @@ -124,10 +124,10 @@ def __getitem__(self, idx): trackid = obj['trackid'] if self.load_type == 'test' or self.load_type == 'train': #Annotations for test set not publicly available, train not annotated - bbox_data[trackid, frame_ind] = -1*np.ones(5) + bbox_data[trackid, frame_ind] = [label, -1, -1, -1, -1] else: if obj['occ'] or obj['outside']: - bbox_data[trackid, frame_ind] = -1*np.ones(5) + bbox_data[trackid, frame_ind] = [label, -1, -1, -1, -1] else: obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] diff --git a/metrics.py b/metrics.py index 3b56bd9..d051bd7 100644 --- a/metrics.py +++ b/metrics.py @@ -1,6 +1,9 @@ -import torch +import os +import json import numpy as np +import torch + class Metrics(object): def __init__(self, *args, 
**kwargs): """ @@ -530,14 +533,26 @@ class Box_Accuracy(): def __init__(self, *args, **kwargs): from collections import defaultdict - self.thresh = kwargs['accu_thresh'] - self.fps = kwargs['fps'] - self.test_mode = 1 if kwargs['load_type'] == 'test' else 0 - self.IOU = IOU() - self.ba_score = defaultdict(list) #box accuracy metric + self.result_dir = os.path.join(kwargs['result_dir'], 'submission_yc2_bb.json') + self.thresh = kwargs['accu_thresh'] + self.fps = kwargs['fps'] + self.debug = kwargs['debug'] + self.test_mode = 1 if kwargs['load_type'] == 'test' else 0 + self.IOU = IOU() + self.ba_score = defaultdict(list) #box accuracy metric + + if self.test_mode: + print('*'*62) + print('* [WARNING] Eval unavailable for the test set! *\ + \n* Results will be saved to: '+self.result_dir+' *\ + \n* Please submit your results to the eval server! *') + print('*'*62) self.ndata = kwargs['ndata'] self.count = 0 + + self.json_data = {} + self.database = {} def get_accuracy(self, predictions, data): """ @@ -560,6 +575,7 @@ def get_accuracy(self, predictions, data): attn_weights = predictions N = attn_weights.shape[0] + self.count += N rpn_batch = data['rpn_original'] box_batch = data['box'] @@ -636,8 +652,28 @@ def get_accuracy(self, predictions, data): for (i,h,m) in results: self.ba_score[i].append((h,m)) - - if self.test_mode: #Annotations for the testing split are not publicly available + + #Annotations for the testing split are not publicly available + if self.test_mode: + split, rec, video_name, segment = vis_name[0].split('_-_') + + if video_name not in self.database: + self.database[video_name] = {} + self.database[video_name]['recipe_type'] = rec + if 'segments' not in self.database[video_name]: + self.database[video_name]['segments'] = {} + + self.database[video_name]['segments'][int(segment)] = segment_dict + + #Predictions will be saved to JSON file (if not in debug mode) + if self.count >= self.ndata and not self.debug: + self.json_data['database'] = self.database + + with open(self.result_dir, 'w') as f: + json.dump(self.json_data, f) + + print('Saved submission file to: {}'.format(self.result_dir)) + return -1 ba_final = [] From b89da503804463e337fb59ab67aa4142be83c960 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 9 Oct 2019 11:17:16 -0400 Subject: [PATCH 35/55] Add training for YC2BB model, need to clean-up and add comments --- datasets/YC2BB.py | 144 ++++++++++++++++++++++++++++++++------------ losses.py | 56 ++++++++++++++++- models/dvsa/dvsa.py | 9 ++- train.py | 30 ++++++--- 4 files changed, 186 insertions(+), 53 deletions(-) diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py index 0123a0e..d15d5d4 100644 --- a/datasets/YC2BB.py +++ b/datasets/YC2BB.py @@ -23,17 +23,13 @@ def __init__(self, *args, **kwargs): num_proposals = kwargs['yc2bb_num_proposals'] rpn_proposal_root = kwargs['yc2bb_rpn_proposal_root'] roi_pooled_feat_root = kwargs['yc2bb_roi_pooled_feat_root'] + self.num_frm = kwargs['yc2bb_num_frm'] self.load_type = kwargs['load_type'] self.max_objects = 20 + self.num_class = kwargs['labels'] self.class_dict = _get_class_labels(class_file) - ''' - if self.load_type=='train': - self.transforms = kwargs['model_obj'].train_transforms - else: - self.transforms = kwargs['model_obj'].test_transforms - ''' sentences_proc, segments_tuple = _get_segments_and_sentences(self.samples, self.load_type) @@ -64,35 +60,38 @@ def __init__(self, *args, **kwargs): self.num_proposals = num_proposals self.roi_pooled_feat_root = roi_pooled_feat_root - ''' - with open(self.gt_box_file, 
'r') as f: - self.data_all = json.load(f) - - # read gt bounding boxes O x T/25 x (id, ytl, xtl, ybr, xbr) - # coordinates are 0-indexed - for i, t in enumerate(segments_tuple): - vid = t[2] - seg = str(int(t[3])) - - # if video has no annotations, continue - if not vid in self.data_all['database']: - continue - - # check if ground truth bounding box exists for segment - if seg in self.data_all['database'][vid]['segments'].keys(): - s = sentences_proc[i] - inc_flag = 0 + #Extract all dictionary words from each input sentence + #Only for the training set b/c it's un-annotated + self.sample_obj_labels = [] + idx_to_remove = [] + if self.load_type == 'train': + total_seg = len(self.samples) + for idx, sample in enumerate(self.samples): + sentence = sample['frames'][0]['sentence'].split(' ') obj_label = [] - for w in s: - if self.class_dict.get(w, -1) >= 0: - obj_label.append(self.class_dict[w]) + inc_flag = 0 + for w in sentence: + if self.class_dict.get(w,-1) >= 0: + obj_label.append(self.class_dict[w]) inc_flag = 1 if inc_flag: - self.sample_lst.append((t, obj_label)) + self.sample_obj_labels.append(obj_label) + else: + idx_to_remove.append(idx) + + #Remove segments without object from dictionay + self.samples[:] = [s for idx,s in enumerate(self.samples) if idx not in idx_to_remove] + + assert(len(self.samples) == len(self.sample_obj_labels)) + + print('{}/{} valid segments in {} split'.format(len(self.samples), total_seg, self.load_type)) - print('# of segments for {}: {}, percentage in the raw data: {:.2f}'.format( - split_lst, len(self.sample_lst), len(self.sample_lst)/len(sentences_proc))) + ''' + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + else: + self.transforms = kwargs['model_obj'].test_transforms ''' #Reverse-mapping between class index to canonical label name @@ -192,15 +191,84 @@ def __getitem__(self, idx): vis_name = '_-_'.join((self.yc2_split, rec, vid, seg)) ret_dict = dict() - ret_dict['data'] = [x_rpn, obj_label, self.load_type] - annot_dict = dict() - annot_dict['box'] = bbox_data - annot_dict['box_label'] = obj_label - annot_dict['rpn'] = rpn - annot_dict['rpn_original'] = rpn_original - annot_dict['vis_name'] = vis_name - annot_dict['class_labels_dict'] = self._get_class_labels_reverse() + + if self.load_type == 'train': #Training input data is generated differently + # randomly sample 5 frames from 5 uniform intervals + T = x_rpn.size(1) + itv = T*1./self.num_frm + ind = [min(T-1, int((i+np.random.rand())*itv)) for i in range(self.num_frm)] + x_rpn = x_rpn[:, ind, :] + + obj_label = self.sample_obj_labels[idx] + + #Generate positive example + obj_tensor = torch.tensor(obj_label, dtype=torch.long) + obj_tensor = torch.cat((obj_tensor, torch.LongTensor(self.max_objects - len(obj_label)).fill_(self.num_class))) #padding + pos_sample = [x_rpn, obj_tensor] + + #Sample negative example + total_s = len(self.samples) + neg_index = np.random.randint(total_s) + #Shouldn't include any overlapping object + while len(set(obj_label).intersection(set(self.sample_obj_labels[neg_index]))) != 0: + neg_index = np.random.randint(total_s) + + vid_info = self.samples[neg_index] + + base_path = vid_info['base_path'] + width, height = vid_info['frame_size'] + num_frames_1fps = len(vid_info['frames']) + rec = base_path.split('/')[-3] + vid = base_path.split('/')[-2] + seg = base_path.split('/')[-1] + + # rpn object propoals + rpn = [] + x_rpn = [] + frm=1 + + feat_name = vid+'_'+seg+'.pth' + img_name = vid+'_'+seg+'_'+str(frm).zfill(4)+'.jpg' + x_rpn = 
torch.load(os.path.join(self.roi_pooled_feat_root, self.yc2_split, feat_name)) + while self.rpn_dict.get(img_name, -1) > -1: + ind = self.rpn_dict[img_name] + rpn.append(self.rpn_chunk[ind]) + frm+=1 + img_name = vid+'_'+seg+'_'+str(frm).zfill(4)+'.jpg' + + rpn = torch.stack(rpn) # number of frames x number of proposals per frame x 4 + rpn = rpn[:, :self.num_proposals, :] + + x_rpn = x_rpn.permute(2,0,1).contiguous() # encoding size x number of frames x number of proposals + x_rpn = x_rpn[:, :, :self.num_proposals] + + # randomly sample 5 frames from 5 uniform intervals + T = x_rpn.size(1) + itv = T*1./self.num_frm + ind = [min(T-1, int((i+np.random.rand())*itv)) for i in range(self.num_frm)] + x_rpn = x_rpn[:, ind, :] + + #Generate negative example + neg_obj_label = self.sample_obj_labels[neg_index] + obj_tensor = torch.tensor(neg_obj_label, dtype=torch.long) + obj_tensor = torch.cat((obj_tensor, torch.LongTensor(self.max_objects - len(neg_obj_label)).fill_(self.num_class))) #padding + neg_sample = [x_rpn, obj_tensor] + + output = [torch.stack(i) for i in zip(pos_sample, neg_sample)] + output.append(self.load_type) + ret_dict['data'] = output + + else: #Validation or Testing set + ret_dict['data'] = [x_rpn, obj_label, self.load_type] + + annot_dict['box'] = bbox_data + annot_dict['box_label'] = obj_label + annot_dict['rpn'] = rpn + annot_dict['rpn_original'] = rpn_original + annot_dict['vis_name'] = vis_name + annot_dict['class_labels_dict'] = self._get_class_labels_reverse() + ret_dict['annots'] = annot_dict return ret_dict diff --git a/losses.py b/losses.py index c11d14d..2b2a3cb 100644 --- a/losses.py +++ b/losses.py @@ -1,10 +1,12 @@ -import torch -import torch.nn as nn import numpy as np from scipy import ndimage import os import cv2 +import torch +import torch.nn as nn +import torch.nn.functional as F + class Losses(object): def __init__(self, *args, **kwargs): #loss_type, size_average=None, reduce=None, reduction='mean', *args, **kwargs): @@ -27,6 +29,9 @@ def __init__(self, *args, **kwargs): #loss_type, size_average=None, reduce=None, elif self.loss_type == 'M_XENTROPY': self.loss_object = M_XENTROPY(*args, **kwargs) + elif self.loss_type == 'YC2BB_Attention_Loss': + self.loss_object = YC2BB_Attention_Loss(*args, **kwargs) + else: print('Invalid loss type selected. Quitting!') exit(1) @@ -107,3 +112,50 @@ def loss(self, predictions, data): one_hot = torch.Tensor(one_hot).cuda() return torch.mean(torch.sum(-one_hot * self.logsoftmax(predictions), dim=1)) + +class YC2BB_Attention_Loss(object): + def __init__(self, *args, **kwargs): + """ + Frame-wise attention loss used in: + + Weakly-supervised, no groundtruth labels are used. + """ + + self.loss_weighting = kwargs['has_loss_weighting'] + self.obj_interact = kwargs['obj_interact'] + self.ranking_margin = kwargs['ranking_margin'] + self.loss_factor = kwargs['loss_factor'] + + def loss(self, predictions, data): + """ + Args: + predictions (List): + - output (): + - loss weighting(): + data (NoneType) + + Return: + Frame-wise weighting loss + """ + output, loss_weigh = predictions + + if self.loss_weighting or self.obj_interact: + rank_batch = F.margin_ranking_loss(output[:,0:1], output[:,1:2], + torch.ones(output.size()).type(output.data.type()), margin=self.ranking_margin, reduction='none') + if self.loss_weighting and self.obj_interact: + loss_weigh = (output[:, 0:1]+loss_weigh)/2. 
# avg + elif self.loss_weighting: + loss_weigh = output[:,0:1] + else: + loss_weigh = loss_weigh.unsqueeze(1) + # ranking loss + cls_loss = self.loss_factor*(rank_batch*loss_weigh).mean()+ \ + (1-self.loss_factor)*-torch.log(2*loss_weigh).mean() + else: + # ranking loss + cls_loss = F.margin_ranking_loss(output[:,0:1], output[:,1:2], + torch.Tensor([[1],[1]]).type(output.data.type()), margin=self.ranking_margin) + + + return cls_loss + diff --git a/models/dvsa/dvsa.py b/models/dvsa/dvsa.py index 0543036..2092345 100644 --- a/models/dvsa/dvsa.py +++ b/models/dvsa/dvsa.py @@ -1,7 +1,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch.autograd import Variable import math import numpy as np from functools import partial @@ -14,7 +13,7 @@ class DVSA(nn.Module): # def __init__(self, num_class, input_size=2048, enc_size=128, dropout=0.2, hidden_size=256, n_layers=1, n_heads=4, attn_drop=0.2, num_frm=5, has_loss_weighting=False): def __init__(self, **kwargs): super().__init__() - num_class = kwargs['num_class'] + num_class = kwargs['labels'] input_size = kwargs['input_size'] enc_size = kwargs['enc_size'] dropout = kwargs['dropout'] @@ -22,7 +21,7 @@ def __init__(self, **kwargs): n_layers = kwargs['n_layers'] n_heads = kwargs['n_heads'] attn_drop = kwargs['attn_drop'] - num_frm = kwargs['num_frm'] + num_frm = kwargs['yc2bb_num_frm'] has_loss_weighting = kwargs['has_loss_weighting'] # encode the region feature @@ -62,6 +61,10 @@ def forward(self, x_o, obj, load_type): if is_evaluate: return self.output_attn(x_o, obj) + #only a single batch expected + x_o = x_o[0] + obj = obj[0] + x_o = self.feat_enc(x_o.permute(0,2,3,1).contiguous()).permute(0,3,1,2).contiguous() x_o = torch.stack([x_o[0], x_o[1], x_o[0]]) diff --git a/train.py b/train.py index 7f66ef4..b68440e 100644 --- a/train.py +++ b/train.py @@ -126,7 +126,7 @@ def train(**args): # END IF model_loss = Losses(device=device, **args) - acc_metric = Metrics(**args) + acc_metric = Metrics(**args, result_dir=result_dir, ndata=len(valid_loader.dataset)) best_val_acc = 0.0 ############################################################################################################################################################################ @@ -148,17 +148,27 @@ def train(**args): # END IF - x_input = data['data'].to(device) - annotations = data['annots'] + x_input = data['data'] + annotations = data['annots'] + + if isinstance(x_input, torch.Tensor): + mini_batch_size = x_input.shape[0] + outputs = model(x_input.to(device)) + + assert args['final_shape']==list(x_input.size()[-2:]), "Input to model does not match final_shape argument" + else: #Model takes several inputs in forward function + mini_batch_size = x_input[0].shape[0] #Assuming the first element contains the true data input + for i, item in enumerate(x_input): + if isinstance(item, torch.Tensor): + x_input[i] = item.to(device) + outputs = model(*x_input) - assert args['final_shape']==list(x_input.size()[-2:]), "Input to model does not match final_shape argument" - outputs = model(x_input) loss = model_loss.loss(outputs, annotations) - loss = loss * outputs.shape[0] + loss = loss * mini_batch_size loss.backward() running_loss += loss.item() - running_batch += outputs.shape[0] + running_batch += mini_batch_size if np.isnan(running_loss): import pdb; pdb.set_trace() @@ -173,12 +183,12 @@ def train(**args): # END FOR # Add Loss Element - writer.add_scalar(args['dataset']+'/'+args['model']+'/minibatch_loss', loss.item()/outputs.shape[0], epoch*len(train_loader) 
+ step) + writer.add_scalar(args['dataset']+'/'+args['model']+'/minibatch_loss', loss.item()/mini_batch_size, epoch*len(train_loader) + step) # END IF - if ((epoch*len(train_loader) + step+1) % 100 == 0): - print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.format(epoch, args['epoch'], step+1, len(train_loader), running_loss/float(step+1)/outputs.shape[0])) + if ((epoch*len(train_loader) + step+1) % 1 == 0): + print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.format(epoch, args['epoch'], step+1, len(train_loader), running_loss/float(step+1)/mini_batch_size)) # END IF From 2dc132f2f4f5eade2086a0128cf21b67e8be5fca Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 9 Oct 2019 16:04:13 -0400 Subject: [PATCH 36/55] Clean up parts of code, added comments and code sources --- datasets/YC2BB.py | 52 +++++++++++++++++++++++---------------------- losses.py | 11 ++++++---- models/dvsa/dvsa.py | 15 +++++++++---- train.py | 18 ++++++++++++---- 4 files changed, 59 insertions(+), 37 deletions(-) diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py index d15d5d4..0840a06 100644 --- a/datasets/YC2BB.py +++ b/datasets/YC2BB.py @@ -11,8 +11,12 @@ class YC2BB(DetectionDataset): ''' - YouCook2-Bounding Boxes dataset. Used in weakly-supervised video object grounding task + YouCook2-Bounding Boxes dataset. Introduced in weakly-supervised video object grounding task Paper: https://arxiv.org/pdf/1805.02834.pdf + + training: no bounding box annotations, only sentence describing sentence + validation: bounding box annotations and grounded words available + testing: bounding box annotations not publicly available, only grounded words ''' def __init__(self, *args, **kwargs): super(YC2BB, self).__init__(*args, **kwargs) @@ -97,6 +101,23 @@ def __init__(self, *args, **kwargs): #Reverse-mapping between class index to canonical label name def _get_class_labels_reverse(self): return {v:k for k,v in self.class_dict.items()} + + #For the training set, extract positive and negative samples + def sample_rpn_regions(self, x_rpn, idx): + # randomly sample 5 frames from 5 uniform intervals + T = x_rpn.size(1) + itv = T*1./self.num_frm + ind = [min(T-1, int((i+np.random.rand())*itv)) for i in range(self.num_frm)] + x_rpn = x_rpn[:, ind, :] + + obj_label = self.sample_obj_labels[idx] + + #Generate example + obj_tensor = torch.tensor(obj_label, dtype=torch.long) + obj_tensor = torch.cat((obj_tensor, torch.LongTensor(self.max_objects - len(obj_label)).fill_(self.num_class))) #padding + sample = [x_rpn, obj_tensor] + + return sample def __getitem__(self, idx): vid_info = self.samples[idx] @@ -194,23 +215,13 @@ def __getitem__(self, idx): annot_dict = dict() if self.load_type == 'train': #Training input data is generated differently - # randomly sample 5 frames from 5 uniform intervals - T = x_rpn.size(1) - itv = T*1./self.num_frm - ind = [min(T-1, int((i+np.random.rand())*itv)) for i in range(self.num_frm)] - x_rpn = x_rpn[:, ind, :] + #Generate postive example + pos_sample = self.sample_rpn_regions(x_rpn, idx) - obj_label = self.sample_obj_labels[idx] - - #Generate positive example - obj_tensor = torch.tensor(obj_label, dtype=torch.long) - obj_tensor = torch.cat((obj_tensor, torch.LongTensor(self.max_objects - len(obj_label)).fill_(self.num_class))) #padding - pos_sample = [x_rpn, obj_tensor] - - #Sample negative example + #Sample negative index total_s = len(self.samples) neg_index = np.random.randint(total_s) - #Shouldn't include any overlapping object + #Shouldn't include any overlapping object in description while 
len(set(obj_label).intersection(set(self.sample_obj_labels[neg_index]))) != 0: neg_index = np.random.randint(total_s) @@ -243,17 +254,8 @@ def __getitem__(self, idx): x_rpn = x_rpn.permute(2,0,1).contiguous() # encoding size x number of frames x number of proposals x_rpn = x_rpn[:, :, :self.num_proposals] - # randomly sample 5 frames from 5 uniform intervals - T = x_rpn.size(1) - itv = T*1./self.num_frm - ind = [min(T-1, int((i+np.random.rand())*itv)) for i in range(self.num_frm)] - x_rpn = x_rpn[:, ind, :] - #Generate negative example - neg_obj_label = self.sample_obj_labels[neg_index] - obj_tensor = torch.tensor(neg_obj_label, dtype=torch.long) - obj_tensor = torch.cat((obj_tensor, torch.LongTensor(self.max_objects - len(neg_obj_label)).fill_(self.num_class))) #padding - neg_sample = [x_rpn, obj_tensor] + neg_sample = self.sample_rpn_regions(x_rpn, neg_index) output = [torch.stack(i) for i in zip(pos_sample, neg_sample)] output.append(self.load_type) diff --git a/losses.py b/losses.py index 2b2a3cb..ad3710e 100644 --- a/losses.py +++ b/losses.py @@ -113,10 +113,12 @@ def loss(self, predictions, data): return torch.mean(torch.sum(-one_hot * self.logsoftmax(predictions), dim=1)) +#Code source: https://github.com/MichiganCOG/Video-Grounding-from-Text/blob/master/train.py class YC2BB_Attention_Loss(object): def __init__(self, *args, **kwargs): """ - Frame-wise attention loss used in: + Frame-wise attention loss used in Weakly-Supervised Object Video Grounding... + https://arxiv.org/pdf/1805.02834.pdf Weakly-supervised, no groundtruth labels are used. """ @@ -130,10 +132,11 @@ def loss(self, predictions, data): """ Args: predictions (List): - - output (): - - loss weighting(): - data (NoneType) + - output (Tensor, shape [2*T, 2]): Positive and negative attention weights for each sample + - loss_weigh (Tensor, shape [2*T, 1]): Loss weighting applied to each sampled frame + data (None) + T: number of sampled frames from video (default: 5) Return: Frame-wise weighting loss """ diff --git a/models/dvsa/dvsa.py b/models/dvsa/dvsa.py index 2092345..702e020 100644 --- a/models/dvsa/dvsa.py +++ b/models/dvsa/dvsa.py @@ -1,3 +1,5 @@ +#Code heavily adapted from: https://github.com/MichiganCOG/Video-Grounding-from-Text/blob/master/model/dvsa.py + import torch import torch.nn as nn import torch.nn.functional as F @@ -9,8 +11,14 @@ from models.dvsa.dvsa_utils.transformer import Transformer class DVSA(nn.Module): - -# def __init__(self, num_class, input_size=2048, enc_size=128, dropout=0.2, hidden_size=256, n_layers=1, n_heads=4, attn_drop=0.2, num_frm=5, has_loss_weighting=False): + """ + Deep Visual-Semantic Alignments (DVSA). + Implementation used as baseline in Weakly-Supervised Video Object Grounding... 
+ Source: https://arxiv.org/pdf/1805.02834.pdf + + Original paper: Deep visual-semantic alignments for generating image descriptions + https://cs.stanford.edu/people/karpathy/cvpr2015.pdf + """ def __init__(self, **kwargs): super().__init__() num_class = kwargs['labels'] @@ -125,7 +133,7 @@ def output_attn(self, x_o, obj): x_o = self.feat_enc(x_o.permute(0,2,3,1).contiguous()).permute(0,3,1,2).contiguous() N, C_out, T, num_proposals = x_o.size() - assert(N == 1) # two pos samples and one neg sample + assert(N == 1) # attention O = obj.size(1) @@ -147,4 +155,3 @@ def _load_pretrained_weights(self): state_dict = torch.load('weights/yc2bb_full-model.pth', map_location=lambda storage, location: storage) self.load_state_dict(state_dict) - diff --git a/train.py b/train.py index b68440e..8f44f57 100644 --- a/train.py +++ b/train.py @@ -187,7 +187,7 @@ def train(**args): # END IF - if ((epoch*len(train_loader) + step+1) % 1 == 0): + if ((epoch*len(train_loader) + step+1) % 100 == 0): print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.format(epoch, args['epoch'], step+1, len(train_loader), running_loss/float(step+1)/mini_batch_size)) # END IF @@ -211,7 +211,7 @@ def train(**args): # END FOR: Epoch - + scheduler.step(epoch=epoch) print('Schedulers lr: %f', scheduler.get_lr()[0]) @@ -257,11 +257,21 @@ def valid(valid_loader, running_acc, model, device, acc_metric): with torch.no_grad(): for step, data in enumerate(valid_loader): - x_input = data['data'].to(device) + x_input = data['data'] annotations = data['annots'] - outputs = model(x_input) + + if isinstance(x_input, torch.Tensor): + outputs = model(x_input.to(device)) + else: + for i, item in enumerate(x_input): + if isinstance(item, torch.Tensor): + x_input[i] = item.to(device) + outputs = model(*x_input) running_acc.append(acc_metric.get_accuracy(outputs, annotations)) + + if step % 100 == 0: + print('Step: {}/{} | validation acc: {:.4f}'.format(step, len(valid_loader), running_acc[-1])) # END FOR: Validation Accuracy From b34b0eec63489428907bf2681212cc5187145434 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 9 Oct 2019 16:23:22 -0400 Subject: [PATCH 37/55] Added download links for weight and data files --- models/dvsa/config_test.yaml | 68 ++++++++++++++++++++++++++++++++++++ weights/download_weights.sh | 6 ++-- 2 files changed, 71 insertions(+), 3 deletions(-) create mode 100644 models/dvsa/config_test.yaml diff --git a/models/dvsa/config_test.yaml b/models/dvsa/config_test.yaml new file mode 100644 index 0000000..49ad0f6 --- /dev/null +++ b/models/dvsa/config_test.yaml @@ -0,0 +1,68 @@ +# Preprocessing +clip_length: -1 # Number of frames within a clip +clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) +clip_stride: 1 # Frame offset between successive frames +crop_shape: [112,112] # (Height, Width) of frame +crop_type: Random # Type of cropping operation (Random, Central and None) +final_shape: [112,112] # (Height, Width) of input to be given to CNN +num_clips: 1 # Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips) +random_offset: 0 # Boolean switch to generate a clip length sized clip from a video +resize_shape: [128,171] # (Height, Width) to resize original data +sample_duration: 16 # Temporal size of video to be provided as input to the model +sample_size: 112 # Height of frame to be provided as input to the model +subtract_mean: '' # Subtract mean (R,G,B) from all frames during preprocessing + +# Experiment Setup 
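+# Note (added for clarity): clip_length is set to -1 above, so the loader returns each full video;
+# batch_size below must therefore stay at 1, as enforced by the assertion added to
+# datasets/abstract_datasets.py in PATCH 32/55.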
+acc_metric: Box_Accuracy # Accuracy metric +batch_size: 1 # Numbers of videos in a mini-batch +dataset: YC2BB # Name of dataset +debug: 1 # If True, do not plot, save, or create data files +epoch: 30 # Total number of epochs +exp: exp # Experiment name +gamma: 0.5 # Multiplier with which to change learning rate +grad_max_norm: 1 # Norm for gradient clipping +json_path: /path/to/yc2bb # Path to the json file for the given dataset +labels: 67 # Number of total classes in the dataset +load_type: test # Environment selection, to include only training/training and validation/testing dataset +loss_type: YC2BB_Attention_Loss # Loss function +lr: 0.05 # Learning rate +milestones: [10, 20] # Epoch values to change learning rate +model: DVSA # Name of model to be loaded +momentum: 0.9 # Momentum value in optimizer +num_workers: 2 # Number of CPU worker used to load data +opt: sgd # Name of optimizer +preprocess: default # String argument to select preprocessing type +pretrained: 1 # Load pretrained network +pseudo_batch_loop: 1 # Pseudo-batch size multiplier to mimic large minibatches +rerun: 1 # Number of trials to repeat an experiment +save_dir: './results' # Path to results directory +seed: 999 # Seed for reproducibility +weight_decay: 0.0005 # Weight decay + +# Dataset specific config +yc2bb_class_file: '/path/to/yc2bb/data/class_file.csv' #https://github.com/MichiganCOG/Video-Grounding-from-Text/blob/master/data/class_file.csv +yc2bb_num_frm: 5 +yc2bb_num_proposals: 20 +yc2bb_roi_pooled_feat_root: '/path/to/yc2bb/data/yc2/roi_pooled_feat' #roi_pooled feat download links below +#train: http://youcook2.eecs.umich.edu/static/dat/yc2_bb/roi_pooled_feat_train.tar.gz (113 GB) +#val: http://youcook2.eecs.umich.edu/static/dat/yc2_bb/roi_pooled_feat_val.tar.gz (38 GB) +#test: http://youcook2.eecs.umich.edu/static/dat/yc2_bb/roi_pooled_feat_test.tar.gz (17 GB) +yc2bb_rpn_proposal_root: '/path/to/yc2bb/data/yc2/roi_box' #http://youcook2.eecs.umich.edu/static/dat/yc2_bb/all-box-100.tar.gz + +# Model specific config +attn_drop: 0.2 +dropout: 0.2 +enc_size: 128 +has_loss_weighting: 1 +hidden_size: 256 +input_size: 2048 +loss_factor: 0.9 +n_heads: 4 +n_layers: 1 +#num_class: 67 #NOTE:redundant with labels +obj_interact: 1 +ranking_margin: 0.1 + +# Box accuracy config +accu_thresh: 0.5 +fps: 1 diff --git a/weights/download_weights.sh b/weights/download_weights.sh index 3cbfec6..f5cc2d9 100755 --- a/weights/download_weights.sh +++ b/weights/download_weights.sh @@ -2,9 +2,6 @@ #wget -O [saved_file_name] [direct_download_link] -#GoTurn -wget -O ./weights/goturn.pth.tar https://umich.box.com/shared/static/src6rfm4lpn0v3t4l26d6u0v4ixdwem5.tar - #SSD wget -O ./weights/ssd300_mAP_77.43_v2.pkl https://umich.box.com/shared/static/jszcnnwcvscfyqe3o81xy8qzfbsc20vo.pkl @@ -13,3 +10,6 @@ wget -O ./weights/c3d-pretrained.pth https://umich.box.com/shared/static/znmyt8u #C3D Mean wget -O ./weights/sport1m_train16_128_mean.npy https://umich.box.com/shared/static/ppbnldsa5rty615osdjh2yi8fqcx0a3b.npy + +#YC2BB-Full model +wget -O ./weights/yc2bb_full-model.pth https://umich.box.com/shared/static/5ukbdcawryzkkq4r789z0src6u6uvg3u.pth From caa5db8be8311576bf6ab2b9be3f5310092ad0b8 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 9 Oct 2019 16:29:13 -0400 Subject: [PATCH 38/55] Update README --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 3231468..d46f8a8 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,13 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) 
| Model Architecture | Dataset | ViP Accuracy (%) | |:--------------------:|:------------------:|:---------------------:| | SSD300 | VOC2007 | 76.58 | + +### Video Object Grounding +| Model Architecture | Dataset | ViP Accuracy (%) | +|:--------------------:|:------------------:|:---------------------:| +| DVSA (+fw, obj) | YC2-BB (Validation) | 30.09 | +fw: framewise weighting +obj: object interaction ## Table of Contents * [Datasets](#configured-datasets) @@ -38,12 +45,14 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) |[ImageNetVID](http://bvisionweb1.cs.unc.edu/ilsvrc2015/download-videos-3j16.php) | Video Object Detection | |[MSCOCO 2014](http://cocodataset.org/#download) | Object Detection, Keypoints| |[VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/) | Object Detection, Classification| +|[YC2-BB](http://youcook2.eecs.umich.edu/download)| Video Object Grounding| ## Models | Model | Task(s) | |:------------------------------------------------:|:--------------------:| |[C3D](https://github.com/jfzhang95/pytorch-video-recognition/blob/master/network/C3D_model.py) | Activity Recognition | |[SSD300](https://github.com/amdegroot/ssd.pytorch) | Object Detection | +|[DVSA (+fw, obj)](https://github.com/MichiganCOG/Video-Grounding-from-Text)| Video Object Grounding| ## Requirements From 3103540019adcdd93963ae8b18a8f95f58a67b51 Mon Sep 17 00:00:00 2001 From: natlouis <38472719+natlouis@users.noreply.github.com> Date: Wed, 9 Oct 2019 20:06:50 -0400 Subject: [PATCH 39/55] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d46f8a8..359757b 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,8 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) | Model Architecture | Dataset | ViP Accuracy (%) | |:--------------------:|:------------------:|:---------------------:| | DVSA (+fw, obj) | YC2-BB (Validation) | 30.09 | -fw: framewise weighting -obj: object interaction + +**fw**: framewise weighting, **obj**: object interaction ## Table of Contents * [Datasets](#configured-datasets) From bcdc19f49f80c830b74c4ec311d0f0ed6bb20816 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 9 Oct 2019 20:55:01 -0400 Subject: [PATCH 40/55] small fix --- datasets/YC2BB.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py index 0840a06..10266ca 100644 --- a/datasets/YC2BB.py +++ b/datasets/YC2BB.py @@ -147,7 +147,7 @@ def __getitem__(self, idx): bbox_data[trackid, frame_ind] = [label, -1, -1, -1, -1] else: if obj['occ'] or obj['outside']: - bbox_data[trackid, frame_ind] = [label, -1, -1, -1, -1] + bbox_data[trackid, frame_ind] = [-1, -1, -1, -1, -1] else: obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] From 19ba9db578648dbf98c50dde8c6963d826dca98c Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Thu, 10 Oct 2019 14:33:01 -0400 Subject: [PATCH 41/55] Add scaling parse args and description --- datasets/preprocessing_transforms.py | 1 + parse_args.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 78d0f4a..3bf74d0 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -677,6 +677,7 @@ class RandomZoomClip(PreprocTransform): """ Random zoom on all frames in a clip. All frames receive same scaling Scale will be bounded by object bounding box (if given). 
Meaning, object will always be in view + If zooming out, the borders will be filled with black. >1: Zoom in <1: Zoom out diff --git a/parse_args.py b/parse_args.py index 8c90b31..d2d240d 100644 --- a/parse_args.py +++ b/parse_args.py @@ -47,6 +47,8 @@ def __init__(self): parser.add_argument('--crop_shape', type=int, nargs=2, help='(Height, Width) of frame') parser.add_argument('--crop_type', type=str, help='Type of cropping operation (Random, Center and None)') parser.add_argument('--num_clips', type=int, help='Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips)') + parser.add_argument('--scale', type=float, nargs=2, help='[min scale, max scale] amounts to randomly scale videos for augmentation purposes. scale >1 zooms in and scale <1 zooms out. ') + parser.add_argument('--debug', type=int, help='Run an experiment but do not save any data or create any folders') parser.add_argument('--seed', type=int, help='Seed for reproducibility') @@ -76,7 +78,8 @@ def __init__(self): crop_type = None, num_clips = 1, debug = 0, - seed = 0) + seed = 0, + scale = [1,1]) From 009c1446de9e8cc94676acc3c5c79feb8cdce4eb Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 10 Oct 2019 14:55:06 -0400 Subject: [PATCH 42/55] update requirements --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index 5fa42ed..48e72fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,6 @@ tensorboardX==1.8 scipy==1.3.0 torch==1.1.0 torchvision==0.3.0 + +torchtext==0.2.1 +spacy==2.1.8 #Install 'en' package with: python -m spacy download en From cc1aaa2ab607e955b581c014faa935f24c4f72cd Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 10 Oct 2019 15:03:21 -0400 Subject: [PATCH 43/55] Auto-install spacy package --- install.sh | 1 + requirements.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/install.sh b/install.sh index eda0fe7..c443541 100755 --- a/install.sh +++ b/install.sh @@ -1,4 +1,5 @@ #!/bin/bash pip3 install -r requirements.txt +python -m spacy download en ./weights/download_weights.sh diff --git a/requirements.txt b/requirements.txt index 48e72fe..de36ea1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,4 @@ torch==1.1.0 torchvision==0.3.0 torchtext==0.2.1 -spacy==2.1.8 #Install 'en' package with: python -m spacy download en +spacy==2.1.8 From 0ef84c31644bed792ca28686f53c3c51131a8711 Mon Sep 17 00:00:00 2001 From: Madan Date: Tue, 22 Oct 2019 15:19:08 -0400 Subject: [PATCH 44/55] resolving issue of resetting validation accuracy by moving metric init into validation function --- train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/train.py b/train.py index 7f66ef4..e2e72e8 100644 --- a/train.py +++ b/train.py @@ -126,7 +126,6 @@ def train(**args): # END IF model_loss = Losses(device=device, **args) - acc_metric = Metrics(**args) best_val_acc = 0.0 ############################################################################################################################################################################ @@ -214,7 +213,7 @@ def train(**args): ## START FOR: Validation Accuracy running_acc = [] - running_acc = valid(valid_loader, running_acc, model, device, acc_metric) + running_acc = valid(valid_loader, running_acc, model, device) if not args['debug']: writer.add_scalar(args['dataset']+'/'+args['model']+'/validation_accuracy', 100.*running_acc[-1], epoch*len(train_loader) + step) @@ -242,9 
+241,10 @@ def train(**args): # Close Tensorboard Element writer.close() -def valid(valid_loader, running_acc, model, device, acc_metric): +def valid(valid_loader, running_acc, model, device): + acc_metric = Metrics(**args) model.eval() - + with torch.no_grad(): for step, data in enumerate(valid_loader): x_input = data['data'].to(device) From 901a7e0d8757ea493819b67252697f9ba0fd6f23 Mon Sep 17 00:00:00 2001 From: Madan Date: Tue, 22 Oct 2019 15:33:44 -0400 Subject: [PATCH 45/55] Removing extra code pertaining to feature extraction --- eval.py | 23 +---------------------- models/c3d/c3d.py | 5 ----- models/c3d/config_test.yaml | 2 -- models/c3d/config_train.yaml | 2 -- 4 files changed, 1 insertion(+), 31 deletions(-) diff --git a/eval.py b/eval.py index 8b1a57f..0df5a15 100644 --- a/eval.py +++ b/eval.py @@ -91,28 +91,13 @@ def eval(**args): # Setup Model To Evaluate model.eval() - ret_data = None - ret_labels = None - with torch.no_grad(): for step, data in enumerate(eval_loader): x_input = data['data'].to(device) annotations = data['annots'] outputs = model(x_input) - - if ret_data is None: - ret_data = outputs.cpu().numpy() - ret_labels = annotations['labels'].cpu().numpy()[:, 0] - - else: - ret_data = np.vstack((ret_data, outputs.cpu().numpy())) - ret_labels = np.hstack((ret_labels, annotations['labels'].cpu().numpy()[:, 0])) - - # END IF - - - acc = acc_metric.get_accuracy(outputs, annotations) + acc = acc_metric.get_accuracy(outputs, annotations) if step % 100 == 0: print('Step: {}/{} | {} acc: {:.4f}'.format(step, len(eval_loader), args['load_type'], acc)) @@ -120,12 +105,6 @@ def eval(**args): print('Accuracy of the network on the {} set: {:.3f} %\n'.format(args['load_type'], 100.*acc)) if not args['debug']: - ret_dict = {} - ret_dict['data'] = ret_data - ret_dict['labels'] = ret_labels - import scipy.io as sio - sio.savemat(os.path.join(result_dir,args['load_type']+'_'+args['dataset']+'.mat'), ret_dict) - writer.add_scalar(args['dataset']+'/'+args['model']+'/'+args['load_type']+'_accuracy', 100.*acc) # Close Tensorboard Element writer.close() diff --git a/models/c3d/c3d.py b/models/c3d/c3d.py index dfd78fc..69e4a74 100644 --- a/models/c3d/c3d.py +++ b/models/c3d/c3d.py @@ -55,8 +55,6 @@ def __init__(self, **kwargs): if isinstance(kwargs['pretrained'], int) and kwargs['pretrained']: self.__load_pretrained_weights() - self.features = kwargs['model_features'] - def forward(self, x, labels=False): x = self.relu(self.conv1(x)) x = self.pool1(x) @@ -80,9 +78,6 @@ def forward(self, x, labels=False): x = self.relu(self.fc6(x)) - if self.features: - return x - x = self.dropout(x) x = self.relu(self.fc7(x)) x = self.dropout(x) diff --git a/models/c3d/config_test.yaml b/models/c3d/config_test.yaml index 08f4e1c..d95e622 100644 --- a/models/c3d/config_test.yaml +++ b/models/c3d/config_test.yaml @@ -36,5 +36,3 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay - -model_features: 0 # 1 - return model features (before prediction), 0 - return model prediction output diff --git a/models/c3d/config_train.yaml b/models/c3d/config_train.yaml index 65a0cb2..4456e0d 100644 --- a/models/c3d/config_train.yaml +++ b/models/c3d/config_train.yaml @@ -36,5 +36,3 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay - -model_features: 0 # 1 - return model features 
(before prediction), 0 - return model prediction output From 2f26a0e1a2caa430b90cb53679efcc552628079c Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Thu, 24 Oct 2019 19:49:42 -0400 Subject: [PATCH 46/55] Add dataset loader and a test to load a video --- datasets/DHF1K.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 datasets/DHF1K.py diff --git a/datasets/DHF1K.py b/datasets/DHF1K.py new file mode 100644 index 0000000..d186810 --- /dev/null +++ b/datasets/DHF1K.py @@ -0,0 +1,104 @@ +import torch +try: + from .abstract_datasets import DetectionDataset +except: + from abstract_datasets import DetectionDataset +import cv2 +import os +import numpy as np +import json +try: + import datasets.preprocessing_transforms as pt +except: + import preprocessing_transforms as pt + +class DHF1K(DetectionDataset): + def __init__(self, *args, **kwargs): + super(DHF1K, self).__init__(*args, **kwargs) + + # Get model object in case preprocessing other than default is used + self.model_object = kwargs['model_obj'] + self.load_type = kwargs['load_type'] + + print(self.load_type) + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + + else: + self.transforms = kwargs['model_obj'].test_transforms + + + + + def __getitem__(self, idx): + vid_info = self.samples[idx] + + + base_path = vid_info['base_path'] + vid_size = vid_info['frame_size'] + + input_data = [] + map_data = [] + bin_data = [] + + for frame_ind in range(len(vid_info['frames'])): + frame = vid_info['frames'][frame_ind] + frame_path = frame['img_path'] + map_path = frame['map_path'] + bin_path = frame['bin_path'] + + # Load frame, convert to RGB from BGR and normalize from 0 to 1 + input_data.append(cv2.imread(os.path.join(base_path, frame_path))[...,::-1]/255.) + + # Load frame, Normalize from 0 to 1 + # All frame channels have repeated values + map_data.append(cv2.imread(map_path)/255.) + bin_data.append(cv2.imread(bin_path)/255.) 
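+            # 'map_path' points to DHF1K's continuous saliency map and 'bin_path' to its binary fixation map
+            # (the 'maps' and 'fixation' annotation folders); both stay at the original frame resolution here,
+            # since the annotations are only resized later in the loss/metric.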
+ + + + vid_data = self.transforms(input_data) + + # Annotations must be resized in the loss/metric + map_data = torch.Tensor(map_data) + bin_data = torch.Tensor(bin_data) + + # Permute the PIL dimensions (Frame, Height, Width, Chan) to pytorch (Chan, frame, height, width) + vid_data = vid_data.permute(3, 0, 1, 2) + map_data = map_data.permute(3, 0, 1, 2) + bin_data = bin_data.permute(3, 0, 1, 2) + # All channels are repeated so remove the unnecessary channels + map_data = map_data[0].unsqueeze(0) + bin_data = bin_data[0].unsqueeze(0) + + + ret_dict = dict() + ret_dict['data'] = vid_data + + annot_dict = dict() + annot_dict['map'] = map_data + annot_dict['bin'] = bin_data + annot_dict['input_shape'] = vid_data.size() + annot_dict['name'] = base_path + ret_dict['annots'] = annot_dict + + return ret_dict + + +if __name__=='__main__': + + class tts(): + def __call__(self, x): + return pt.ToTensorClip()(x) + class debug_model(): + def __init__(self): + self.train_transforms = tts() + dataset = DHF1K(model_obj=debug_model(), json_path='/z/home/erichof/datasets/DHF1K', load_type='train', clip_length=16, clip_offset=0, clip_stride=1, num_clips=0, random_offset=0, resize_shape=0, crop_shape=0, crop_type='Center', final_shape=0, batch_size=1) + train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=1, shuffle=False) + + + import matplotlib.pyplot as plt + for x in enumerate(train_loader): + plt.imshow(x[1]['data'][0,:,0].permute(1,2,0).numpy()) + #plt.show() + import pdb; pdb.set_trace() From 46cb1823dec95c3ba65d7497b0eed2b330141a34 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Thu, 24 Oct 2019 19:52:08 -0400 Subject: [PATCH 47/55] Add json generation script --- datasets/DHF1K.py | 3 +- datasets/scripts/gen_json_DHF1K.py | 74 ++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 datasets/scripts/gen_json_DHF1K.py diff --git a/datasets/DHF1K.py b/datasets/DHF1K.py index d186810..467bb15 100644 --- a/datasets/DHF1K.py +++ b/datasets/DHF1K.py @@ -93,7 +93,8 @@ def __call__(self, x): class debug_model(): def __init__(self): self.train_transforms = tts() - dataset = DHF1K(model_obj=debug_model(), json_path='/z/home/erichof/datasets/DHF1K', load_type='train', clip_length=16, clip_offset=0, clip_stride=1, num_clips=0, random_offset=0, resize_shape=0, crop_shape=0, crop_type='Center', final_shape=0, batch_size=1) + json_path = '/path/to/json' + dataset = DHF1K(model_obj=debug_model(), json_path=json_path, load_type='train', clip_length=16, clip_offset=0, clip_stride=1, num_clips=0, random_offset=0, resize_shape=0, crop_shape=0, crop_type='Center', final_shape=0, batch_size=1) train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=1, shuffle=False) diff --git a/datasets/scripts/gen_json_DHF1K.py b/datasets/scripts/gen_json_DHF1K.py new file mode 100644 index 0000000..02c05eb --- /dev/null +++ b/datasets/scripts/gen_json_DHF1K.py @@ -0,0 +1,74 @@ +import os +import cv2 +import json + + +def get_split(base_vid_path): + vids = os.listdir(base_vid_path) + vids = [int(vid) for vid in vids] + vids.sort() + + # Out of the 1000 videos, the first 600 are annotated for training, 601-700 annotated for val, 701-1000 not annotated must be sent in to test + train_cutoff = 600 + val_cutoff = 700 + train_vids = vids[:vids.index(600)+1] + val_vids = vids[vids.index(600)+1:vids.index(700)+1] + test_vids = vids[vids.index(700)+1:] + + train_vids = [str(vid).zfill(3) for vid in train_vids] + test_vids = [str(vid).zfill(3) for vid in test_vids] + 
val_vids = [str(vid).zfill(3) for vid in val_vids] + annot_train_vids = [vid.zfill(4) for vid in train_vids] + annot_val_vids = [vid.zfill(4) for vid in val_vids] + return train_vids, test_vids, val_vids, annot_train_vids, annot_val_vids + + +def save_json(load_type): + base_vid_path = '/path/to/DHF1K/video_png' + base_annot_path = '/path/to/DHF1K/annotation' + output_path = '/any/path/' + + train_vids, test_vids, val_vids, annot_train, annot_val = get_split(base_vid_path) + + if load_type == 'train': + tv_vids = train_vids + tv_ann = annot_train + elif load_type == 'val': + tv_vids = val_vids + tv_ann = annot_val + + else: + tv_vids = test_vids + tv_ann = [] + + json_dat = [] + for vid in sorted(tv_vids): + vid_dict = {} + frames = [] + frame_size = [] + for img in sorted(os.listdir(os.path.join(base_vid_path, vid))): + if frame_size == []: + frame_shape = cv2.imread(os.path.join(base_vid_path, vid, img)).shape + frame_size = [frame_shape[1], frame_shape[0]] # Width, Height + frame_dict = {} + frame_dict['img_path'] = img + if load_type != 'test': + frame_dict['map_path'] = os.path.join(base_annot_path, tv_ann[tv_vids.index(vid)], 'maps', img) + frame_dict['bin_path'] = os.path.join(base_annot_path, tv_ann[tv_vids.index(vid)], 'fixation', img) + else: + frame_dict['map_path'] = '' + frame_dict['bin_path'] = '' + + frames.append(frame_dict) + vid_dict['base_path'] = os.path.join(base_vid_path, vid) + vid_dict['frames'] = frames + vid_dict['frame_size'] = frame_size + json_dat.append(vid_dict) + + writef = open(os.path.join(output_path,load_type+'.json'), 'w') + json.dump(json_dat, writef) + writef.close() + +save_json('train') +save_json('val') +save_json('test') From d2d406994c08287a3e57c5f2764f8408a2300065 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Thu, 24 Oct 2019 20:27:13 -0400 Subject: [PATCH 48/55] Add i3d model and weights links and train/test scripts --- .gitignore | 1 + README.md | 2 + models/i3d/config_test.yaml | 27 +++ models/i3d/config_train.yaml | 37 +++ models/i3d/i3d.py | 447 +++++++++++++++++++++++++++++++++++ weights/download_weights.sh | 6 + 6 files changed, 520 insertions(+) create mode 100644 models/i3d/config_test.yaml create mode 100644 models/i3d/config_train.yaml create mode 100644 models/i3d/i3d.py diff --git a/.gitignore b/.gitignore index c70c4d2..0e4172d 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ runs/* models/HGC3D *.json pbs/* +weights/* diff --git a/README.md b/README.md index 3231468..74bc01e 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) ### Recognition | Model Architecture | Dataset | ViP Accuracy (%) | |:--------------------:|:------------------:|:---------------------:| +| I3D | HMDB51 (Split 1) | 72.75 | | C3D | HMDB51 (Split 1) | 50.14 ± 0.777 | | C3D | UCF101 (Split 1) | 80.40 ± 0.399 | @@ -43,6 +44,7 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) | Model | Task(s) | |:------------------------------------------------:|:--------------------:| |[C3D](https://github.com/jfzhang95/pytorch-video-recognition/blob/master/network/C3D_model.py) | Activity Recognition | +|[I3D](https://github.com/piergiaj/pytorch-i3d) | Activity Recognition | |[SSD300](https://github.com/amdegroot/ssd.pytorch) | Object Detection | ## Requirements diff --git a/models/i3d/config_test.yaml b/models/i3d/config_test.yaml new file mode 100644 index 0000000..72ae825 --- /dev/null +++ b/models/i3d/config_test.yaml @@ -0,0 +1,27 @@ +# Preprocessing +clip_length: 64 # 
Number of frames within a clip +clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) +clip_stride: 0 # Frame offset between successive frames +crop_shape: [224,224] # (Height, Width) of frame +crop_type: Center # Type of cropping operation (Random, Central and None) +final_shape: [224,224] # (Height, Width) of input to be given to CNN +num_clips: -1 # Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips) +random_offset: 0 # Boolean switch to generate a clip length sized clip from a video +resize_shape: [230,250] # (Height, Width) to resize original data +subtract_mean: [123,117,104] # Subtract mean (R,G,B) from all frames during preprocessing + +# Experiment Setup +acc_metric: Accuracy # Accuracy metric +batch_size: 1 # Numbers of videos in a mini-batch +dataset: HMDB51 # Name of dataset +exp: I3D # Experiment name +json_path: /z/dat/HMDB51/ # Path to the json file for the given dataset +labels: 51 # Number of total classes in the dataset +load_type: train_val # Environment selection, to include only training/training and validation/testing dataset +model: I3D # Name of model to be loaded +num_workers: 5 # Number of CPU worker used to load data +preprocess: default # String argument to select preprocessing type +pretrained: 'weights/i3d_rgb_imagenet_then_HMDB51_30epochs.pkl' # Load pretrained network +save_dir: './results' # Path to results directory +seed: 999 # Seed for reproducibility +loss_type: M_XENTROPY # Loss function diff --git a/models/i3d/config_train.yaml b/models/i3d/config_train.yaml new file mode 100644 index 0000000..c15abe9 --- /dev/null +++ b/models/i3d/config_train.yaml @@ -0,0 +1,37 @@ +# Preprocessing +clip_length: 64 # Number of frames within a clip +clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) +clip_stride: 1 # Frame offset between successive frames +crop_shape: [224,224] # (Height, Width) of frame +crop_type: Center # Type of cropping operation (Random, Central and None) +final_shape: [224,224] # (Height, Width) of input to be given to CNN +num_clips: -1 # Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips) +random_offset: 0 # Boolean switch to generate a clip length sized clip from a video +resize_shape: [230,250] # (Height, Width) to resize original data +subtract_mean: [123,117,104] # Subtract mean (R,G,B) from all frames during preprocessing + +# Experiment Setup +acc_metric: Accuracy # Accuracy metric +batch_size: 5 # Numbers of videos in a mini-batch +pseudo_batch_loop: 10 # Pseudo-batch size multiplier to mimic large minibatches +dataset: HMDB51 # Name of dataset +epoch: 30 # Total number of epochs +exp: I3D # Experiment name +gamma: 0.1 # Multiplier with which to change learning rate +json_path: /z/dat/HMDB51/ # Path to the json file for the given dataset +labels: 51 # Number of total classes in the dataset +load_type: train # Environment selection, to include only training/training and validation/testing dataset +loss_type: M_XENTROPY # Loss function +lr: 0.01 # Learning rate +milestones: [10, 20] # Epoch values to change learning rate +model: I3D # Name of model to be loaded +momentum: 0.9 # Momentum value in optimizer +num_workers: 5 # Number of CPU worker used to load data +opt: sgd # Name of optimizer +preprocess: default # String argument to select preprocessing type +pretrained: 1 # Load pretrained network +rerun: 1 # Number of trials to 
repeat an experiment +save_dir: './results' # Path to results directory +seed: 999 # Seed for reproducibility +weight_decay: 0.0005 # Weight decay +grad_max_norm: 100 diff --git a/models/i3d/i3d.py b/models/i3d/i3d.py new file mode 100644 index 0000000..4e901b3 --- /dev/null +++ b/models/i3d/i3d.py @@ -0,0 +1,447 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +import datasets.preprocessing_transforms as pt + +import numpy as np + +import os +import sys +from collections import OrderedDict + + +""" +Code from the implementation of i3d by AJ Piergiovanni: https://github.com/piergiaj/pytorch-i3d +""" + +class MaxPool3dSamePadding(nn.MaxPool3d): + + def compute_pad(self, dim, s): + if s % self.stride[dim] == 0: + return max(self.kernel_size[dim] - self.stride[dim], 0) + else: + return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) + + def forward(self, x): + # compute 'same' padding + (batch, channel, t, h, w) = x.size() + #print t,h,w + out_t = np.ceil(float(t) / float(self.stride[0])) + out_h = np.ceil(float(h) / float(self.stride[1])) + out_w = np.ceil(float(w) / float(self.stride[2])) + #print out_t, out_h, out_w + pad_t = self.compute_pad(0, t) + pad_h = self.compute_pad(1, h) + pad_w = self.compute_pad(2, w) + #print pad_t, pad_h, pad_w + + pad_t_f = pad_t // 2 + pad_t_b = pad_t - pad_t_f + pad_h_f = pad_h // 2 + pad_h_b = pad_h - pad_h_f + pad_w_f = pad_w // 2 + pad_w_b = pad_w - pad_w_f + + pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) + x = F.pad(x, pad) + return super(MaxPool3dSamePadding, self).forward(x) + + +class Unit3D(nn.Module): + + def __init__(self, in_channels, + output_channels, + kernel_shape=(1, 1, 1), + stride=(1, 1, 1), + padding=0, + activation_fn=F.relu, + use_batch_norm=True, + use_bias=False, + name='unit_3d', + dilation=1): + + """Initializes Unit3D module.""" + super(Unit3D, self).__init__() + + self._output_channels = output_channels + self._kernel_shape = kernel_shape + self._stride = stride + self._use_batch_norm = use_batch_norm + self._activation_fn = activation_fn + self._use_bias = use_bias + self.name = name + self.padding = padding + + self.conv3d = nn.Conv3d(in_channels=in_channels, + out_channels=self._output_channels, + kernel_size=self._kernel_shape, + stride=self._stride, + padding=0, # we always want padding to be 0 here. 
We will dynamically pad based on input size in forward function + bias=self._use_bias, + dilation=dilation) + + if self._use_batch_norm: + self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01) + + def compute_pad(self, dim, s): + if s % self._stride[dim] == 0: + return max(self._kernel_shape[dim] - self._stride[dim], 0) + else: + return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) + + + def forward(self, x): + # compute 'same' padding + (batch, channel, t, h, w) = x.size() + #print t,h,w + out_t = np.ceil(float(t) / float(self._stride[0])) + out_h = np.ceil(float(h) / float(self._stride[1])) + out_w = np.ceil(float(w) / float(self._stride[2])) + #print out_t, out_h, out_w + pad_t = self.compute_pad(0, t) + pad_h = self.compute_pad(1, h) + pad_w = self.compute_pad(2, w) + #print pad_t, pad_h, pad_w + + pad_t_f = pad_t // 2 + pad_t_b = pad_t - pad_t_f + pad_h_f = pad_h // 2 + pad_h_b = pad_h - pad_h_f + pad_w_f = pad_w // 2 + pad_w_b = pad_w - pad_w_f + + pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) + x = F.pad(x, pad) + + x = self.conv3d(x) + if self._use_batch_norm: + x = self.bn(x) + if self._activation_fn is not None: + x = self._activation_fn(x) + return x + + + +class InceptionModule(nn.Module): + def __init__(self, in_channels, out_channels, name): + super(InceptionModule, self).__init__() + + self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, + name=name+'/Branch_0/Conv3d_0a_1x1') + self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, + name=name+'/Branch_1/Conv3d_0a_1x1') + self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], + name=name+'/Branch_1/Conv3d_0b_3x3') + self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, + name=name+'/Branch_2/Conv3d_0a_1x1') + self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], + name=name+'/Branch_2/Conv3d_0b_3x3') + self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], + stride=(1, 1, 1), padding=0) + self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0, + name=name+'/Branch_3/Conv3d_0b_1x1') + self.name = name + + def forward(self, x): + b0 = self.b0(x) + b1 = self.b1b(self.b1a(x)) + b2 = self.b2b(self.b2a(x)) + b3 = self.b3b(self.b3a(x)) + return torch.cat([b0,b1,b2,b3], dim=1) + + +class I3D(nn.Module): + """Inception-v1 I3D architecture. + The model is introduced in: + Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset + Joao Carreira, Andrew Zisserman + https://arxiv.org/pdf/1705.07750v1.pdf. + See also the Inception architecture, introduced in: + Going deeper with convolutions + Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, + Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. + http://arxiv.org/pdf/1409.4842v1.pdf. + """ + + # Endpoints of the model in order. During construction, all the endpoints up + # to a designated `final_endpoint` are returned in a dictionary as the + # second return value. 
+ VALID_ENDPOINTS = ( + 'Conv3d_1a_7x7', + 'MaxPool3d_2a_3x3', + 'Conv3d_2b_1x1', + 'Conv3d_2c_3x3', + 'MaxPool3d_3a_3x3', + 'Mixed_3b', + 'Mixed_3c', + 'MaxPool3d_4a_3x3', + 'Mixed_4b', + 'Mixed_4c', + 'Mixed_4d', + 'Mixed_4e', + 'Mixed_4f', + 'MaxPool3d_5a_2x2', + 'Mixed_5b', + 'Mixed_5c', + 'Logits', + 'Predictions', + ) + + def __init__(self, spatial_squeeze=True, + final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5, **kwargs): + """Initializes I3D model instance. + Args: + num_classes: The number of outputs in the logit layer (default 400, which + matches the Kinetics dataset). + spatial_squeeze: Whether to squeeze the spatial dimensions for the logits + before returning (default True). + final_endpoint: The model contains many possible endpoints. + `final_endpoint` specifies the last endpoint for the model to be built + up to. In addition to the output at `final_endpoint`, all the outputs + at endpoints up to `final_endpoint` will also be returned, in a + dictionary. `final_endpoint` must be one of + InceptionI3d.VALID_ENDPOINTS (default 'Logits'). + name: A string (optional). The name of this module. + Raises: + ValueError: if `final_endpoint` is not recognized. + """ + + if final_endpoint not in self.VALID_ENDPOINTS: + raise ValueError('Unknown final endpoint %s' % final_endpoint) + + super(I3D, self).__init__() + self._num_classes = kwargs['labels'] + self._spatial_squeeze = spatial_squeeze + self._final_endpoint = final_endpoint + self.logits = None + + self.train_transforms = PreprocessTrain(**kwargs) + self.test_transforms = PreprocessEval(**kwargs) + + + if self._final_endpoint not in self.VALID_ENDPOINTS: + raise ValueError('Unknown final endpoint %s' % self._final_endpoint) + + self.end_points = {} + end_point = 'Conv3d_1a_7x7' + self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], + stride=(2, 2, 2), padding=(3,3,3), name=name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'MaxPool3d_2a_3x3' + self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), + padding=0) + if self._final_endpoint == end_point: return + + end_point = 'Conv3d_2b_1x1' + self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, + name=name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Conv3d_2c_3x3' + self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, + name=name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'MaxPool3d_3a_3x3' + self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), + padding=0) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_3b' + self.end_points[end_point] = InceptionModule(192, [64,96,128,16,32,32], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_3c' + self.end_points[end_point] = InceptionModule(256, [128,128,192,32,96,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'MaxPool3d_4a_3x3' + self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), + padding=0) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4b' + self.end_points[end_point] = InceptionModule(128+192+96+64, [192,96,208,16,48,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4c' + 
self.end_points[end_point] = InceptionModule(192+208+48+64, [160,112,224,24,64,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4d' + self.end_points[end_point] = InceptionModule(160+224+64+64, [128,128,256,24,64,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4e' + self.end_points[end_point] = InceptionModule(128+256+64+64, [112,144,288,32,64,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4f' + self.end_points[end_point] = InceptionModule(112+288+64+64, [256,160,320,32,128,128], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'MaxPool3d_5a_2x2' + self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2), + padding=0) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_5b' + self.end_points[end_point] = InceptionModule(256+320+128+128, [256,160,320,32,128,128], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_5c' + self.end_points[end_point] = InceptionModule(256+320+128+128, [384,192,384,48,128,128], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Logits' + self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], + stride=(1, 1, 1)) + self.dropout = nn.Dropout(dropout_keep_prob) + self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes, + kernel_shape=[1, 1, 1], + padding=0, + activation_fn=None, + use_batch_norm=False, + use_bias=True, + name='logits') + + + + + + self.build() + + if 'pretrained' in kwargs.keys() and kwargs['pretrained']: + if 'i3d_pretrained' in kwargs.keys(): + self._load_checkpoint(kwargs['i3d_pretrained']) + else: + self._load_pretrained_weights() + + def _load_pretrained_weights(self): + p_dict = torch.load('weights/i3d_rgb_imagenet.pt') + s_dict = self.state_dict() + for name in p_dict: + if name in s_dict.keys(): + if p_dict[name].shape == s_dict[name].shape: + s_dict[name] = p_dict[name] + + self.load_state_dict(s_dict) + + def _load_checkpoint(self, saved_weights): + p_dict = torch.load(saved_weights)['state_dict'] + s_dict = self.state_dict() + for name in p_dict: + if name in s_dict.keys(): + if p_dict[name].shape == s_dict[name].shape: + s_dict[name] = p_dict[name] + + self.load_state_dict(s_dict) + + + + def replace_logits(self, num_classes): + self._num_classes = num_classes + self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes, + kernel_shape=[1, 1, 1], + padding=0, + activation_fn=None, + use_batch_norm=False, + use_bias=True, + name='logits') + + + def build(self): + for k in self.end_points.keys(): + self.add_module(k, self.end_points[k]) + + def forward(self, x): + for end_point in self.VALID_ENDPOINTS: + if end_point in self.end_points: + x = self._modules[end_point](x) # use _modules to work with dataparallel + + x = self.logits(self.dropout(self.avg_pool(x))) + + if self._spatial_squeeze: + logits = x.squeeze(3).squeeze(3) + # logits is batch X classes X time, which is what we want to work with + + logits = torch.mean(logits, dim=2) + return logits + + + def extract_features(self, x): + for end_point in self.VALID_ENDPOINTS: + if end_point in self.end_points: + x = self._modules[end_point](x) + return self.avg_pool(x) + +class PreprocessTrain(object): + """ + Container for all transforms used to preprocess clips for training in this dataset. 
+ """ + def __init__(self, **kwargs): + """ + Initialize preprocessing class for training set + Args: + preprocess (String): Keyword to select different preprocessing types + crop_type (String): Select random or central crop + + Return: + None + """ + + self.transforms = [] + self.transforms1 = [] + self.preprocess = kwargs['preprocess'] + crop_type = kwargs['crop_type'] + + + self.transforms.append(pt.ResizeClip(**kwargs)) + + if crop_type == 'Random': + self.transforms.append(pt.RandomCropClip(**kwargs)) + + else: + self.transforms.append(pt.CenterCropClip(**kwargs)) + + self.transforms.append(pt.SubtractRGBMean(**kwargs)) + self.transforms.append(pt.RandomFlipClip(direction='h', p=0.5, **kwargs)) + self.transforms.append(pt.ToTensorClip(**kwargs)) + + def __call__(self, input_data): + for transform in self.transforms: + input_data = transform(input_data) + + return input_data + + +class PreprocessEval(object): + """ + Container for all transforms used to preprocess clips for training in this dataset. + """ + def __init__(self, **kwargs): + """ + Initialize preprocessing class for training set + Args: + preprocess (String): Keyword to select different preprocessing types + crop_type (String): Select random or central crop + + Return: + None + """ + + self.transforms = [] + + self.transforms.append(pt.ResizeClip(**kwargs)) + self.transforms.append(pt.CenterCropClip(**kwargs)) + self.transforms.append(pt.SubtractRGBMean(**kwargs)) + self.transforms.append(pt.ToTensorClip(**kwargs)) + + + def __call__(self, input_data): + for transform in self.transforms: + input_data = transform(input_data) + + return input_data diff --git a/weights/download_weights.sh b/weights/download_weights.sh index 3cbfec6..aef001d 100755 --- a/weights/download_weights.sh +++ b/weights/download_weights.sh @@ -13,3 +13,9 @@ wget -O ./weights/c3d-pretrained.pth https://umich.box.com/shared/static/znmyt8u #C3D Mean wget -O ./weights/sport1m_train16_128_mean.npy https://umich.box.com/shared/static/ppbnldsa5rty615osdjh2yi8fqcx0a3b.npy + +#I3D pretrained on ImageNet and then Kinetics by original authors +wget -O ./weights/i3d_rgb_imagenet.pt https://umich.box.com/shared/static/5m6dwwepzdcw3kjhx7s0peb59lbcde0s.pt + +#I3D pretrained on ImageNet, Kinetics, then on HMDB51 in ViP +wget -O ./weights/i3d_rgb_imagenet_then_HMDB51_30epochs.pkl https://umich.box.com/shared/static/x8x83sw4htidxsxgtus9nt00f383mmm7.pkl From 58b32a7e66c15a5cbd8acf128aebcbae252c63a2 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Thu, 24 Oct 2019 20:30:11 -0400 Subject: [PATCH 49/55] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3231468..fb1661f 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) |[ImageNetVID](http://bvisionweb1.cs.unc.edu/ilsvrc2015/download-videos-3j16.php) | Video Object Detection | |[MSCOCO 2014](http://cocodataset.org/#download) | Object Detection, Keypoints| |[VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/) | Object Detection, Classification| +|[DHF1K](https://github.com/wenguanwang/DHF1K) | Video Saliency Prediction| ## Models | Model | Task(s) | From f017c00da6d24798803a46aebc450b04c5196cee Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 25 Oct 2019 08:38:33 -0400 Subject: [PATCH 50/55] Complete DHF1K test --- .gitignore | 1 + datasets/DHF1K.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index c70c4d2..0e4172d 
100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ runs/* models/HGC3D *.json pbs/* +weights/* diff --git a/datasets/DHF1K.py b/datasets/DHF1K.py index 467bb15..01e05f7 100644 --- a/datasets/DHF1K.py +++ b/datasets/DHF1K.py @@ -93,13 +93,21 @@ def __call__(self, x): class debug_model(): def __init__(self): self.train_transforms = tts() - json_path = '/path/to/json' + + + json_path = '/path/to/DHF1K' #### Change this when testing #### + + dataset = DHF1K(model_obj=debug_model(), json_path=json_path, load_type='train', clip_length=16, clip_offset=0, clip_stride=1, num_clips=0, random_offset=0, resize_shape=0, crop_shape=0, crop_type='Center', final_shape=0, batch_size=1) train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=1, shuffle=False) import matplotlib.pyplot as plt for x in enumerate(train_loader): - plt.imshow(x[1]['data'][0,:,0].permute(1,2,0).numpy()) - #plt.show() + dat = x[1]['data'][0,:,0].permute(1,2,0).numpy() + bin = x[1]['annots']['bin'][0,:,0].permute(1,2,0).numpy().repeat(3,axis=2) + map = x[1]['annots']['map'][0,:,0].permute(1,2,0).numpy().repeat(3, axis=2) + img = np.concatenate([dat,bin,map], axis=0) + plt.imshow(img) + plt.show() import pdb; pdb.set_trace() From 92ad95b3c9cdc64b267e3ee736921dbdacd04dcf Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 25 Oct 2019 08:52:06 -0400 Subject: [PATCH 51/55] Update README.md Add citation and link to arXiv paper --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3231468..861885a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Video Platform for Recognition and Detection in Pytorch +# [Video Platform for Recognition and Detection in Pytorch](https://arxiv.org/abs/1910.02793) A platform for quick and easy development of deep learning networks for recognition and detection in videos. Includes popular models like C3D and SSD. 
@@ -16,6 +16,22 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) | Model Architecture | Dataset | ViP Accuracy (%) | |:--------------------:|:------------------:|:---------------------:| | SSD300 | VOC2007 | 76.58 | + +## Citation + +Please cite ViP when releasing any work that used this platform: https://arxiv.org/abs/1910.02793 + +``` +@article{ganesh2019vip, + title={ViP: Video Platform for PyTorch}, + author={Ganesh, Madan Ravi and Hofesmann, Eric and Louis, Nathan and Corso, Jason}, + journal={arXiv preprint arXiv:1910.02793}, + year={2019} +} + +``` + + ## Table of Contents * [Datasets](#configured-datasets) From d9b926af18f24d521bf579b726eb989d4556f2ae Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 25 Oct 2019 16:13:22 -0400 Subject: [PATCH 52/55] Ignore .pt files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c70c4d2..5dbdb9f 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ runs/* models/HGC3D *.json pbs/* +*.pt From ba7c110fb399474505ce7160d54f707c8ceddced Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 25 Oct 2019 16:14:31 -0400 Subject: [PATCH 53/55] Ignore .pt files --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0e4172d..5dbdb9f 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,4 @@ runs/* models/HGC3D *.json pbs/* -weights/* +*.pt From 8e91e396307cdcb9bc6e9661aed0bf9dabb75442 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 25 Oct 2019 16:16:14 -0400 Subject: [PATCH 54/55] remove .pt --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0e4172d..5dbdb9f 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,4 @@ runs/* models/HGC3D *.json pbs/* -weights/* +*.pt From 0870ee5da8f84df8c6e32ba4279bf5f4db66b5c0 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 25 Oct 2019 16:42:38 -0400 Subject: [PATCH 55/55] Update requirements.txt Newer version of pillow --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5fa42ed..519b704 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy==1.17.0 opencv-python==4.1.0.25 -Pillow==6.1.0 +pillow>=6.2.0 protobuf==3.9.0 PyYAML==5.1.1 scipy==1.3.0
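A few reference notes on the code introduced above.

`gen_json_DHF1K.py` (PATCH 47/55) writes one `<split>.json` file per split into `output_path`, with one entry per video: a `base_path`, a `frame_size` stored as `[width, height]`, and a `frames` list whose items hold the frame file name in `img_path` plus `map_path` (under the annotation `maps` directory) and `bin_path` (under `fixation`), both left as empty strings for the unannotated test split. A minimal sketch that inspects one entry, reusing the placeholder `output_path` value from the script:

```
import json
import os

output_path = '/any/path/'  # placeholder value, as in gen_json_DHF1K.py

with open(os.path.join(output_path, 'train.json')) as f:
    videos = json.load(f)

vid = videos[0]
print(vid['base_path'])    # e.g. /path/to/DHF1K/video_png/001
print(vid['frame_size'])   # [width, height], read from the first frame
print(len(vid['frames']))  # number of frames in the video

frame = vid['frames'][0]
print(frame['img_path'])   # image file name, relative to base_path
print(frame['map_path'])   # saliency map path ('' in test.json)
print(frame['bin_path'])   # fixation map path ('' in test.json)
```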
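`MaxPool3dSamePadding` and `Unit3D` in `i3d.py` (PATCH 48/55) recompute TensorFlow-style 'same' padding from the input size on every forward pass: for each of the T, H, W dimensions the total pad is `max(kernel - stride, 0)` when the size is a multiple of the stride and `max(kernel - size % stride, 0)` otherwise, split so that any odd element goes on the trailing side. A small standalone sketch of that rule:

```
import math

def same_pad_1d(size, kernel, stride):
    # Mirrors compute_pad() in MaxPool3dSamePadding and Unit3D for one dimension.
    if size % stride == 0:
        total = max(kernel - stride, 0)
    else:
        total = max(kernel - size % stride, 0)
    front = total // 2
    return front, total - front

# A temporal dimension of 64 with a kernel of 3 and a stride of 2:
front, back = same_pad_1d(64, kernel=3, stride=2)
print(front, back)                       # 0 1
print((64 + front + back - 3) // 2 + 1)  # 32, i.e. ceil(64 / 2)
print(math.ceil(64 / 2))                 # 32
```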
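`I3D._load_pretrained_weights` and `I3D._load_checkpoint` copy a checkpoint tensor into the model only when both the parameter name and the tensor shape match, so entries such as a Kinetics logits layer with a different number of classes are skipped rather than raising a size-mismatch error. A standalone sketch of the same pattern; the model object and checkpoint path below are placeholders:

```
import torch

def load_matching_weights(model, checkpoint_path):
    # Copy only tensors whose name and shape match the current model,
    # following I3D._load_pretrained_weights / I3D._load_checkpoint.
    loaded = torch.load(checkpoint_path, map_location='cpu')
    p_dict = loaded['state_dict'] if 'state_dict' in loaded else loaded
    s_dict = model.state_dict()
    for name, tensor in p_dict.items():
        if name in s_dict and tensor.shape == s_dict[name].shape:
            s_dict[name] = tensor
    model.load_state_dict(s_dict)

# e.g. load_matching_weights(model, 'weights/i3d_rgb_imagenet.pt')
```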