diff --git a/.gitignore b/.gitignore index c70c4d2..5dbdb9f 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ runs/* models/HGC3D *.json pbs/* +*.pt diff --git a/README.md b/README.md index 3231468..6b762ed 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Video Platform for Recognition and Detection in Pytorch +# [Video Platform for Recognition and Detection in Pytorch](https://arxiv.org/abs/1910.02793) A platform for quick and easy development of deep learning networks for recognition and detection in videos. Includes popular models like C3D and SSD. @@ -9,6 +9,7 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) ### Recognition | Model Architecture | Dataset | ViP Accuracy (%) | |:--------------------:|:------------------:|:---------------------:| +| I3D | HMDB51 (Split 1) | 72.75 | | C3D | HMDB51 (Split 1) | 50.14 ± 0.777 | | C3D | UCF101 (Split 1) | 80.40 ± 0.399 | @@ -16,6 +17,31 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) | Model Architecture | Dataset | ViP Accuracy (%) | |:--------------------:|:------------------:|:---------------------:| | SSD300 | VOC2007 | 76.58 | + +### Video Object Grounding +| Model Architecture | Dataset | ViP Accuracy (%) | +|:--------------------:|:------------------:|:---------------------:| +| DVSA (+fw, obj) | YC2-BB (Validation) | 30.09 | + +**fw**: framewise weighting, **obj**: object interaction + + +## Citation + +Please cite ViP when releasing any work that used this platform: https://arxiv.org/abs/1910.02793 + +``` +@article{ganesh2019vip, + title={ViP: Video Platform for PyTorch}, + author={Ganesh, Madan Ravi and Hofesmann, Eric and Louis, Nathan and Corso, Jason}, + journal={arXiv preprint arXiv:1910.02793}, + year={2019} +} + +``` + + + ## Table of Contents * [Datasets](#configured-datasets) @@ -38,12 +64,16 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) |[ImageNetVID](http://bvisionweb1.cs.unc.edu/ilsvrc2015/download-videos-3j16.php) | Video Object Detection | |[MSCOCO 2014](http://cocodataset.org/#download) | Object Detection, Keypoints| |[VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/) | Object Detection, Classification| +|[YC2-BB](http://youcook2.eecs.umich.edu/download)| Video Object Grounding| +|[DHF1K](https://github.com/wenguanwang/DHF1K) | Video Saliency Prediction| ## Models | Model | Task(s) | |:------------------------------------------------:|:--------------------:| |[C3D](https://github.com/jfzhang95/pytorch-video-recognition/blob/master/network/C3D_model.py) | Activity Recognition | +|[I3D](https://github.com/piergiaj/pytorch-i3d) | Activity Recognition | |[SSD300](https://github.com/amdegroot/ssd.pytorch) | Object Detection | +|[DVSA (+fw, obj)](https://github.com/MichiganCOG/Video-Grounding-from-Text)| Video Object Grounding| ## Requirements diff --git a/config_default_example.yaml b/config_default_example.yaml index 6094268..a523504 100644 --- a/config_default_example.yaml +++ b/config_default_example.yaml @@ -1,25 +1,24 @@ # Preprocessing clip_length: 16 # Number of frames within a clip clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) -clip_stride: 0 # Frame offset between successive frames +clip_stride: 1 # Frame offset between successive clips, must be >= 1 crop_shape: [112,112] # (Height, Width) of frame crop_type: Random # Type of cropping operation (Random, Central and None) final_shape: [112,112] # (Height, Width) of input to be given to CNN num_clips: -1 # Number clips to be generated from a video (<0: 
uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips) random_offset: 0 # Boolean switch to generate a clip length sized clip from a video resize_shape: [128,171] # (Height, Width) to resize original data -sample_duration: 16 # Temporal size of video to be provided as input to the model -sample_size: 112 # Height of frame to be provided as input to the model subtract_mean: '' # Subtract mean (R,G,B) from all frames during preprocessing # Experiment Setup acc_metric: Accuracy # Accuracy metric -batch_size: 3 # Numbers of videos in a mini-batch +batch_size: 15 # Numbers of videos in a mini-batch dataset: HMDB51 # Name of dataset debug: 0 # If True, do not plot, save, or create data files epoch: 30 # Total number of epochs exp: exp # Experiment name gamma: 0.1 # Multiplier with which to change learning rate +grad_max_norm: 0 # Norm for gradient clipping json_path: /z/dat/HMDB51/ # Path to the json file for the given dataset labels: 51 # Number of total classes in the dataset load_type: train # Environment selection, to include only training/training and validation/testing dataset @@ -37,3 +36,4 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay +resume: 0 # Flag to resume training or switch to alternate objective after loading diff --git a/datasets/DHF1K.py b/datasets/DHF1K.py new file mode 100644 index 0000000..01e05f7 --- /dev/null +++ b/datasets/DHF1K.py @@ -0,0 +1,113 @@ +import torch +try: + from .abstract_datasets import DetectionDataset +except: + from abstract_datasets import DetectionDataset +import cv2 +import os +import numpy as np +import json +try: + import datasets.preprocessing_transforms as pt +except: + import preprocessing_transforms as pt + +class DHF1K(DetectionDataset): + def __init__(self, *args, **kwargs): + super(DHF1K, self).__init__(*args, **kwargs) + + # Get model object in case preprocessing other than default is used + self.model_object = kwargs['model_obj'] + self.load_type = kwargs['load_type'] + + print(self.load_type) + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + + else: + self.transforms = kwargs['model_obj'].test_transforms + + + + + def __getitem__(self, idx): + vid_info = self.samples[idx] + + + base_path = vid_info['base_path'] + vid_size = vid_info['frame_size'] + + input_data = [] + map_data = [] + bin_data = [] + + for frame_ind in range(len(vid_info['frames'])): + frame = vid_info['frames'][frame_ind] + frame_path = frame['img_path'] + map_path = frame['map_path'] + bin_path = frame['bin_path'] + + # Load frame, convert to RGB from BGR and normalize from 0 to 1 + input_data.append(cv2.imread(os.path.join(base_path, frame_path))[...,::-1]/255.) + + # Load frame, Normalize from 0 to 1 + # All frame channels have repeated values + map_data.append(cv2.imread(map_path)/255.) + bin_data.append(cv2.imread(bin_path)/255.) 
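+        # Rough shape bookkeeping (assuming T frames of size H x W are loaded):
+        #   input_data          - list of T RGB arrays, H x W x 3, scaled to [0, 1]
+        #   map_data / bin_data - lists of T arrays, H x W x 3, with identical values in every channel
+        # After the transforms and permutes below, vid_data ends up as (3, T, H', W'), while the
+        # saliency map and fixation annotations stay at their original resolution as (1, T, H, W)
+        # tensors; any resizing of the ground truth is deferred to the loss/metric.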
+ + + + vid_data = self.transforms(input_data) + + # Annotations must be resized in the loss/metric + map_data = torch.Tensor(map_data) + bin_data = torch.Tensor(bin_data) + + # Permute the PIL dimensions (Frame, Height, Width, Chan) to pytorch (Chan, frame, height, width) + vid_data = vid_data.permute(3, 0, 1, 2) + map_data = map_data.permute(3, 0, 1, 2) + bin_data = bin_data.permute(3, 0, 1, 2) + # All channels are repeated so remove the unnecessary channels + map_data = map_data[0].unsqueeze(0) + bin_data = bin_data[0].unsqueeze(0) + + + ret_dict = dict() + ret_dict['data'] = vid_data + + annot_dict = dict() + annot_dict['map'] = map_data + annot_dict['bin'] = bin_data + annot_dict['input_shape'] = vid_data.size() + annot_dict['name'] = base_path + ret_dict['annots'] = annot_dict + + return ret_dict + + +if __name__=='__main__': + + class tts(): + def __call__(self, x): + return pt.ToTensorClip()(x) + class debug_model(): + def __init__(self): + self.train_transforms = tts() + + + json_path = '/path/to/DHF1K' #### Change this when testing #### + + + dataset = DHF1K(model_obj=debug_model(), json_path=json_path, load_type='train', clip_length=16, clip_offset=0, clip_stride=1, num_clips=0, random_offset=0, resize_shape=0, crop_shape=0, crop_type='Center', final_shape=0, batch_size=1) + train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=1, shuffle=False) + + + import matplotlib.pyplot as plt + for x in enumerate(train_loader): + dat = x[1]['data'][0,:,0].permute(1,2,0).numpy() + bin = x[1]['annots']['bin'][0,:,0].permute(1,2,0).numpy().repeat(3,axis=2) + map = x[1]['annots']['map'][0,:,0].permute(1,2,0).numpy().repeat(3, axis=2) + img = np.concatenate([dat,bin,map], axis=0) + plt.imshow(img) + plt.show() + import pdb; pdb.set_trace() diff --git a/datasets/HMDB51.py b/datasets/HMDB51.py index 3d93bc3..6eb83a9 100644 --- a/datasets/HMDB51.py +++ b/datasets/HMDB51.py @@ -40,8 +40,9 @@ def __getitem__(self, idx): base_path = vid_info['base_path'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - labels = np.zeros((self.clip_length))-1 + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + labels = np.zeros((vid_length))-1 input_data = [] for frame_ind in range(len(vid_info['frames'])): diff --git a/datasets/ImageNetVID.py b/datasets/ImageNetVID.py index 1965c8c..ea180dc 100644 --- a/datasets/ImageNetVID.py +++ b/datasets/ImageNetVID.py @@ -42,10 +42,12 @@ def __getitem__(self, idx): vid_size = vid_info['frame_size'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1 - labels = np.zeros((self.clip_length, self.max_objects))-1 - occlusions = np.zeros((self.clip_length, self.max_objects))-1 + + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((vid_length, self.max_objects, 4))-1 + labels = np.zeros((vid_length, self.max_objects))-1 + occlusions = np.zeros((vid_length, self.max_objects))-1 diff --git a/datasets/KTH.py b/datasets/KTH.py new file mode 100644 index 0000000..41c14a0 --- /dev/null +++ b/datasets/KTH.py @@ -0,0 +1,79 @@ +import torch +from .abstract_datasets import RecognitionDataset +from PIL import Image +import cv2 +import os +import numpy as np +from torchvision import transforms + +class KTH(RecognitionDataset): + def 
__init__(self, *args, **kwargs): + """ + Initialize KTH class + Args: + load_type (String): Select training or testing set + resize_shape (Int): [Int, Int] Array indicating desired height and width to resize input + crop_shape (Int): [Int, Int] Array indicating desired height and width to crop input + final_shape (Int): [Int, Int] Array indicating desired height and width of input to deep network + preprocess (String): Keyword to select different preprocessing types + + Return: + None + """ + super(KTH, self).__init__(*args, **kwargs) + + self.load_type = kwargs['load_type'] + self.resize_shape = kwargs['resize_shape'] + self.crop_shape = kwargs['crop_shape'] + self.final_shape = kwargs['final_shape'] + self.preprocess = kwargs['preprocess'] + + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + + else: + self.transforms = kwargs['model_obj'].test_transforms + + + def __getitem__(self, idx): + vid_info = self.samples[idx] + base_path = vid_info['base_path'] + + input_data = [] + + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + labels = np.zeros((vid_length))-1 + input_data = [] + + for frame_ind in range(len(vid_info['frames'])): + frame_path = os.path.join(base_path, vid_info['frames'][frame_ind]['img_path']) + + for frame_labels in vid_info['frames'][frame_ind]['actions']: + labels[frame_ind] = frame_labels['action_class'] + + # Load frame image data and preprocess image accordingly + input_data.append(cv2.imread(frame_path)[...,::-1]/1.) + + + # Preprocess data + vid_data = self.transforms(input_data) + labels = torch.from_numpy(labels).float() + + # Permute the PIL dimensions (Frame, Height, Width, Chan) to pytorch (Chan, frame, height, width) + vid_data = vid_data.permute(3, 0, 1, 2) + + ret_dict = dict() + ret_dict['data'] = vid_data + + annot_dict = dict() + annot_dict['labels'] = labels + + ret_dict['annots'] = annot_dict + + return ret_dict + + +#dataset = HMDB51(json_path='/z/dat/HMDB51', dataset_type='train', clip_length=100, num_clips=0) +#dat = dataset.__getitem__(0) +#import pdb; pdb.set_trace() diff --git a/datasets/MSCOCO.py b/datasets/MSCOCO.py index 5d9868f..7b6f6d4 100644 --- a/datasets/MSCOCO.py +++ b/datasets/MSCOCO.py @@ -1,6 +1,6 @@ import torch from .abstract_datasets import DetectionDataset -from PIL import Image +import cv2 import os import numpy as np import datasets.preprocessing_transforms as pt @@ -34,10 +34,11 @@ def __getitem__(self, idx): vid_size = vid_info['frame_size'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1 - labels = np.zeros((self.clip_length, self.max_objects))-1 - iscrowds = np.zeros((self.clip_length, self.max_objects))-1 + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((vid_length, self.max_objects, 4))-1 + labels = np.zeros((vid_length, self.max_objects))-1 + iscrowds = np.zeros((vid_length, self.max_objects))-1 @@ -62,7 +63,7 @@ def __getitem__(self, idx): iscrowds[frame_ind, trackid] = iscrowd - input_data.append(Image.open(os.path.join(base_path, frame_path))) + input_data.append(cv2.imread(os.path.join(base_path, frame_path))[...,::-1]) vid_data, bbox_data = self.transforms(input_data, bbox_data) diff --git a/datasets/Manual_Hands.py b/datasets/Manual_Hands.py new file mode 100644 index 0000000..21916ef --- 
/dev/null +++ b/datasets/Manual_Hands.py @@ -0,0 +1,127 @@ +import torch +import torchvision +from .abstract_datasets import DetectionDataset +import cv2 +import os +import numpy as np +import json + +class Manual_Hands(DetectionDataset): + """ + Manually-annotated keypoints on hands for pose estimation. + Includes images from The MPII Human Pose and New Zealand Sign Language (NZSL) datasets + + Source: https://arxiv.org/1704.07809 + """ + def __init__(self, *args, **kwargs): + super(Manual_Hands, self).__init__(*args, **kwargs) + + self.load_type = kwargs['load_type'] + self.json_path = kwargs['json_path'] + + # Maximum number of annotated object present in a single frame in entire dataset + # Dictates the return size of annotations in __getitem__ + self.max_objects = 1 + self.sigma = 3.0 + self.stride = 8 #effective stride of the entire network + + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + + else: + self.transforms = kwargs['model_obj'].test_transforms + + #Adapted from: https://github.com/namedBen/Convolutional-Pose-Machines-Pytorch + def gaussian_kernel(self, size_w, size_h, center_x, center_y, sigma): + #Outputs a gaussian heat map on defined point + gridy, gridx = torch.meshgrid(torch.arange(0,size_h,dtype=torch.float), torch.arange(0,size_w,dtype=torch.float)) + D2 = (gridx - center_x)**2 + (gridy - center_y)**2 + + return torch.exp(-0.5 * D2 / sigma**2) + + def __getitem__(self, idx): + vid_info = self.samples[idx] + + base_path = vid_info['base_path'] + vid_size = vid_info['frame_size'] + + input_data = [] + + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((vid_length, self.max_objects, 4))-1 + hand_pts_data = np.zeros((vid_length, self.max_objects, 21, 3))-1 + labels = np.zeros((vid_length, self.max_objects))-1 + occlusions = np.zeros((vid_length, self.max_objects, 22), dtype=np.int32)-1 #21 keypoints + background = 22 points + + for frame_ind in range(len(vid_info['frames'])): + frame = vid_info['frames'][frame_ind] + width, height = vid_info['frame_size'] + frame_path = frame['img_path'] + + # Extract bbox and label data from video info + for obj in frame['objs']: + #trackid = obj['trackid'] #Let's ignore trackid for now, only one annotation per image + trackid = 0 + label = 1 if obj['c'] == 'left' else 0 #1: left hand, 0: right hand + occluded = obj['occ'] + obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] + body_pts = obj['body_pts'] #16 points (x,y,valid) + hand_pts = obj['hand_pts'] #21 points (x,y,valid) + head_box = obj['head_box'] + head_size = obj['head_size'] #max dim of tightest box around head + hand_ctr = obj['hand_ctr'] + mpii = obj['mpii'] + + #During training square patch is 2.2*B where B is max(obj_bbox) + if self.load_type == 'train': + B = max(obj_bbox[2]-obj_bbox[0], obj_bbox[3]-obj_bbox[1]) + else: #During testing B is 0.7*head_size + B = 0.7*head_size + + hand_size = 2.2*B + xtl = np.clip(int(hand_ctr[0]-hand_size/2), 0, width) + ytl = np.clip(int(hand_ctr[1]-hand_size/2), 0, height) + xbr = np.clip(int(hand_ctr[0]+hand_size/2), 0, width) + ybr = np.clip(int(hand_ctr[1]+hand_size/2), 0, height) + + hand_crop = [xtl, ytl, xbr, ybr] + bbox_data[frame_ind, trackid, :] = obj_bbox + labels[frame_ind, trackid] = label + hand_pts_data[frame_ind, trackid, :] = hand_pts + occlusions[frame_ind, trackid] = occluded + [0] #Add element for background + + # Load frame, convert to RGB from BGR and normalize from 0 to 1 + 
input_data.append(cv2.imread(os.path.join(base_path, frame_path))[...,::-1]) + + #Crop hand and resize, perform same transforms to ground truth keypoints + vid_data, hand_pts_coords = self.transforms(input_data, hand_pts_data[:,:,:,:2], hand_crop, labels) + + h_width = int(self.final_shape[1]/self.stride) + h_height = int(self.final_shape[0]/self.stride) + heatmaps = torch.zeros((22, h_width, h_height), dtype=torch.float) #heatmaps for 21 keypoints + background + for i,pts in enumerate(hand_pts_coords[0][0]): + x = pts[0] / self.stride + y = pts[1] / self.stride + heatmaps[i,:,:] = self.gaussian_kernel(h_width, h_height, x, y, self.sigma) + + heatmaps[-1,:,:] = 1 - torch.max(heatmaps[:-1,:,:], dim=0)[0] #Last layer is background + + vid_data = vid_data/255 + # Permute the PIL dimensions (Frame, Height, Width, Chan) to pytorch (Chan, frame, height, width) + vid_data = vid_data.permute(3, 0, 1, 2) + vid_data = vid_data.squeeze(1) #Remove frame dimension, b/c this is an image dataset + + ret_dict = dict() + ret_dict['data'] = vid_data + annot_dict = dict() + annot_dict['head_size'] = head_size + annot_dict['hand_pts'] = hand_pts_coords + annot_dict['heatmaps'] = heatmaps + annot_dict['labels'] = labels + annot_dict['occ'] = occlusions + annot_dict['frame_path'] = frame_path + annot_dict['frame_size'] = vid_size #width, height + ret_dict['annots'] = annot_dict + + return ret_dict diff --git a/datasets/UCF101.py b/datasets/UCF101.py index 28ef78d..40c8b58 100644 --- a/datasets/UCF101.py +++ b/datasets/UCF101.py @@ -41,8 +41,9 @@ def __getitem__(self, idx): base_path = vid_info['base_path'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - labels = np.zeros((self.clip_length))-1 + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + labels = np.zeros((vid_length))-1 input_data = [] for frame_ind in range(len(vid_info['frames'])): diff --git a/datasets/VOC2007.py b/datasets/VOC2007.py index d4764d8..4be3f30 100644 --- a/datasets/VOC2007.py +++ b/datasets/VOC2007.py @@ -1,6 +1,5 @@ import torch from .abstract_datasets import DetectionDataset -from PIL import Image import cv2 import os import numpy as np @@ -53,10 +52,11 @@ def __getitem__(self, idx): vid_size = vid_info['frame_size'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1 - labels = np.zeros((self.clip_length, self.max_objects))-1 - diff_labels = np.zeros((self.clip_length, self.max_objects)) #difficult object labels + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((vid_length, self.max_objects, 4))-1 + labels = np.zeros((vid_length, self.max_objects))-1 + diff_labels = np.zeros((vid_length, self.max_objects)) #difficult object labels for frame_ind in range(len(vid_info['frames'])): frame = vid_info['frames'][frame_ind] diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py new file mode 100644 index 0000000..10266ca --- /dev/null +++ b/datasets/YC2BB.py @@ -0,0 +1,315 @@ +#Adapted from: https://github.com/MichiganCOG/Video-Grounding-from-Text +import torch +from .abstract_datasets import DetectionDataset +from PIL import Image +import cv2 +import os +import csv +import numpy as np + +import torchtext + +class YC2BB(DetectionDataset): + ''' + YouCook2-Bounding Boxes dataset. 
Introduced in weakly-supervised video object grounding task + Paper: https://arxiv.org/pdf/1805.02834.pdf + + training: no bounding box annotations, only sentence describing sentence + validation: bounding box annotations and grounded words available + testing: bounding box annotations not publicly available, only grounded words + ''' + def __init__(self, *args, **kwargs): + super(YC2BB, self).__init__(*args, **kwargs) + + #Define the following configuration parameters in your config_*.yaml file + #Or as a system arg + class_file = kwargs['yc2bb_class_file'] + num_proposals = kwargs['yc2bb_num_proposals'] + rpn_proposal_root = kwargs['yc2bb_rpn_proposal_root'] + roi_pooled_feat_root = kwargs['yc2bb_roi_pooled_feat_root'] + self.num_frm = kwargs['yc2bb_num_frm'] + + self.load_type = kwargs['load_type'] + + self.max_objects = 20 + self.num_class = kwargs['labels'] + self.class_dict = _get_class_labels(class_file) + + sentences_proc, segments_tuple = _get_segments_and_sentences(self.samples, self.load_type) + + assert(len(sentences_proc) == len(segments_tuple)) + + #YC2 split names, slightly different + split_to_split = {'train':'training','val':'validation','test':'testing'} + self.yc2_split = split_to_split[self.load_type] + + # read rpn object proposals + self.rpn_dict = {} + self.rpn_chunk = [] + + total_num_proposals = 100 # always load all the proposals we have + rpn_lst_file = os.path.join(rpn_proposal_root, self.yc2_split+'-box-'+str(total_num_proposals)+'.txt') + rpn_chunk_file = os.path.join(rpn_proposal_root, self.yc2_split+'-box-'+str(total_num_proposals)+'.pth') + key_counter = len(self.rpn_dict) + with open(rpn_lst_file) as f: + rpn_lst = f.readline().split(',') + self.rpn_dict.update({r.strip():(i+key_counter) for i,r in enumerate(rpn_lst)}) + + self.rpn_chunk.append(torch.load(rpn_chunk_file)) + + self.rpn_chunk = torch.cat(self.rpn_chunk).cpu() + assert(self.rpn_chunk.size(0) == len(self.rpn_dict)) + assert(self.rpn_chunk.size(2) == 4) + + self.num_proposals = num_proposals + self.roi_pooled_feat_root = roi_pooled_feat_root + + #Extract all dictionary words from each input sentence + #Only for the training set b/c it's un-annotated + self.sample_obj_labels = [] + idx_to_remove = [] + if self.load_type == 'train': + total_seg = len(self.samples) + for idx, sample in enumerate(self.samples): + sentence = sample['frames'][0]['sentence'].split(' ') + obj_label = [] + inc_flag = 0 + for w in sentence: + if self.class_dict.get(w,-1) >= 0: + obj_label.append(self.class_dict[w]) + inc_flag = 1 + + if inc_flag: + self.sample_obj_labels.append(obj_label) + else: + idx_to_remove.append(idx) + + #Remove segments without object from dictionay + self.samples[:] = [s for idx,s in enumerate(self.samples) if idx not in idx_to_remove] + + assert(len(self.samples) == len(self.sample_obj_labels)) + + print('{}/{} valid segments in {} split'.format(len(self.samples), total_seg, self.load_type)) + + ''' + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + else: + self.transforms = kwargs['model_obj'].test_transforms + ''' + + #Reverse-mapping between class index to canonical label name + def _get_class_labels_reverse(self): + return {v:k for k,v in self.class_dict.items()} + + #For the training set, extract positive and negative samples + def sample_rpn_regions(self, x_rpn, idx): + # randomly sample 5 frames from 5 uniform intervals + T = x_rpn.size(1) + itv = T*1./self.num_frm + ind = [min(T-1, int((i+np.random.rand())*itv)) for i in range(self.num_frm)] + 
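+        # The comprehension above is stratified temporal sampling: the T frames are split into
+        # self.num_frm equal-length intervals and one index is drawn uniformly at random from
+        # each interval (e.g. with T=20 and num_frm=5, one index comes from each block of 4
+        # frames). The min(T-1, ...) guards against rounding past the last frame.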
x_rpn = x_rpn[:, ind, :] + + obj_label = self.sample_obj_labels[idx] + + #Generate example + obj_tensor = torch.tensor(obj_label, dtype=torch.long) + obj_tensor = torch.cat((obj_tensor, torch.LongTensor(self.max_objects - len(obj_label)).fill_(self.num_class))) #padding + sample = [x_rpn, obj_tensor] + + return sample + + def __getitem__(self, idx): + vid_info = self.samples[idx] + + base_path = vid_info['base_path'] + width, height = vid_info['frame_size'] + num_frames_1fps = len(vid_info['frames']) + rec = base_path.split('/')[-3] + vid = base_path.split('/')[-2] + seg = base_path.split('/')[-1] + + bbox_data = np.zeros((self.max_objects, num_frames_1fps, 5))-1 #[cls_label, xmin, ymin, xmax ymax] + labels = np.zeros(self.max_objects)-1 + + for frame_ind in range(num_frames_1fps): + frame = vid_info['frames'][frame_ind] + #frame_path = frame['img_path'] + num_objs = len(frame['objs']) + obj_label = np.zeros((num_objs))-1 #List all unique class ids in entire segment + + # Extract bbox and label data from video info + for obj_ind, obj in enumerate(frame['objs']): + label = self.class_dict[obj['c']] + trackid = obj['trackid'] + + if self.load_type == 'test' or self.load_type == 'train': #Annotations for test set not publicly available, train not annotated + bbox_data[trackid, frame_ind] = [label, -1, -1, -1, -1] + else: + if obj['occ'] or obj['outside']: + bbox_data[trackid, frame_ind] = [-1, -1, -1, -1, -1] + else: + obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] + + #re-order to [ymin, xmin, ymax, xmax], rpn proposals are this way I believe + new_order = [1,0,3,2] + obj_bbox = [obj_bbox[i] for i in new_order] + bbox_data[trackid, frame_ind, :] = [label] + obj_bbox + + obj_label[obj_ind] = label + labels[trackid] = label + + #Only keep annotations for valid objects + bbox_data = bbox_data[:num_objs, :] + labels = labels[:num_objs] + + obj_label = torch.from_numpy(obj_label).long() + num_frames = num_frames_1fps * 25 #video sampled at 25 fps + + ''' + if self.vis_output: + image_path = os.path.join(self.image_root, split, rec, vid, seg) + img_notrans = [] + for i in range(num_frames): + img_notrans.append(self.spatial_transform_notrans(self.loader(os.path.join(image_path, '{:04d}.jpg'.format(i+1))))) + img_notrans = torch.stack(img_notrans, dim=1) # 3, T, H, W + else: + # no need to load raw images + img_notrans = torch.zeros(3, num_frames, 1, 1) # dummy + ''' + + # rpn object propoals + rpn = [] + x_rpn = [] + frm=1 + + feat_name = vid+'_'+seg+'.pth' + img_name = vid+'_'+seg+'_'+str(frm).zfill(4)+'.jpg' + x_rpn = torch.load(os.path.join(self.roi_pooled_feat_root, self.yc2_split, feat_name)) + while self.rpn_dict.get(img_name, -1) > -1: + ind = self.rpn_dict[img_name] + rpn.append(self.rpn_chunk[ind]) + frm+=1 + img_name = vid+'_'+seg+'_'+str(frm).zfill(4)+'.jpg' + + rpn = torch.stack(rpn) # number of frames x number of proposals per frame x 4 + rpn = rpn[:, :self.num_proposals, :] + + x_rpn = x_rpn.permute(2,0,1).contiguous() # encoding size x number of frames x number of proposals + x_rpn = x_rpn[:, :, :self.num_proposals] + + rpn_original = rpn-1 # convert to 1-indexed + + # normalize coordidates to 0-1 + # coordinates are 1-indexed: (x_tl, y_tl, x_br, y_br) + rpn[:, :, 0] = (rpn[:, :, 0]-0.5)/width + rpn[:, :, 2] = (rpn[:, :, 2]-0.5)/width + rpn[:, :, 1] = (rpn[:, :, 1]-0.5)/height + rpn[:, :, 3] = (rpn[:, :, 3]-0.5)/height + + assert(torch.max(rpn) <= 1) + + vis_name = '_-_'.join((self.yc2_split, rec, vid, seg)) + + ret_dict = dict() + annot_dict = dict() + + if self.load_type == 
'train': #Training input data is generated differently + #Generate postive example + pos_sample = self.sample_rpn_regions(x_rpn, idx) + + #Sample negative index + total_s = len(self.samples) + neg_index = np.random.randint(total_s) + #Shouldn't include any overlapping object in description + while len(set(obj_label).intersection(set(self.sample_obj_labels[neg_index]))) != 0: + neg_index = np.random.randint(total_s) + + vid_info = self.samples[neg_index] + + base_path = vid_info['base_path'] + width, height = vid_info['frame_size'] + num_frames_1fps = len(vid_info['frames']) + rec = base_path.split('/')[-3] + vid = base_path.split('/')[-2] + seg = base_path.split('/')[-1] + + # rpn object propoals + rpn = [] + x_rpn = [] + frm=1 + + feat_name = vid+'_'+seg+'.pth' + img_name = vid+'_'+seg+'_'+str(frm).zfill(4)+'.jpg' + x_rpn = torch.load(os.path.join(self.roi_pooled_feat_root, self.yc2_split, feat_name)) + while self.rpn_dict.get(img_name, -1) > -1: + ind = self.rpn_dict[img_name] + rpn.append(self.rpn_chunk[ind]) + frm+=1 + img_name = vid+'_'+seg+'_'+str(frm).zfill(4)+'.jpg' + + rpn = torch.stack(rpn) # number of frames x number of proposals per frame x 4 + rpn = rpn[:, :self.num_proposals, :] + + x_rpn = x_rpn.permute(2,0,1).contiguous() # encoding size x number of frames x number of proposals + x_rpn = x_rpn[:, :, :self.num_proposals] + + #Generate negative example + neg_sample = self.sample_rpn_regions(x_rpn, neg_index) + + output = [torch.stack(i) for i in zip(pos_sample, neg_sample)] + output.append(self.load_type) + ret_dict['data'] = output + + else: #Validation or Testing set + ret_dict['data'] = [x_rpn, obj_label, self.load_type] + + annot_dict['box'] = bbox_data + annot_dict['box_label'] = obj_label + annot_dict['rpn'] = rpn + annot_dict['rpn_original'] = rpn_original + annot_dict['vis_name'] = vis_name + annot_dict['class_labels_dict'] = self._get_class_labels_reverse() + + ret_dict['annots'] = annot_dict + + return ret_dict + +def _get_segments_and_sentences(data, split): + # build vocab and tokenized sentences + text_proc = torchtext.data.Field(sequential=True, tokenize='spacy', + lower=True, batch_first=True) + split_sentences = [] + split_segments = [] + + for dat in data: + rec = dat['base_path'].split('/')[-3] + vid = dat['base_path'].split('/')[-2] + seg = dat['base_path'].split('/')[-1] + frame = dat['frames'][0] + segment_labels = [] + if 'sentence' in frame: # for now, training json file only contains full sentence + segment_labels = frame['sentence'] + else: + for obj in frame['objs']: + segment_labels.append(obj['c']) + split_sentences.append(segment_labels) + split_segments.append((split, rec, vid, str(seg).zfill(2))) #tuple of id (split, vid, seg) + + sentences_proc = list(map(text_proc.preprocess, split_sentences)) # build vocab on train and val + + print('{} sentences in {} split'.format(len(sentences_proc), split)) + + return sentences_proc, split_segments + +def _get_class_labels(class_file): + class_dict = {} # both singular form & plural form are associated with the same label + with open(class_file) as f: + cls = csv.reader(f, delimiter=',') + for i, row in enumerate(cls): + for r in range(1, len(row)): + if row[r]: + class_dict[row[r]] = int(row[0]) + + return class_dict + diff --git a/datasets/abstract_datasets.py b/datasets/abstract_datasets.py index 6e7fe2e..a159b25 100644 --- a/datasets/abstract_datasets.py +++ b/datasets/abstract_datasets.py @@ -40,6 +40,9 @@ def __init__(self, *args, **kwargs): self.crop_type = kwargs['crop_type'] self.final_shape = 
kwargs['final_shape'] + #Experiment arguments + self.batch_size = kwargs['batch_size'] + # Creates the self.samples list which will be indexed by each __getitem__ call self._getClips() @@ -68,17 +71,22 @@ def _extractClips(self, video): self.clip_stride: Number of frames between clips when extracting them from videos self.random_offset: Randomly select a clip_length sized clip from a video """ + if self.clip_offset > 0: + if len(video)-self.clip_offset >= self.clip_length: + video = video[self.clip_offset:] + if self.num_clips < 0: if len(video) >= self.clip_length: + # Uniformly sample one clip from the video final_video = [video[_idx] for _idx in np.linspace(0, len(video)-1, self.clip_length, dtype='int32')] final_video = [final_video] else: # Loop if insufficient elements - indices = np.ceil(self.clip_length/float(len(video))) + indices = np.ceil(self.clip_length/float(len(video))) # Number of times to repeat the video to exceed one clip_length indices = indices.astype('int32') - indices = np.tile(np.arange(0, len(video), 1, dtype='int32'), indices) - indices = indices[np.linspace(0, len(indices)-1, self.clip_length, dtype='int32')] + indices = np.tile(np.arange(0, len(video), 1, dtype='int32'), indices) # Repeat the video indices until it exceeds a clip_length + indices = indices[np.linspace(0, len(indices)-1, self.clip_length, dtype='int32')] # Uniformly sample clip_length frames from the looped video final_video = [video[_idx] for _idx in indices] final_video = [final_video] @@ -87,8 +95,9 @@ def _extractClips(self, video): # END IF elif self.num_clips == 0: + # Divide entire video into the max number of clip_length segments if len(video) >= self.clip_length: - indices = np.arange(start=0, stop=len(video), step=self.clip_length) + indices = np.arange(start=0, stop=len(video)-self.clip_length+1, step=self.clip_stride) final_video = [] for _idx in indices: @@ -110,33 +119,47 @@ def _extractClips(self, video): # END IF else: - if self.random_offset: - if len(video) >= self.clip_length: - indices = np.random.choice(np.arange(len(video) - self.clip_length + 1), 1) - indices = indices.astype('int32') - indices = np.arange(indices, indices + self.clip_length).astype('int32') + # num_clips > 0, select exactly num_clips from a video + + if self.clip_length == -1: + # This is a special case where we will return the entire video - final_video = [video[_idx] for _idx in indices] - final_video = [final_video] + # Batch size must equal one or dataloader items may have varying lengths + # and can't be stacked i.e. 
throws an error + assert(self.batch_size == 1) + return [video] - else: - indices = np.ceil(self.clip_length/float(len(video))) - indices = indices.astype('int32') - indices = np.tile(np.arange(0, len(video), 1, dtype='int32'), indices) - index = np.random.choice(np.arange(len(indices) - self.clip_length + 1), 1)[0] - index = index.astype('int32') - indices = indices[index:index + self.clip_length] + required_length = (self.num_clips-1)*(self.clip_stride)+self.clip_length - final_video = [video[_idx] for _idx in indices] - final_video = [final_video] - # END IF + if self.random_offset: + if len(video) >= required_length: + vid_start = np.random.choice(np.arange(len(video) - required_length + 1), 1) + video = video[int(vid_start):] + + if len(video) >= required_length: + # Get indices of sequential clips overlapped by a clip_stride number of frames + indices = np.arange(0, len(video), self.clip_stride) + + # Select only the first num clips + indices = indices.astype('int32')[:self.num_clips] + + video = np.array(video) + final_video = [video[np.arange(_idx, _idx+self.clip_length).astype('int32')].tolist() for _idx in indices] else: - final_video = video[:self.clip_length] - final_video = [final_video] + # If the video is too small to get num_clips given the clip_length and clip_stride, loop it until you can + indices = np.ceil(required_length /float(len(video))) + indices = indices.astype('int32') + indices = np.tile(np.arange(0, len(video), 1, dtype='int32'), indices) + + # Starting index of each clip + clip_starts = np.arange(0, len(indices), self.clip_stride).astype('int32')[:self.num_clips] + video = np.array(video) + final_video = [video[indices[_idx:_idx+self.clip_length]].tolist() for _idx in clip_starts] + # END IF # END IF diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 3f5cc10..3bf74d0 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -123,6 +123,22 @@ def resize_bbox(self, xmin, ymin, xmax, ymax, img_shape, resize_shape): return xmin_new, ymin_new, xmax_new, ymax_new + def resize_pt_coords(self, x, y, img_shape, resize_shape): + # Get relative position for point coords within a frame, after it's resized + + img_h = img_shape[0] + img_w = img_shape[1] + + res_h = resize_shape[0] + res_w = resize_shape[1] + + frac_h = res_h/float(img_h) + frac_w = res_w/float(img_w) + + x_new = (x * frac_w).astype(int) + y_new = (y * frac_h).astype(int) + + return x_new, y_new def __call__(self, clip, bbox=[]): @@ -136,11 +152,16 @@ def __call__(self, clip, bbox=[]): out_clip.append(proc_frame) if bbox!=[]: temp_bbox = np.zeros(bbox[frame_ind].shape)-1 - for class_ind in range(len(bbox[frame_ind])): - if np.array_equal(bbox[frame_ind,class_ind],-1*np.ones(4)): #only annotated objects + for class_ind, box in enumerate(bbox[frame_ind]): + if np.array_equal(box,-1*np.ones(box.shape)): #only annotated objects continue - xmin, ymin, xmax, ymax = bbox[frame_ind, class_ind] - proc_bbox = self.resize_bbox(xmin, ymin, xmax, ymax, frame.shape, (self.size_h, self.size_w)) + + if box.shape[-1] == 2: #Operate on point coordinates + proc_bbox = np.stack(self.resize_pt_coords(box[:,0], box[:,1], frame.shape, (self.size_h, self.size_w)),1) + else: #Operate on bounding box + xmin, ymin, xmax, ymax = box + proc_bbox = self.resize_bbox(xmin, ymin, xmax, ymax, frame.shape, (self.size_h, self.size_w)) + temp_bbox[class_ind,:] = proc_bbox out_bbox.append(temp_bbox) @@ -155,7 +176,7 @@ def __call__(self, clip, bbox=[]): class 
CropClip(PreprocTransform): - def __init__(self, xmin, xmax, ymin, ymax, *args, **kwargs): + def __init__(self, xmin=None, xmax=None, ymin=None, ymax=None, *args, **kwargs): super(CropClip, self).__init__(*args, **kwargs) self.crop_xmin = xmin self.crop_xmax = xmax @@ -165,12 +186,23 @@ def __init__(self, xmin, xmax, ymin, ymax, *args, **kwargs): self.crop_h, self.crop_w = kwargs['crop_shape'] - def _update_bbox(self, xmin, xmax, ymin, ymax): + def _update_bbox(self, xmin, xmax, ymin, ymax, update_crop_shape=False): + ''' + Args: + xmin (Float, shape []): + xmax (Float, shape []): + ymin (Float, shape []): + ymax (Float, shape []): + update_crop_shape (Boolean): Update expected crop shape along with bbox update call + ''' self.crop_xmin = xmin self.crop_xmax = xmax self.crop_ymin = ymin self.crop_ymax = ymax + if update_crop_shape: + self.crop_h = ymax - ymin + self.crop_w = xmax - xmin def crop_bbox(self, xmin, ymin, xmax, ymax, crop_xmin, crop_ymin, crop_xmax, crop_ymax): if (xmin >= crop_xmax) or (xmax <= crop_xmin) or (ymin >= crop_ymax) or (ymax <= crop_ymin): @@ -198,8 +230,15 @@ def crop_bbox(self, xmin, ymin, xmax, ymax, crop_xmin, crop_ymin, crop_xmax, cro return xmin_new-crop_xmin, ymin_new-crop_ymin, xmax_new-crop_xmin, ymax_new-crop_ymin + def crop_coords(self, x, y, crop_xmin, crop_ymin, crop_xmax, crop_ymax): + if np.any(x >= crop_xmax) or np.any(x <= crop_xmin) or np.any(y >= crop_ymax) or np.any(y <= crop_ymin): + return -1*np.ones(x.shape), -1*np.ones(y.shape) - + x_new = np.clip(x, crop_xmin, crop_xmax) + y_new = np.clip(y, crop_ymin, crop_ymax) + + return x_new-crop_xmin, y_new-crop_ymin + def __call__(self, clip, bbox=[]): out_clip = [] out_bbox = [] @@ -213,11 +252,15 @@ def __call__(self, clip, bbox=[]): if bbox!=[]: temp_bbox = np.zeros(bbox[frame_ind].shape)-1 - for class_ind in range(len(bbox)): - if np.array_equal(bbox[frame_ind,class_ind],-1*np.ones(4)): #only annotated objects + for class_ind, box in enumerate(bbox[frame_ind]): + if np.array_equal(box,-1*np.ones(box.shape)): #only annotated objects continue - xmin, ymin, xmax, ymax = bbox[frame_ind, class_ind] - proc_bbox = self.crop_bbox(xmin, ymin, xmax, ymax, self.crop_xmin, self.crop_ymin, self.crop_xmax, self.crop_ymax) + + if box.shape[-1] == 2: #Operate on point coordinates + proc_bbox = np.stack(self.crop_coords(box[:,0], box[:,1], self.crop_xmin, self.crop_ymin, self.crop_xmax, self.crop_ymax), 1) + else: #Operate on bounding box + xmin, ymin, xmax, ymax = box + proc_bbox = self.crop_bbox(xmin, ymin, xmax, ymax, self.crop_xmin, self.crop_ymin, self.crop_xmax, self.crop_ymax) temp_bbox[class_ind,:] = proc_bbox out_bbox.append(temp_bbox) @@ -305,6 +348,9 @@ def __init__(self, direction='h', p=0.5, *args, **kwargs): super(RandomFlipClip, self).__init__(*args, **kwargs) self.direction = direction self.p = p + + def _update_p(self, p): + self.p = p def _random_flip(self): flip_prob = np.random.random() @@ -314,17 +360,45 @@ def _random_flip(self): return 1 def _h_flip(self, bbox, frame_size): + width = frame_size[1] bbox_shape = bbox.shape output_bbox = np.zeros(bbox_shape)-1 - for bbox_ind in range(bbox_shape[0]): - xmin, ymin, xmax, ymax = bbox[bbox_ind] - width = frame_size[1] - xmax_new = width - xmin - xmin_new = width - xmax - output_bbox[bbox_ind] = xmin_new, ymin, xmax_new, ymax - return output_bbox + for bbox_ind, box in enumerate(bbox): + if np.array_equal(box,-1*np.ones(box.shape)): #only annotated objects + continue + + if box.shape[-1] == 2: #Operate on point coordinates + x = box[:,0] + x_new = 
width - x + + output_bbox[bbox_ind] = np.stack((x_new,box[:,1]),1) + else: #Operate on bounding box + xmin, ymin, xmax, ymax = box + xmax_new = width - xmin + xmin_new = width - xmax + output_bbox[bbox_ind] = xmin_new, ymin, xmax_new, ymax + return output_bbox def _v_flip(self, bbox, frame_size): + height = frame_size[0] + bbox_shape = bbox.shape + output_bbox = np.zeros(bbox_shape)-1 + for bbox_ind, box in enumerate(bbox): + if np.array_equal(box,-1*np.ones(box.shape)): #only annotated objects + continue + + if box.shape[-1] == 2: #Operate on point coordinates + y = box[:,1] + y_new = height - y + + output_bbox[bbox_ind] = np.stack((box[:,0],y_new),1) + else: #Operate on bounding box + xmin, ymin, xmax, ymax = box + ymax_new = height - ymin + ymin_new = height - ymax + output_bbox[bbox_ind] = xmin, ymin_new, xmax, ymax_new + return output_bbox + bbox_shape = bbox.shape output_bbox = np.zeros(bbox_shape)-1 for bbox_ind in range(bbox_shape[0]): @@ -470,6 +544,31 @@ def _rotate_bbox(self, bboxes, frame_shape, angle): return output_bboxes + def _rotate_coords(self, bboxes, frame_shape, angle): + angle = np.deg2rad(angle) + bboxes_shape = bboxes.shape + output_bboxes = np.zeros(bboxes_shape)-1 + frame_h, frame_w = frame_shape[0], frame_shape[1] + half_h = frame_h/2. + half_w = frame_w/2. + + for bbox_ind in range(bboxes_shape[0]): + x, y = bboxes[bbox_ind].transpose() + + pts = (x-half_w, y-half_h) + + pts = self._cart2pol(pts) + + pts = (pts[0], pts[1]-angle) + + pts = self._pol2cart(pts) + + pts = (pts[0]+half_w, pts[1]+half_h) + + output_bboxes[bbox_ind,:,0] = (np.clip(pts[0], 0, frame_w-1)) + output_bboxes[bbox_ind,:,1] = (np.clip(pts[1], 0, frame_h-1)) + + return output_bboxes def __call__(self, clip, bbox=[]): angle = np.random.choice(self.angles) @@ -482,13 +581,203 @@ def __call__(self, clip, bbox=[]): bbox = np.array(bbox) output_bboxes = np.zeros(bbox.shape)-1 for bbox_ind in range(bbox.shape[0]): - output_bboxes[bbox_ind] = self._rotate_bbox(bbox[bbox_ind], clip[0].shape, angle) + if bbox.shape[-1] == 2: + output_bboxes[bbox_ind] = self._rotate_coords(bbox[bbox_ind], clip[0].shape, angle) + else: + output_bboxes[bbox_ind] = self._rotate_bbox(bbox[bbox_ind], clip[0].shape, angle) return output_clip, output_bboxes return output_clip +class RandomTranslateClip(PreprocTransform): + """ + Random horizontal and/or vertical shift on frames in a clip. All frames receive same shifting + Shift will be bounded by object bounding box (if given). 
Meaning, object will always be in view + Input numpy array must be of type np.uint8 + + Args: + - translate (Tuple) + - max_x (float): maximum absolute fraction for horizontal shift + - max_y (float): maximum absolute fraction for vertical shift + """ + def __init__(self, translate, **kwargs): + super(RandomTranslateClip, self).__init__(**kwargs) + + self.max_x, self.max_y = translate + + assert(self.max_x >= 0.0 and self.max_y >= 0.0) + assert(self.max_x < 1.0 and self.max_y < 1.0) #Cannot shift past image bounds + + def _shift_frame(self, bbox, frame, tx, ty): + M = np.array([[1, 0, tx],[0, 1, ty]], dtype=np.float) # 2 x 3 transformation matrix + out_frame = cv2.warpAffine(frame, M, (frame.shape[1], frame.shape[0])) + + if bbox is not None: + bbox_h = np.reshape(bbox, (-1,2)) #x-y coords + bbox_h = np.concatenate((bbox_h, np.ones((bbox_h.shape[0],1))), axis=1).transpose() #homography coords + + out_box = M @ bbox_h + + if bbox.shape[-1] == 2: #Operate on point coordinates + out_box = np.reshape(out_box.transpose(), (bbox.shape[0], bbox.shape[1],2)) + else: #Operate on bounding box + out_box = np.reshape(out_box.transpose(), (-1,4)) + + return out_frame, out_box + else: + return out_frame + + def __call__(self, clip, bbox=[]): + out_clip = [] + clip = self._to_numpy(clip) + + frac_x = np.random.rand()*(2*self.max_x)-self.max_x + frac_y = np.random.rand()*(2*self.max_y)-self.max_y + + if bbox != []: + out_bbox = [] + + for frame, box in zip(clip,bbox): + img_h, img_w = frame.shape[:2] + tx = int(img_w * frac_x) + ty = int(img_h * frac_y) + + #Bound translation amount so all objects remain in scene + if box.shape[-1] == 2: #Operate on point coordinates + mask = box[:,:,0] != -1 + tx = np.clip(tx, np.max(-1*box[mask,0]), np.min(img_w-box[mask,0])) + ty = np.clip(ty, np.max(-1*box[mask,1]), np.min(img_h-box[mask,1])) + out_frame, out_box = self._shift_frame(box, frame, tx, ty) + out_box[~mask] = -1*np.ones(2) + + else: #Operate on bounding box + #bbox is bounding box object + mask = box[:,0] != -1 + tx = np.clip(tx, np.max(-1*box[mask,0]), np.min(img_w-box[mask,2])) + ty = np.clip(ty, np.max(-1*box[mask,1]), np.min(img_h-box[mask,3])) + out_frame, out_box = self._shift_frame(box, frame, tx, ty) + out_box[~mask] = -1*np.ones(4) + + out_clip.append(out_frame) + out_bbox.append(out_box) + + return out_clip, out_bbox + else: + for frame in clip: + img_h, img_w = frame.shape[:2] + tx = int(img_w * frac_x) + ty = int(img_h * frac_y) + + out_clip.append(self._shift_frame(None, frame, tx, ty)) + + return out_clip + +class RandomZoomClip(PreprocTransform): + """ + Random zoom on all frames in a clip. All frames receive same scaling + Scale will be bounded by object bounding box (if given). Meaning, object will always be in view + If zooming out, the borders will be filled with black. 
+ + >1: Zoom in + <1: Zoom out + =1: Same size + + Args: + - scale (Tuple) + - min_scale (float): minimum scaling on frame + - max_scale (float): maximum scaling on frame + """ + def __init__(self, scale, **kwargs): + super(RandomZoomClip, self).__init__(**kwargs) + + self.min_scale, self.max_scale = scale + + assert(self.min_scale > 0 and self.min_scale <= self.max_scale) + + def _scale_frame(self, bbox, frame, sc): + M = cv2.getRotationMatrix2D((frame.shape[1]/2, frame.shape[0]/2), 0, sc) # 2 x 3 rotation matrix + out_frame = cv2.warpAffine(frame, M, (frame.shape[1], frame.shape[0])) + + if bbox is not None: + bbox_h = np.reshape(bbox, (-1,2)) #x-y coords + bbox_h = np.concatenate((bbox_h, np.ones((bbox_h.shape[0],1))), axis=1).transpose() #homography coords + + out_box = M @ bbox_h + + if bbox.shape[-1] == 2: #Operate on point coordinates + out_box = np.reshape(out_box.transpose(), (bbox.shape[0], bbox.shape[1],2)) + else: #Operate on bounding box + out_box = np.reshape(out_box.transpose(), (-1,4)) + + return out_frame, out_box + else: + return out_frame + + def __call__(self, clip, bbox=[]): + out_clip = [] + clip = self._to_numpy(clip) + + sc = np.random.uniform(self.min_scale, self.max_scale) + + if bbox != []: + out_bbox = [] + + for frame, box in zip(clip,bbox): + img_h, img_w = frame.shape[:2] + cx, cy = (img_w/2, img_h/2) + + #Bound scaling so all objects remain in scene + if box.shape[-1] == 2: #Operate on point coordinates + mask = box[:,:,0] != -1 + + max_x = min(img_w, np.max(cx + sc * (box[mask,0] - cx))) + min_x = max(0, np.min(cx + sc * (box[mask,0] - cx))) + sx = (max_x - cx) / np.max(box[mask,0] - cx) + if min_x == 0: + sx = min(sx, (min_x - cx) / np.min(box[mask,0] - cx)) + + max_y = min(img_h, np.max(cy + sc * (box[mask,1] - cy))) + min_y = max(0, np.min(cy + sc * (box[mask,1] - cy))) + sy = (max_y - cy) / np.max(box[mask,1] - cy) + if min_y == 0: + sy = min(sy, (min_y - cy) / np.min(box[mask,1] - cy)) + + sc = min(sx, sy) + out_frame, out_box = self._scale_frame(box, frame, sc) + out_box[~mask] = -1*np.ones(2) + + else: #Operate on bounding box + mask = box[:,0] != -1 + + max_x = min(img_w, np.max(cx + sc * (box[mask,2] - cx))) + min_x = max(0, np.min(cx + sc * (box[mask,0] - cx))) + sx = (max_x - cx) / np.max(box[mask,2] - cx) + if min_x == 0: + sx = min(sx, (min_x - cx) / np.min(box[mask,0] - cx)) + + max_y = min(img_h, np.max(cy + sc * (box[mask,3] - cy))) + min_y = max(0, np.min(cy + sc * (box[mask,1] - cy))) + sy = (max_y - cy) / np.max(box[mask,3] - cy) + if min_y == 0: + sy = min(sy, (min_y - cy) / np.min(box[mask,1] - cy)) + + sc = min(sx, sy) + out_frame, out_box = self._scale_frame(box, frame, sc) + out_box[~mask] = -1*np.ones(4) + + out_clip.append(out_frame) + out_bbox.append(out_box) + + return out_clip, out_bbox + else: + for frame in clip: + img_h, img_w = frame.shape[:2] + sx = int(img_w * sc) + sy = int(img_h * sc) + out_clip.append(self._scale_frame(None, frame, sc)) + return out_clip class SubtractMeanClip(PreprocTransform): def __init__(self, **kwargs): @@ -555,11 +844,31 @@ def __init__(self, **kwargs): self.transform = kwargs['transform'](**self.class_kwargs) def __call__(self, clip, bbox=[]): + input_pil = True + output_clip = [] + if not isinstance(clip[0], Image.Image): clip = self._to_pil(clip) - output_clip = [] - for frame in clip: - output_clip.append(self.transform(frame)) + clip = [frame.convert('RGB') for frame in clip] + input_pil = False + + if input_pil: + for frame in clip: + transformed_frame = self.transform(frame) + if 
isinstance(transformed_frame, tuple) or isinstance(transformed_frame, list): + for tf in transformed_frame: + output_clip.append(tf) + else: + output_clip.append(self.transform(frame)) #Apply transform and convert back to Numpy + + else: + for frame in clip: + transformed_frame = self.transform(frame) + if isinstance(transformed_frame, tuple) or isinstance(transformed_frame, list): + for tf in transformed_frame: + output_clip.append(np.array(tf)) + else: + output_clip.append(np.array(self.transform(frame))) #Apply transform and convert back to Numpy if bbox!=[]: return output_clip, bbox @@ -640,6 +949,8 @@ def __init__(self): self.rand_flip_h = RandomFlipClip(direction='h', p=1.0) self.rand_flip_v = RandomFlipClip(direction='v', p=1.0) self.rand_rot = RandomRotateClip(angles=[90]) + self.rand_trans = RandomTranslateClip(translate=(0.5,0.5)) + self.rand_zoom = RandomZoomClip(scale=(1.25,1.25)) self.sub_mean = SubtractMeanClip(clip_mean=np.zeros(1)) self.applypil = ApplyToPIL(transform=torchvision.transforms.ColorJitter, class_kwargs=dict(brightness=1)) self.applypil2 = ApplyToPIL(transform=torchvision.transforms.FiveCrop, class_kwargs=dict(size=(64,64))) @@ -661,6 +972,14 @@ def resize_test(self): exp_bbox = np.array([[[0,0,1,2]]]) assert (False not in np.isclose(bbox_out, exp_bbox)) + coord_pts = np.array([[[[1,1], [7,5], [9,6]]]]).astype(float) + _, bbox_out = self.resize(inp, coord_pts) + exp_bbox = np.array([[[[0., 0.], + [3., 3.], + [4., 4.]]]]) + assert (False not in np.isclose(bbox_out, exp_bbox)) + + def crop_test(self): inp = np.array([[[.1,.2,.3],[.4,.5,.6],[.7,.8,.9]]]).astype(float) self.crop._update_bbox(1, 3, 1, 3) @@ -707,12 +1026,14 @@ def rand_flip_vis(self): x[:, 50] = 5000 x[10, :] = 5000 x[50, :] = 10000 - plt.imshow(x); plt.show() + + plt.subplot(1,3,1); plt.imshow(x); plt.title('Original image') h = self.rand_flip_h([x]) - plt.imshow(h[0]); plt.show() + plt.subplot(1,3,2); plt.imshow(h[0]); plt.title('Flip Horizontal') v = self.rand_flip_v([x]) - plt.imshow(v[0]); plt.show() - + plt.subplot(1,3,3); plt.imshow(v[0]); plt.title('Flip Vertical') + + plt.show() def rand_rot_test(self): inp = np.array([[[.1,.2,.3],[.4,.5,.6],[.7,.8,.9]]]).astype(float) @@ -728,37 +1049,107 @@ def rand_rot_test(self): out_bbox = self.rand_rot([inp2], np.array([bbox]))[1][0].tolist() assert (False not in np.isclose(out, exp_out)) and (False not in np.isclose(exp_bbox, out_bbox)) + + def rand_trans_test(self): + x = np.arange(112*112).reshape(112,112).astype(np.uint8) + out = self.rand_trans([x]) + out2 = self.rand_trans([x], bbox=[np.array([[32,32,96,96]])]) + + assert (out2[1][0].min() >= 0) and (out[0].shape==(112,112)) and (out2[0][0].shape==(112,112)) + def rand_rot_vis(self): import matplotlib.pyplot as plt - self.rand_rot._update_angles([20]) + import matplotlib.patches as patches + angle = 45 + self.rand_rot._update_angles([angle]) x = np.arange(112*112).reshape(112,112) - #x = np.arange(6*6).reshape(6,6) - #bbox = [51,51,61,61] + bbox = [30,40,50,100] - bbox = [30,40,50,110] - #bbox = [2,2,4,4] - plt1 = x[:] - plt1[bbox[1]:bbox[3], bbox[0]] = 0 - plt1[bbox[1]:bbox[3], bbox[2]-1] = 0 - plt1[bbox[1], bbox[0]:bbox[2]] = 0 - plt1[bbox[3]-1, bbox[0]:bbox[2]] = 0 - plt.imshow(plt1); plt.show() + pts = np.array([[30,40],[30,80]]) + fig = plt.figure() + ax1 = fig.add_subplot(121) + x[bbox[1]:bbox[3], bbox[0]] = 0 + x[bbox[1]:bbox[3], bbox[2]-1] = 0 + x[bbox[1], bbox[0]:bbox[2]] = 0 + x[bbox[3]-1, bbox[0]:bbox[2]] = 0 + + ax1.imshow(x); ax1.set_title('Original image') + rect = 
patches.Rectangle((bbox[0],bbox[1]), bbox[2]-bbox[0],\ + bbox[3]-bbox[1], linewidth=1, edgecolor='k', facecolor='none') + #ax1.add_patch(rect) + ax1.scatter(pts[:,0], pts[:,1], c='r') + out2 = self.rand_rot([x], np.array([[bbox]])) - plt2 = out2[0][0] - bbox = out2[1][0][0].astype(int) - plt2[bbox[1]:bbox[3], bbox[0]] = 0 - plt2[bbox[1]:bbox[3], bbox[2]] = 0 - plt2[bbox[1], bbox[0]:bbox[2]] = 0 - plt2[bbox[3], bbox[0]:bbox[2]] = 0 - plt.imshow(plt2); plt.show() + x_rot = out2[0][0] + bbox_rot = out2[1][0,0] + + out2 = self.rand_rot([x], np.array([[pts]])) + pts_rot = out2[1][0,0] + + ax2 = fig.add_subplot(122) + rect = patches.Rectangle((bbox_rot[0],bbox_rot[1]), bbox_rot[2]-bbox_rot[0],\ + bbox_rot[3]-bbox_rot[1], linewidth=1, edgecolor='k', facecolor='none') + ax2.add_patch(rect) + ax2.imshow(x_rot); ax2.set_title('Rotation: {} degress'.format(angle)) + ax2.scatter(pts_rot[:,0],pts_rot[:,1], c='r') + plt.show() + + def rand_zoom_test(self): + inp = np.array([[[.1,.2,.3],[.4,.5,.6],[.7,.8,.9]]]).astype(float) + exp_out = np.array([[0.225 , 0.303125, 0.384375], + [0.459375, 0.5375 , 0.61875 ], + [0.703125, 0.78125 , 0.8625 ]]).astype(float) + out = self.rand_zoom(inp) + + inp2 = np.arange(6*6, dtype=np.uint8).reshape(6,6) + bbox = [[2,2,4,4]] + exp_bbox = [1.75,1.75,4.25,4.25] + _,out_bbox = self.rand_zoom([inp2], np.array([bbox])) + + assert (False not in np.isclose(out, exp_out)) and (False not in np.isclose(exp_bbox, out_bbox)) + + def rand_zoom_vis(self): + import matplotlib.pyplot as plt + import matplotlib.patches as patches + x = np.arange(112*112, dtype=np.uint8).reshape(112,112) + + bbox = [30,40,50,100] + pts = np.array([[30,40],[30,80]]) + fig = plt.figure() + ax1 = fig.add_subplot(121) + + x[bbox[1]:bbox[3], bbox[0]] = 0 + x[bbox[1]:bbox[3], bbox[2]-1] = 0 + x[bbox[1], bbox[0]:bbox[2]] = 0 + x[bbox[3]-1, bbox[0]:bbox[2]] = 0 + ax1.imshow(x); ax1.set_title('Original image') + ax1.scatter(pts[:,0], pts[:,1], c='r') + + out = self.rand_zoom([x], np.array([[pts]])) + pts_zoom = out[1][0][0] + + out = self.rand_zoom([x], np.array([[bbox]])) + x_zoom = out[0][0] + bbox_zoom = out[1][0][0] + + ax2 = fig.add_subplot(122) + rect = patches.Rectangle((bbox_zoom[0],bbox_zoom[1]), bbox_zoom[2]-bbox_zoom[0],\ + bbox_zoom[3]-bbox_zoom[1], linewidth=1, edgecolor='k', facecolor='none') + ax2.add_patch(rect) + ax2.imshow(x_zoom); ax2.set_title('Zoomed image') + ax2.scatter(pts_zoom[:,0],pts_zoom[:,1], c='r') + + plt.show() def applypil_test(self): inp = np.arange(112*112).reshape(112,112) + np_inp = [inp, inp] inp = self.applypil._to_pil([inp, inp]) inp = [inp[0].convert('RGB'), inp[1].convert('RGB')] - out1 = self.applypil(inp) - out = self.applypil2(out1) - assert (len(out)==2) and (len(out[0])==5) and (out[0][0].size==(64,64)) and (isinstance(out[0][0], Image.Image)) + out = self.applypil(inp) + out2 = self.applypil2(out) + out3 = self.applypil(np_inp) + assert (len(out2)==2*5) and (out2[0].size==(64,64)) and (isinstance(out2[0], Image.Image)) and (isinstance(out3[0], np.ndarray)) def applytensor_test(self): inp = np.arange(112*112*3).reshape(3,112,112).astype('float32') @@ -808,6 +1199,8 @@ def run_tests(self): self.rand_crop_test() self.rand_flip_test() self.rand_rot_test() + self.rand_trans_test() + self.rand_zoom_test() self.applypil_test() self.applytensor_test() self.applycv_test() @@ -815,12 +1208,11 @@ def run_tests(self): self.to_pil_test() self.to_numpy_test() print("Tests passed") - #self.rand_flip_vis() - #self.rand_rot_vis() - - - + self.rand_flip_vis() + self.rand_rot_vis() + 
self.rand_zoom_vis() + if __name__=='__main__': test = TestPreproc() test.run_tests() diff --git a/datasets/scripts/gen_json_DHF1K.py b/datasets/scripts/gen_json_DHF1K.py new file mode 100644 index 0000000..02c05eb --- /dev/null +++ b/datasets/scripts/gen_json_DHF1K.py @@ -0,0 +1,74 @@ +import os +import cv2 +import json + + +def get_split(base_vid_path): + vids = os.listdir(base_vid_path) + vids = [int(vid) for vid in vids] + vids.sort() + + # Out of the 1000 videos, the first 600 are annotated for training, 601-700 annotated for val, 701-1000 not annotated must be sent in to test + train_cutoff = 600 + val_cutoff = 700 + train_vids = vids[:vids.index(600)+1] + val_vids = vids[vids.index(600)+1:vids.index(700)+1] + test_vids = vids[vids.index(700)+1:] + + train_vids = [str(vid).zfill(3) for vid in train_vids] + test_vids = [str(vid).zfill(3) for vid in test_vids] + val_vids = [str(vid).zfill(3) for vid in val_vids] + annot_train_vids = [vid.zfill(4) for vid in train_vids] + annot_val_vids = [vid.zfill(4) for vid in val_vids] + return train_vids, test_vids, val_vids, annot_train_vids, annot_val_vids + + +def save_json(load_type): + base_vid_path = '/path/to/DHF1K/video_png' + base_annot_path = '/path/to/DHF1K/annotation' + output_path = '/any/path/' + + train_vids, test_vids, val_vids, annot_train, annot_val = get_split(base_vid_path) + + if load_type == 'train': + tv_vids = train_vids + tv_ann = annot_train + elif load_type == 'val': + tv_vids = val_vids + tv_ann = annot_val + + else: + tv_vids = test_vids + tv_ann = [] + + json_dat = [] + for vid in sorted(tv_vids): + vid_dict = {} + frames = [] + frame_size = [] + for img in sorted(os.listdir(os.path.join(base_vid_path, vid))): + if frame_size == []: + frame_shape = cv2.imread(os.path.join(base_vid_path, vid, img)).shape + frame_size = [frame_shape[1], frame_shape[0]] # Width, Height + frame_dict = {} + frame_dict['img_path'] = img + if load_type != 'test': + frame_dict['map_path'] = os.path.join(base_annot_path, tv_ann[tv_vids.index(vid)], 'maps', img) + frame_dict['bin_path'] = os.path.join(base_annot_path, tv_ann[tv_vids.index(vid)], 'fixation', img) + else: + frame_dict['map_path'] = '' + frame_dict['bin_path'] = '' + + frames.append(frame_dict) + vid_dict['base_path'] = os.path.join(base_vid_path, vid) + vid_dict['frames'] = frames + vid_dict['frame_size'] = frame_size + json_dat.append(vid_dict) + + writef = open(os.path.join(output_path,load_type+'.json'), 'w') + json.dump(json_dat, writef) + writef.close() + +save_json('train') +save_json('val') +save_json('test') diff --git a/datasets/scripts/gen_json_mscoco.py b/datasets/scripts/gen_json_mscoco.py index 1a6409a..c7db705 100755 --- a/datasets/scripts/gen_json_mscoco.py +++ b/datasets/scripts/gen_json_mscoco.py @@ -2,22 +2,25 @@ import os -def save_json(load_type): +year = '2014' +def save_json(load_type): + # Define path to mscoco images data base_img_path = '/path/to/mscoco/images/' ###### REPLACE with path to dataset base_annot_path = '/path/to/mscoco/annotations/'###### REPLACE with path to dataset - f = open(os.path.join(base_annot_path,'instances_'+load_type+'2014.json'),'r') - x = json.load(f) - f.close() + save_location = '/path/to/save/location' ######### REPLACE with save path + + with open(os.path.join(base_annot_path,'instances_'+load_type+year+'.json'),'r') as f: + x = json.load(f) imgids = [[idx['id'], idx['file_name'], idx['width'], idx['height']] for idx in x['images']] dd = {} for idx in imgids: frame_dict = dict(objs=[], img_path=idx[1]) - dd[idx[0]] = 
dict(frames=[frame_dict], base_path=os.path.join(base_img_path,load_type+'2014'), frame_size=[idx[2],idx[3]]) + dd[idx[0]] = dict(frames=[frame_dict], base_path=os.path.join(base_img_path,load_type+year), frame_size=[idx[2],idx[3]]) print('finished imgids') @@ -36,10 +39,8 @@ def save_json(load_type): if count%1000==0: print(count) - writef = open('mscoco_'+load_type+'.json', 'w') - json.dump(dd.values(), writef) - writef.close() - + with open(os.path.join(save_location,load_type+'.json'), 'w') as f: + json.dump(list(dd.values()), f) save_json('train') diff --git a/datasets/scripts/gen_json_yc2bb.py b/datasets/scripts/gen_json_yc2bb.py new file mode 100644 index 0000000..4f4b8f2 --- /dev/null +++ b/datasets/scripts/gen_json_yc2bb.py @@ -0,0 +1,87 @@ +#Convert YC2-BB JSON annotation files to ViP JSON format + +import os +import json + +source_root = '$ANNOTATIONS_ROOT/annotations' #replace this value +target_root = '$JSON_TARGET_ROOT' #replace this value +#Link to videos sampled at 1 fps +frame_root = '$SAMPLED_FRAMES_ROOT' #replace this value +files = ['yc2_training_vid.json', 'yc2_bb_val_annotations.json', 'yc2_bb_public_test_annotations.json'] + +splits = ['train', 'val', 'test'] +ann_files = [os.path.join(source_root, f) for f in files] + +for split, ann_file in zip(splits, ann_files): + + + #YC2 split names, slightly different + split_to_split = {'train':'training','val':'validation','test':'testing'} + split_name = split_to_split[split] + + with open(ann_file) as f: + ann_json_data = json.load(f) + + yc2_json_data = ann_json_data['database'] + json_data = [] + + for vid_name in yc2_json_data.keys(): + frm_height = yc2_json_data[vid_name]['rheight'] + frm_width = yc2_json_data[vid_name]['rwidth'] + recipe_type = yc2_json_data[vid_name]['recipe_type'] + yc2_segments = yc2_json_data[vid_name]['segments'] + + #Loop through segments, YC2 breaks down all each video into segment clips + for seg,item in sorted(yc2_segments.items()): + base_path = os.path.join(frame_root, split_name, recipe_type, vid_name, str(seg).zfill(2)) + frames = [] + if 'objects' in item: #validation or testing file + num_objs = len(item['objects']) + num_frames = len(item['objects'][0]['boxes']) + + #Loop through frames + for f in range(num_frames): + frame = {} + objs = [] + + #Loop through objects + for track_id in range(num_objs): + obj = item['objects'][track_id] + + cls_name = obj['label'] + box = obj['boxes'][f] + + if len(box) == 0: #No annotations + objs.append({'trackid':track_id, 'c':cls_name}) + continue + + xmin = box['xtl'] + ymin = box['ytl'] + xmax = box['xbr'] + ymax = box['ybr'] + + outside = box['outside'] #outside or inside of frame + occluded = box['occluded'] + + objs.append({'trackid':track_id, 'c':cls_name, 'occ':occluded, 'outside':outside, 'bbox':[xmin, ymin, xmax, ymax]}) + + frame['img_path'] = os.path.join(base_path, str(seg).zfill(2), str(f).zfill(2)+'.jpg') + frame['objs'] = objs + frame['seg'] = seg + frames.append(frame) + else: #training annotation file + frame = {} + objs = [] + + frame['sentence'] = yc2_segments[seg]['sentence'] + frame['objs'] = objs + frame['seg'] = seg + frames.append(frame) + + json_data.append({'frames':frames, 'base_path':base_path, 'frame_size':[frm_width, frm_height], 'recipe_type':recipe_type}) + + target_file = os.path.join(target_root, split+'.json') + print('Writing out to: {}'.format(target_file)) + with open(target_file, 'w') as f: + json.dump(json_data, f) + diff --git a/eval.py b/eval.py index 2d5ed2e..43148c7 100644 --- a/eval.py +++ b/eval.py @@ 
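Note: `gen_json_DHF1K.py`, `gen_json_mscoco.py` and `gen_json_yc2bb.py` all emit the same top-level ViP JSON layout: a list of video entries, each with `base_path`, `frame_size` and a `frames` list. A minimal illustrative entry, with the field names taken from the scripts above and placeholder values:

```python
import json

# One detection-style video entry in ViP JSON format.
# Paths, labels, and coordinates are placeholders, not real dataset values.
example_entry = {
    "base_path": "/path/to/frames/video_0001",
    "frame_size": [1280, 720],          # [width, height]
    "frames": [
        {
            "img_path": "00001.jpg",
            "objs": [
                {"trackid": 0, "c": "bowl", "occ": 0, "outside": 0,
                 "bbox": [34, 120, 210, 300]}   # [xmin, ymin, xmax, ymax]
            ],
        }
    ],
}

with open("train.json", "w") as f:
    json.dump([example_entry], f)   # ViP expects a list of such entries
```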
-67,6 +67,9 @@ def eval(**args): if args['load_type'] == 'train_val': eval_loader = loader['valid'] + elif args['load_type'] == 'train': + eval_loader = loader['train'] + elif args['load_type'] == 'test': eval_loader = loader['test'] @@ -90,10 +93,19 @@ def eval(**args): with torch.no_grad(): for step, data in enumerate(eval_loader): - x_input = data['data'].to(device) + x_input = data['data'] annotations = data['annots'] - outputs = model(x_input) + if isinstance(x_input, torch.Tensor): + outputs = model(x_input.to(device)) + else: + for i, item in enumerate(x_input): + if isinstance(item, torch.Tensor): + x_input[i] = item.to(device) + outputs = model(*x_input) + + # END IF + acc = acc_metric.get_accuracy(outputs, annotations) diff --git a/install.sh b/install.sh index eda0fe7..c443541 100755 --- a/install.sh +++ b/install.sh @@ -1,4 +1,5 @@ #!/bin/bash pip3 install -r requirements.txt +python -m spacy download en ./weights/download_weights.sh diff --git a/losses.py b/losses.py index c11d14d..ad3710e 100644 --- a/losses.py +++ b/losses.py @@ -1,10 +1,12 @@ -import torch -import torch.nn as nn import numpy as np from scipy import ndimage import os import cv2 +import torch +import torch.nn as nn +import torch.nn.functional as F + class Losses(object): def __init__(self, *args, **kwargs): #loss_type, size_average=None, reduce=None, reduction='mean', *args, **kwargs): @@ -27,6 +29,9 @@ def __init__(self, *args, **kwargs): #loss_type, size_average=None, reduce=None, elif self.loss_type == 'M_XENTROPY': self.loss_object = M_XENTROPY(*args, **kwargs) + elif self.loss_type == 'YC2BB_Attention_Loss': + self.loss_object = YC2BB_Attention_Loss(*args, **kwargs) + else: print('Invalid loss type selected. Quitting!') exit(1) @@ -107,3 +112,53 @@ def loss(self, predictions, data): one_hot = torch.Tensor(one_hot).cuda() return torch.mean(torch.sum(-one_hot * self.logsoftmax(predictions), dim=1)) + +#Code source: https://github.com/MichiganCOG/Video-Grounding-from-Text/blob/master/train.py +class YC2BB_Attention_Loss(object): + def __init__(self, *args, **kwargs): + """ + Frame-wise attention loss used in Weakly-Supervised Object Video Grounding... + https://arxiv.org/pdf/1805.02834.pdf + + Weakly-supervised, no groundtruth labels are used. + """ + + self.loss_weighting = kwargs['has_loss_weighting'] + self.obj_interact = kwargs['obj_interact'] + self.ranking_margin = kwargs['ranking_margin'] + self.loss_factor = kwargs['loss_factor'] + + def loss(self, predictions, data): + """ + Args: + predictions (List): + - output (Tensor, shape [2*T, 2]): Positive and negative attention weights for each sample + - loss_weigh (Tensor, shape [2*T, 1]): Loss weighting applied to each sampled frame + data (None) + + T: number of sampled frames from video (default: 5) + Return: + Frame-wise weighting loss + """ + output, loss_weigh = predictions + + if self.loss_weighting or self.obj_interact: + rank_batch = F.margin_ranking_loss(output[:,0:1], output[:,1:2], + torch.ones(output.size()).type(output.data.type()), margin=self.ranking_margin, reduction='none') + if self.loss_weighting and self.obj_interact: + loss_weigh = (output[:, 0:1]+loss_weigh)/2. 
# avg + elif self.loss_weighting: + loss_weigh = output[:,0:1] + else: + loss_weigh = loss_weigh.unsqueeze(1) + # ranking loss + cls_loss = self.loss_factor*(rank_batch*loss_weigh).mean()+ \ + (1-self.loss_factor)*-torch.log(2*loss_weigh).mean() + else: + # ranking loss + cls_loss = F.margin_ranking_loss(output[:,0:1], output[:,1:2], + torch.Tensor([[1],[1]]).type(output.data.type()), margin=self.ranking_margin) + + + return cls_loss + diff --git a/metrics.py b/metrics.py index f4e22b3..d051bd7 100644 --- a/metrics.py +++ b/metrics.py @@ -1,6 +1,9 @@ -import torch +import os +import json import numpy as np +import torch + class Metrics(object): def __init__(self, *args, **kwargs): """ @@ -21,6 +24,8 @@ def __init__(self, *args, **kwargs): self.metric_object = MAP(*args, **kwargs) elif self.metric_type == 'SSD_AP': self.metric_object = SSD_AP(*args, **kwargs) + elif self.metric_type == 'Box_Accuracy': + self.metric_object = Box_Accuracy(*args, **kwargs) else: self.metric_type = None @@ -159,8 +164,8 @@ def __init__(self, threshold=0.5, num_points=101, *args, **kwargs): """ Compute Average Precision (AP) Args: - threshold (scalar): iou threshold - num_points (scalar): number of points to average for the interpolated AP calculation + threshold (float): iou threshold + num_points (int): number of points to average for the interpolated AP calculation Return: None @@ -186,10 +191,10 @@ def compute_class_ap(self, tp, fp, npos): Args: tp (Tensor, shape [N*D]): cumulative sum of true positive detections fp (Tensor, shape [N*D]): cumulative sum of false positive detections - npos (Tensor, scalar): actual positives (from ground truth) + npos (Tensor, int): actual positives (from ground truth) Return: - ap (Tensor, scalar): average precision calculation + ap (Tensor, float): average precision calculation """ #Values for precision-recall curve @@ -218,7 +223,7 @@ def get_AP(self, predictions, targets): D_: ground truth detections Return: - avg_ap (Tensor, scalar): mean ap across all classes + avg_ap (Tensor, float): mean ap across all classes """ N,C,D,_ = predictions.shape @@ -329,7 +334,7 @@ def __init__(self, threshold=torch.linspace(0.5,0.95,10), num_points=101, *args, Args: threshold (Tensor, shape[10]): Calculate AP at each of these threshold values - num_points (scalar): number of points to average for the interpolated AP calculation + num_points (float): number of points to average for the interpolated AP calculation """ self.threshold = threshold @@ -411,7 +416,7 @@ def __init__(self, threshold=0.5, det=None, *args, **kwargs): Compute Average Recall (AR) Args: - threshold: (scalar) + threshold: (float) det: max number of detections per image (optional) """ @@ -457,11 +462,11 @@ def __init__(self, threshold=0.5, num_points=11, *args, **kwargs): """ Compute Average Precision (AP) Args: - threshold (scalar): iou threshold - num_points (scalar): number of points to average for the interpolated AP calculation + threshold (float): iou threshold + num_points (int): number of points to average for the interpolated AP calculation final_shape (list) : [height, width] of input given to CNN result_dir (String): save detections to this location - ndata (scalar): total number of datapoints in dataset + ndata (int): total number of datapoints in dataset Return: None @@ -513,3 +518,174 @@ def get_accuracy(self, detections, data): return self.get_AP(self.predictions, self.targets) +class Box_Accuracy(): + """ + Box accuracy computation for YC2-BB model. 
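Note: `YC2BB_Attention_Loss` above is built on `torch.nn.functional.margin_ranking_loss`, which penalises a pair whenever the negative score is not at least `margin` below the positive score. A small standalone sketch of that primitive, with illustrative values only:

```python
import torch
import torch.nn.functional as F

pos = torch.tensor([[0.9], [0.6]])   # attention scores for positive samples
neg = torch.tensor([[0.4], [0.7]])   # attention scores for negative samples
target = torch.ones_like(pos)        # +1 means "pos should rank above neg"

# loss_i = max(0, -target_i * (pos_i - neg_i) + margin)
loss = F.margin_ranking_loss(pos, neg, target, margin=0.1, reduction='none')
print(loss)          # first pair satisfies the margin, second does not
print(loss.mean())
```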
+ Adapted from: https://github.com/MichiganCOG/Video-Grounding-from-Text/blob/master/tools/test_util.py + + Args: + accu_thres: (float) iou threshold + fps: (int) frames per second video annotations were sampled at + load_type: (String) data split, only validation has publicly available annotations + ndata (int): total number of datapoints in dataset + + """ + def __init__(self, *args, **kwargs): + from collections import defaultdict + + self.result_dir = os.path.join(kwargs['result_dir'], 'submission_yc2_bb.json') + self.thresh = kwargs['accu_thresh'] + self.fps = kwargs['fps'] + self.debug = kwargs['debug'] + self.test_mode = 1 if kwargs['load_type'] == 'test' else 0 + self.IOU = IOU() + self.ba_score = defaultdict(list) #box accuracy metric + + if self.test_mode: + print('*'*62) + print('* [WARNING] Eval unavailable for the test set! *\ + \n* Results will be saved to: '+self.result_dir+' *\ + \n* Please submit your results to the eval server! *') + print('*'*62) + + self.ndata = kwargs['ndata'] + self.count = 0 + + self.json_data = {} + self.database = {} + + def get_accuracy(self, predictions, data): + """ + Args: + predictions: (Tensor, shape [N,W,T,D]), attention weight output from model + data: (dictionary) + - rpn_original (Tensor, shape [N,T,D,4]) + - box (Tensor, shape [N,O,T,5]), [cls_label, ytl, xtl, ybr, xbr] (note order in coordinates is different) + - box_label (Tensor, shape [N,W]) + - vis_name (List, shape [N]), unique segment identifier + - class_labels_dict (dict, length 67) class index to class label mapping + + T: number of frames + D: dimension of features + O: number of objects to ground + W: unique word in segment (from YC2BB class dictionary) + Return: + Box accuracy score + """ + attn_weights = predictions + + N = attn_weights.shape[0] + self.count += N + + rpn_batch = data['rpn_original'] + box_batch = data['box'] + obj_batch = data['box_label'] + box_label_batch = obj_batch + vis_name = data['vis_name'] + class_labels_dict = data['class_labels_dict'] + + # fps is the frame rate of the attention map + # both rpn_batch and box_batch have fps=1 + _, T_rp, num_proposals, _ = rpn_batch.size() + _, O, T_gt, _ = box_batch.size() + T_attn = attn_weights.size(2) + + assert(T_rp == T_gt) # both sampled at 1fps + #print('# of frames in gt: {}, # of frames in resampled attn. 
map: {}'.format(T_gt, np.rint(T_attn/self.fps))) + + hits, misses = [0 for o in range(O)], [0 for o in range(O)] + + results = [] + pos_counter = 0 + neg_counter = 0 + segment_dict = {} #segment dictionary - to output results to JSON file + all_objects = [] + + for o in range(O): + object_dict = {} + if box_label_batch[0, o] not in obj_batch[0, :]: + print('object {} is not grounded!'.format(box_label_batch[0, o])) + continue # don't compute score if the object is not grounded + obj_ind_in_attn = (obj_batch[0, :] == box_label_batch[0, o]).nonzero().squeeze() + if obj_ind_in_attn.numel() > 1: + obj_ind_in_attn = obj_ind_in_attn[0] + else: + obj_ind_in_attn = obj_ind_in_attn.item() + + new_attn_weights = attn_weights[0, obj_ind_in_attn] + _, max_attn_ind = torch.max(new_attn_weights, dim=1) + + # uncomment this for the random baseline + # max_attn_ind = torch.floor(torch.rand(T_attn)*num_proposals).long() + label = class_labels_dict[box_label_batch[0,o].item()] + object_dict = {'label':label} + + boxes = [] + for t in range(T_gt): + if box_batch[0,o,t,0] == -1: # object is outside/non-exist/occlusion + boxes.append({'xtl':-1, 'ytl':-1, 'xbr':-1, 'ybr':-1, 'outside':1, 'occluded':1}) #object is either occluded or outside of frame + neg_counter += 1 + continue + pos_counter += 1 + box_ind = max_attn_ind[int(min(np.rint(t*self.fps), T_attn-1))] + box_coord = rpn_batch[0, t, box_ind, :].view(4) # x_tl, y_tl, x_br, y_br + gt_box = box_batch[0,o,t][torch.Tensor([2,1,4,3]).type(box_batch.type()).long()].view(1,4) # inverse x and y + + if self.IOU.get_accuracy(box_coord, gt_box.float())[0].item() > self.thresh: + hits[o] += 1 + else: + misses[o] += 1 + + xtl = box_coord[0].item() + ytl = box_coord[1].item() + xbr = box_coord[2].item() + ybr = box_coord[3].item() + boxes.append({'xtl':xtl, 'ytl':ytl, 'xbr':xbr, 'ybr':ybr, 'outside':0, 'occluded':0}) + + object_dict['boxes'] = boxes + all_objects.append(object_dict) + + results.append((box_label_batch[0, o].item(), hits[o], misses[o])) + + segment_dict['objects'] = all_objects + #print('percentage of frames with box: {}'.format(pos_counter/(pos_counter+neg_counter))) + + for (i,h,m) in results: + self.ba_score[i].append((h,m)) + + #Annotations for the testing split are not publicly available + if self.test_mode: + split, rec, video_name, segment = vis_name[0].split('_-_') + + if video_name not in self.database: + self.database[video_name] = {} + self.database[video_name]['recipe_type'] = rec + if 'segments' not in self.database[video_name]: + self.database[video_name]['segments'] = {} + + self.database[video_name]['segments'][int(segment)] = segment_dict + + #Predictions will be saved to JSON file (if not in debug mode) + if self.count >= self.ndata and not self.debug: + self.json_data['database'] = self.database + + with open(self.result_dir, 'w') as f: + json.dump(self.json_data, f) + + print('Saved submission file to: {}'.format(self.result_dir)) + + return -1 + + ba_final = [] + for k, r in self.ba_score.items(): + cur_hit = 0 + cur_miss = 0 + for v in r: + cur_hit += v[0] + cur_miss += v[1] + + if cur_hit+cur_miss != 0: + #print('BA for {}(...): {:.4f}'.format(k, cur_hit/(cur_hit+cur_miss))) + ba_final.append(cur_hit/(cur_hit+cur_miss)) + + return np.mean(ba_final) diff --git a/models/c3d/config_test.yaml b/models/c3d/config_test.yaml index cf69c79..d95e622 100644 --- a/models/c3d/config_test.yaml +++ b/models/c3d/config_test.yaml @@ -1,7 +1,7 @@ # Preprocessing clip_length: 16 # Number of frames within a clip clip_offset: 0 # Frame offset 
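Note: the final reduction in `Box_Accuracy.get_accuracy` averages per-class hit rates rather than pooling all boxes together. A compact sketch of that reduction over the accumulated `(hits, misses)` tuples; the numbers below are made up:

```python
import numpy as np
from collections import defaultdict

# ba_score maps class index -> list of (hits, misses) tuples, one per segment,
# mirroring the accumulator in Box_Accuracy (values here are illustrative).
ba_score = defaultdict(list)
ba_score[3] = [(8, 2), (5, 5)]    # class 3: 13 hits / 20 annotated boxes
ba_score[7] = [(1, 9)]            # class 7:  1 hit  / 10 annotated boxes

ba_final = []
for cls_idx, records in ba_score.items():
    hits = sum(h for h, _ in records)
    misses = sum(m for _, m in records)
    if hits + misses != 0:
        ba_final.append(hits / (hits + misses))

print(np.mean(ba_final))   # mean of per-class accuracies, e.g. (0.65 + 0.1) / 2
```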
between beginning of video and clip (1st clip only) -clip_stride: 0 # Frame offset between successive frames +clip_stride: 1 # Frame offset between successive frames crop_shape: [112,112] # (Height, Width) of frame crop_type: Random # Type of cropping operation (Random, Central and None) final_shape: [112,112] # (Height, Width) of input to be given to CNN diff --git a/models/c3d/config_train.yaml b/models/c3d/config_train.yaml index 969982d..4456e0d 100644 --- a/models/c3d/config_train.yaml +++ b/models/c3d/config_train.yaml @@ -1,7 +1,7 @@ # Preprocessing clip_length: 16 # Number of frames within a clip clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) -clip_stride: 0 # Frame offset between successive frames +clip_stride: 1 # Frame offset between successive frames crop_shape: [112,112] # (Height, Width) of frame crop_type: Random # Type of cropping operation (Random, Central and None) final_shape: [112,112] # (Height, Width) of input to be given to CNN diff --git a/models/dvsa/config_test.yaml b/models/dvsa/config_test.yaml new file mode 100644 index 0000000..49ad0f6 --- /dev/null +++ b/models/dvsa/config_test.yaml @@ -0,0 +1,68 @@ +# Preprocessing +clip_length: -1 # Number of frames within a clip +clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) +clip_stride: 1 # Frame offset between successive frames +crop_shape: [112,112] # (Height, Width) of frame +crop_type: Random # Type of cropping operation (Random, Central and None) +final_shape: [112,112] # (Height, Width) of input to be given to CNN +num_clips: 1 # Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips) +random_offset: 0 # Boolean switch to generate a clip length sized clip from a video +resize_shape: [128,171] # (Height, Width) to resize original data +sample_duration: 16 # Temporal size of video to be provided as input to the model +sample_size: 112 # Height of frame to be provided as input to the model +subtract_mean: '' # Subtract mean (R,G,B) from all frames during preprocessing + +# Experiment Setup +acc_metric: Box_Accuracy # Accuracy metric +batch_size: 1 # Numbers of videos in a mini-batch +dataset: YC2BB # Name of dataset +debug: 1 # If True, do not plot, save, or create data files +epoch: 30 # Total number of epochs +exp: exp # Experiment name +gamma: 0.5 # Multiplier with which to change learning rate +grad_max_norm: 1 # Norm for gradient clipping +json_path: /path/to/yc2bb # Path to the json file for the given dataset +labels: 67 # Number of total classes in the dataset +load_type: test # Environment selection, to include only training/training and validation/testing dataset +loss_type: YC2BB_Attention_Loss # Loss function +lr: 0.05 # Learning rate +milestones: [10, 20] # Epoch values to change learning rate +model: DVSA # Name of model to be loaded +momentum: 0.9 # Momentum value in optimizer +num_workers: 2 # Number of CPU worker used to load data +opt: sgd # Name of optimizer +preprocess: default # String argument to select preprocessing type +pretrained: 1 # Load pretrained network +pseudo_batch_loop: 1 # Pseudo-batch size multiplier to mimic large minibatches +rerun: 1 # Number of trials to repeat an experiment +save_dir: './results' # Path to results directory +seed: 999 # Seed for reproducibility +weight_decay: 0.0005 # Weight decay + +# Dataset specific config +yc2bb_class_file: '/path/to/yc2bb/data/class_file.csv' 
#https://github.com/MichiganCOG/Video-Grounding-from-Text/blob/master/data/class_file.csv +yc2bb_num_frm: 5 +yc2bb_num_proposals: 20 +yc2bb_roi_pooled_feat_root: '/path/to/yc2bb/data/yc2/roi_pooled_feat' #roi_pooled feat download links below +#train: http://youcook2.eecs.umich.edu/static/dat/yc2_bb/roi_pooled_feat_train.tar.gz (113 GB) +#val: http://youcook2.eecs.umich.edu/static/dat/yc2_bb/roi_pooled_feat_val.tar.gz (38 GB) +#test: http://youcook2.eecs.umich.edu/static/dat/yc2_bb/roi_pooled_feat_test.tar.gz (17 GB) +yc2bb_rpn_proposal_root: '/path/to/yc2bb/data/yc2/roi_box' #http://youcook2.eecs.umich.edu/static/dat/yc2_bb/all-box-100.tar.gz + +# Model specific config +attn_drop: 0.2 +dropout: 0.2 +enc_size: 128 +has_loss_weighting: 1 +hidden_size: 256 +input_size: 2048 +loss_factor: 0.9 +n_heads: 4 +n_layers: 1 +#num_class: 67 #NOTE:redundant with labels +obj_interact: 1 +ranking_margin: 0.1 + +# Box accuracy config +accu_thresh: 0.5 +fps: 1 diff --git a/models/dvsa/dvsa.py b/models/dvsa/dvsa.py new file mode 100644 index 0000000..702e020 --- /dev/null +++ b/models/dvsa/dvsa.py @@ -0,0 +1,157 @@ +#Code heavily adapted from: https://github.com/MichiganCOG/Video-Grounding-from-Text/blob/master/model/dvsa.py + +import torch +import torch.nn as nn +import torch.nn.functional as F +import math +import numpy as np +from functools import partial +import os + +from models.dvsa.dvsa_utils.transformer import Transformer + +class DVSA(nn.Module): + """ + Deep Visual-Semantic Alignments (DVSA). + Implementation used as baseline in Weakly-Supervised Video Object Grounding... + Source: https://arxiv.org/pdf/1805.02834.pdf + + Original paper: Deep visual-semantic alignments for generating image descriptions + https://cs.stanford.edu/people/karpathy/cvpr2015.pdf + """ + def __init__(self, **kwargs): + super().__init__() + num_class = kwargs['labels'] + input_size = kwargs['input_size'] + enc_size = kwargs['enc_size'] + dropout = kwargs['dropout'] + hidden_size = kwargs['hidden_size'] + n_layers = kwargs['n_layers'] + n_heads = kwargs['n_heads'] + attn_drop = kwargs['attn_drop'] + num_frm = kwargs['yc2bb_num_frm'] + has_loss_weighting = kwargs['has_loss_weighting'] + + # encode the region feature + self.feat_enc = nn.Sequential( + nn.Linear(input_size, enc_size), + nn.Dropout(p=dropout), + nn.ReLU() + ) + + self.sigmoid = nn.Sigmoid() + + # lookup table for object label embedding + self.obj_emb = nn.Embedding(num_class+1, enc_size) # +1 for the dummy paddings + self.num_class = num_class + + self.obj_interact = Transformer(enc_size, 0, 0, + d_hidden=hidden_size, + n_layers=n_layers, + n_heads=n_heads, + drop_ratio=attn_drop) + + self.obj_interact_fc = nn.Sequential( + nn.Linear(enc_size*2, int(enc_size/2)), + nn.ReLU(), + nn.Linear(int(enc_size/2), 5), # object interaction guidance (always 5 snippets) + nn.Sigmoid() + ) + + self.num_frm = num_frm + self.has_loss_weighting = has_loss_weighting + + if isinstance(kwargs['pretrained'], int) and kwargs['pretrained']: + self._load_pretrained_weights() + + def forward(self, x_o, obj, load_type): + is_evaluate = 1 if load_type[0] == 'test' or load_type[0] == 'val' else 0 + if is_evaluate: + return self.output_attn(x_o, obj) + + #only a single batch expected + x_o = x_o[0] + obj = obj[0] + + x_o = self.feat_enc(x_o.permute(0,2,3,1).contiguous()).permute(0,3,1,2).contiguous() + + x_o = torch.stack([x_o[0], x_o[1], x_o[0]]) + obj = torch.stack([obj[0], obj[0], obj[1]]) + + N, C_out, T, num_proposals = x_o.size() + assert(N == 3) # two pos samples and one neg sample 
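Note: the core scoring step in `DVSA.forward` and `output_attn` is a sigmoid of the dot product between each encoded region feature and the object-label embedding, scaled by the square root of the feature dimension. A minimal sketch of that step with toy shapes; the tensor names and sizes are illustrative, not the exact model tensors:

```python
import math
import torch

N, O, C, T, P = 1, 2, 128, 5, 20   # batch, objects, channels, frames, proposals

region_feats = torch.randn(N, C, T, P)   # encoded region features (after feat_enc)
obj_embed = torch.randn(N, O, C)         # object label embeddings (from obj_emb)

# Dot-product attention: score each proposal against each object embedding
x = region_feats.view(N, 1, C, T * P)
attn = torch.sigmoid((x * obj_embed.view(N, O, C, 1)).sum(2) / math.sqrt(C))

print(attn.view(N, O, T, P).shape)   # (1, 2, 5, 20) attention weight per proposal
```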
+ + # attention + O = obj.size(1) + attn_key = self.obj_emb(obj) + + num_pos_obj = torch.sum(obj[0]0: + tmp.append(obj_attn_emb[:, :, i:(i+1)].expand(1, num_pos_obj, l)) + obj_attn_emb = torch.cat(tmp, 2).squeeze(0) + assert(obj_attn_emb.size(1) == self.num_frm) + + loss_weigh = torch.mean(obj_attn_emb, dim=0) + loss_weigh = torch.cat((loss_weigh, loss_weigh)).unsqueeze(1) + + if self.has_loss_weighting: + # dot-product attention + x_o = x_o.view(N, 1, C_out, T, num_proposals) + attn_weights = self.sigmoid((x_o*attn_key.view(N, O, C_out, 1, 1)).sum(2)/math.sqrt(C_out)) + + pos_weights = attn_weights[0, :num_pos_obj, :, :] + neg1_weights = attn_weights[1, :num_pos_obj, :, :] + neg2_weights = attn_weights[2, :num_neg_obj, :, :] + + return torch.cat((torch.stack((torch.mean(torch.max(pos_weights, dim=2)[0], dim=0), torch.mean(torch.max(neg1_weights, dim=2)[0], dim=0)), dim=1), + torch.stack((torch.mean(torch.max(pos_weights, dim=2)[0], dim=0), torch.mean(torch.max(neg2_weights, dim=2)[0], dim=0)), dim=1))), loss_weigh + else: + # dot-product attention + x_o = x_o.view(N, 1, C_out, T*num_proposals) + attn_weights = self.sigmoid((x_o*attn_key.view(N, O, C_out, 1)).sum(2)/math.sqrt(C_out)) + + pos_weights = attn_weights[0, :num_pos_obj, :] + neg1_weights = attn_weights[1, :num_pos_obj, :] + neg2_weights = attn_weights[2, :num_neg_obj, :] + + return torch.stack((torch.stack((torch.mean(torch.max(pos_weights, dim=1)[0]), torch.mean(torch.max(neg1_weights, dim=1)[0]))), + torch.stack((torch.mean(torch.max(pos_weights, dim=1)[0]), torch.mean(torch.max(neg2_weights, dim=1)[0]))))), loss_weigh + + def output_attn(self, x_o, obj): + x_o = self.feat_enc(x_o.permute(0,2,3,1).contiguous()).permute(0,3,1,2).contiguous() + + N, C_out, T, num_proposals = x_o.size() + assert(N == 1) + + # attention + O = obj.size(1) + attn_key = self.obj_emb(obj) + + # dot-product attention + x_o = x_o.view(N, 1, C_out, T*num_proposals) + attn_weights = self.sigmoid((x_o*attn_key.view(N, O, C_out, 1)).sum(2)/math.sqrt(C_out)) + # attn_weights = self.sigmoid((x_e*attn_key.view(N, O, C_out, 1).expand(N, O, C_out, T*num_proposals)).sum(2)) # N, O, T, H*W + + # additive attention + # x_e = x_o.view(N, 1, C_out, T, H*W).contiguous().expand(N, O, C_out, T, H*W) + # attn_e = attn_key.view(N, O, C_out, 1, 1).expand(N, O, C_out, T, H*W) + # attn_weights = self.attn_mlp(torch.cat((x_e, attn_e), dim=2).permute(0,1,3,4,2).contiguous()).squeeze(4) # N, O, T, H*W + + return attn_weights.view(N, O, T, num_proposals) + + def _load_pretrained_weights(self): + state_dict = torch.load('weights/yc2bb_full-model.pth', map_location=lambda storage, location: storage) + + self.load_state_dict(state_dict) diff --git a/models/dvsa/dvsa_utils/transformer.py b/models/dvsa/dvsa_utils/transformer.py new file mode 100644 index 0000000..38f9e3a --- /dev/null +++ b/models/dvsa/dvsa_utils/transformer.py @@ -0,0 +1,190 @@ +# Originally from https://github.com/salesforce/densecap +""" + Copyright (c) 2018, salesforce.com, inc. + All rights reserved. 
+ SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" +# Last modified by Luowei Zhou on 07/01/2018 + +import torch +from torch import nn +from torch.nn import functional as F +from torch.autograd import Variable + +import random +import string +import sys +import math +import uuid +import numpy as np + +INF = 1e10 + +def positional_encodings_like(x, t=None): + if t is None: + positions = torch.arange(0, x.size(1)) + if x.is_cuda: + positions = positions.cuda(x.get_device()) + else: + positions = t + encodings = x.new(*x.size()[1:]).fill_(0) + if x.is_cuda: + encodings = encodings.cuda(x.get_device()) + + + for channel in range(x.size(-1)): + if channel % 2 == 0: + encodings[:, channel] = torch.sin( + positions.float() / 10000 ** (channel / x.size(2))) + else: + encodings[:, channel] = torch.cos( + positions.float() / 10000 ** ((channel - 1) / x.size(2))) + return Variable(encodings) + +class Linear(nn.Linear): + + def forward(self, x): + size = x.size() + return super().forward( + x.contiguous().view(-1, size[-1])).view(*size[:-1], -1) + +# F.softmax has strange default behavior, normalizing over dim 0 for 3D inputs +# deprecated since PyTorch 0.3 +# def softmax(x): +# if x.dim() == 3: +# return F.softmax(x.transpose(0, 2)).transpose(0, 2) +# return F.softmax(x) + +# torch.matmul can't do (4, 3, 2) @ (4, 2) -> (4, 3) +def matmul(x, y): + if x.dim() == y.dim(): + return x @ y + if x.dim() == y.dim() - 1: + return (x.unsqueeze(-2) @ y).squeeze(-2) + return (x @ y.unsqueeze(-2)).squeeze(-2) + +class LayerNorm(nn.Module): + + def __init__(self, d_model, eps=1e-6): + super().__init__() + self.gamma = nn.Parameter(torch.ones(d_model)) + self.beta = nn.Parameter(torch.zeros(d_model)) + self.eps = eps + + def forward(self, x): + mean = x.mean(-1, keepdim=True) + std = x.std(-1, keepdim=True) + return self.gamma * (x - mean) / (std + self.eps) + self.beta + +class ResidualBlock(nn.Module): + + def __init__(self, layer, d_model, drop_ratio): + super().__init__() + self.layer = layer + self.dropout = nn.Dropout(drop_ratio) + self.layernorm = LayerNorm(d_model) + + def forward(self, *x): + return self.layernorm(x[0] + self.dropout(self.layer(*x))) + +class Attention(nn.Module): + + def __init__(self, d_key, drop_ratio, causal): + super().__init__() + self.scale = math.sqrt(d_key) + self.dropout = nn.Dropout(drop_ratio) + self.causal = causal + + def forward(self, query, key, value): + dot_products = matmul(query, key.transpose(1, 2)) + if query.dim() == 3 and (self is None or self.causal): + tri = torch.ones(key.size(1), key.size(1)).triu(1) * INF + if key.is_cuda: + tri = tri.cuda(key.get_device()) + dot_products.data.sub_(tri.unsqueeze(0)) + return matmul(self.dropout(F.softmax(dot_products / self.scale, dim=2)), value) + +class MultiHead(nn.Module): + + def __init__(self, d_key, d_value, n_heads, drop_ratio, causal=False): + super().__init__() + self.attention = Attention(d_key, drop_ratio, causal=causal) + self.wq = Linear(d_key, d_key, bias=False) + self.wk = Linear(d_key, d_key, bias=False) + self.wv = Linear(d_value, d_value, bias=False) + self.wo = Linear(d_value, d_key, bias=False) + self.n_heads = n_heads + + def forward(self, query, key, value): + query, key, value = self.wq(query), self.wk(key), self.wv(value) + query, key, value = ( + x.chunk(self.n_heads, -1) for x in (query, key, value)) + return self.wo(torch.cat([self.attention(q, k, v) + for q, k, v in zip(query, key, value)], -1)) 
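Note: `MultiHead` above implements multi-head attention by chunking the projected query/key/value along the channel dimension, running the same scaled dot-product `Attention` on each chunk, and concatenating the results. A small self-contained sketch of that pattern with plain tensors and no masking; `d_model` and `n_heads` are illustrative, and this sketch scales by the per-head width (a common variant), whereas the module above scales by the full `d_key`:

```python
import math
import torch
import torch.nn.functional as F

def scaled_dot_product(q, k, v):
    # softmax(QK^T / sqrt(d)) V, as in the Attention module above (no causal mask)
    scores = q @ k.transpose(1, 2) / math.sqrt(q.size(-1))
    return F.softmax(scores, dim=2) @ v

def multi_head(query, key, value, n_heads):
    # Split channels into n_heads chunks, attend per head, then concatenate.
    heads = [scaled_dot_product(q, k, v)
             for q, k, v in zip(query.chunk(n_heads, -1),
                                key.chunk(n_heads, -1),
                                value.chunk(n_heads, -1))]
    return torch.cat(heads, dim=-1)

x = torch.randn(2, 10, 128)              # (batch, sequence, d_model)
out = multi_head(x, x, x, n_heads=4)     # self-attention
print(out.shape)                         # torch.Size([2, 10, 128])
```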
+ +class FeedForward(nn.Module): + + def __init__(self, d_model, d_hidden): + super().__init__() + self.linear1 = Linear(d_model, d_hidden) + self.linear2 = Linear(d_hidden, d_model) + + def forward(self, x): + return self.linear2(F.relu(self.linear1(x))) + +class EncoderLayer(nn.Module): + + def __init__(self, d_model, d_hidden, n_heads, drop_ratio): + super().__init__() + self.selfattn = ResidualBlock( + MultiHead(d_model, d_model, n_heads, drop_ratio), + d_model, drop_ratio) + self.feedforward = ResidualBlock(FeedForward(d_model, d_hidden), + d_model, drop_ratio) + + def forward(self, x): + return self.feedforward(self.selfattn(x, x, x)) + +class Encoder(nn.Module): + + def __init__(self, d_model, d_hidden, n_vocab, n_layers, n_heads, + drop_ratio): + super().__init__() + # self.linear = nn.Linear(d_model*2, d_model) + self.layers = nn.ModuleList( + [EncoderLayer(d_model, d_hidden, n_heads, drop_ratio) + for i in range(n_layers)]) + self.dropout = nn.Dropout(drop_ratio) + + def forward(self, x, mask=None): + # x = self.linear(x) + x = x+positional_encodings_like(x) + x = self.dropout(x) + if mask is not None: + x = x*mask + encoding = [] + for layer in self.layers: + x = layer(x) + if mask is not None: + x = x*mask + encoding.append(x) + return encoding + +class Transformer(nn.Module): + + def __init__(self, d_model, n_vocab_src, vocab_trg, d_hidden=2048, + n_layers=6, n_heads=8, drop_ratio=0.1): + super().__init__() + self.encoder = Encoder(d_model, d_hidden, n_vocab_src, n_layers, + n_heads, drop_ratio) + + def denum(self, data): + return ' '.join(self.decoder.vocab.itos[i] for i in data).replace( + ' ', '#').replace(' ', '') + + def forward(self, x): + encoding = self.encoder(x) + + return encoding[-1], encoding + diff --git a/models/i3d/config_test.yaml b/models/i3d/config_test.yaml new file mode 100644 index 0000000..72ae825 --- /dev/null +++ b/models/i3d/config_test.yaml @@ -0,0 +1,27 @@ +# Preprocessing +clip_length: 64 # Number of frames within a clip +clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) +clip_stride: 0 # Frame offset between successive frames +crop_shape: [224,224] # (Height, Width) of frame +crop_type: Center # Type of cropping operation (Random, Central and None) +final_shape: [224,224] # (Height, Width) of input to be given to CNN +num_clips: -1 # Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips) +random_offset: 0 # Boolean switch to generate a clip length sized clip from a video +resize_shape: [230,250] # (Height, Width) to resize original data +subtract_mean: [123,117,104] # Subtract mean (R,G,B) from all frames during preprocessing + +# Experiment Setup +acc_metric: Accuracy # Accuracy metric +batch_size: 1 # Numbers of videos in a mini-batch +dataset: HMDB51 # Name of dataset +exp: I3D # Experiment name +json_path: /z/dat/HMDB51/ # Path to the json file for the given dataset +labels: 51 # Number of total classes in the dataset +load_type: train_val # Environment selection, to include only training/training and validation/testing dataset +model: I3D # Name of model to be loaded +num_workers: 5 # Number of CPU worker used to load data +preprocess: default # String argument to select preprocessing type +pretrained: 'weights/i3d_rgb_imagenet_then_HMDB51_30epochs.pkl' # Load pretrained network +save_dir: './results' # Path to results directory +seed: 999 # Seed for reproducibility +loss_type: M_XENTROPY # Loss function diff --git 
a/models/i3d/config_train.yaml b/models/i3d/config_train.yaml new file mode 100644 index 0000000..c15abe9 --- /dev/null +++ b/models/i3d/config_train.yaml @@ -0,0 +1,37 @@ +# Preprocessing +clip_length: 64 # Number of frames within a clip +clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) +clip_stride: 1 # Frame offset between successive frames +crop_shape: [224,224] # (Height, Width) of frame +crop_type: Center # Type of cropping operation (Random, Central and None) +final_shape: [224,224] # (Height, Width) of input to be given to CNN +num_clips: -1 # Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips) +random_offset: 0 # Boolean switch to generate a clip length sized clip from a video +resize_shape: [230,250] # (Height, Width) to resize original data +subtract_mean: [123,117,104] # Subtract mean (R,G,B) from all frames during preprocessing + +# Experiment Setup +acc_metric: Accuracy # Accuracy metric +batch_size: 5 # Numbers of videos in a mini-batch +pseudo_batch_loop: 10 # Pseudo-batch size multiplier to mimic large minibatches +dataset: HMDB51 # Name of dataset +epoch: 30 # Total number of epochs +exp: I3D # Experiment name +gamma: 0.1 # Multiplier with which to change learning rate +json_path: /z/dat/HMDB51/ # Path to the json file for the given dataset +labels: 51 # Number of total classes in the dataset +load_type: train # Environment selection, to include only training/training and validation/testing dataset +loss_type: M_XENTROPY # Loss function +lr: 0.01 # Learning rate +milestones: [10, 20] # Epoch values to change learning rate +model: I3D # Name of model to be loaded +momentum: 0.9 # Momentum value in optimizer +num_workers: 5 # Number of CPU worker used to load data +opt: sgd # Name of optimizer +preprocess: default # String argument to select preprocessing type +pretrained: 1 # Load pretrained network +rerun: 1 # Number of trials to repeat an experiment +save_dir: './results' # Path to results directory +seed: 999 # Seed for reproducibility +weight_decay: 0.0005 # Weight decay +grad_max_norm: 100 diff --git a/models/i3d/i3d.py b/models/i3d/i3d.py new file mode 100644 index 0000000..4e901b3 --- /dev/null +++ b/models/i3d/i3d.py @@ -0,0 +1,447 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +import datasets.preprocessing_transforms as pt + +import numpy as np + +import os +import sys +from collections import OrderedDict + + +""" +Code from the implementation of i3d by AJ Piergiovanni: https://github.com/piergiaj/pytorch-i3d +""" + +class MaxPool3dSamePadding(nn.MaxPool3d): + + def compute_pad(self, dim, s): + if s % self.stride[dim] == 0: + return max(self.kernel_size[dim] - self.stride[dim], 0) + else: + return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) + + def forward(self, x): + # compute 'same' padding + (batch, channel, t, h, w) = x.size() + #print t,h,w + out_t = np.ceil(float(t) / float(self.stride[0])) + out_h = np.ceil(float(h) / float(self.stride[1])) + out_w = np.ceil(float(w) / float(self.stride[2])) + #print out_t, out_h, out_w + pad_t = self.compute_pad(0, t) + pad_h = self.compute_pad(1, h) + pad_w = self.compute_pad(2, w) + #print pad_t, pad_h, pad_w + + pad_t_f = pad_t // 2 + pad_t_b = pad_t - pad_t_f + pad_h_f = pad_h // 2 + pad_h_b = pad_h - pad_h_f + pad_w_f = pad_w // 2 + pad_w_b = pad_w - pad_w_f + + pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) + x = F.pad(x, 
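Note: the `compute_pad` helpers in `MaxPool3dSamePadding` (and in `Unit3D` further down) reproduce TensorFlow-style 'SAME' padding, which chooses the total padding so the output length is `ceil(input / stride)`. A quick one-dimensional sketch of that rule, standalone and only meant to illustrate the arithmetic:

```python
import math

def same_pad(size, kernel, stride):
    """Total padding for one dimension under TF 'SAME' rules,
    matching the compute_pad logic above."""
    if size % stride == 0:
        return max(kernel - stride, 0)
    return max(kernel - (size % stride), 0)

for size, kernel, stride in [(64, 3, 2), (7, 3, 2), (112, 7, 2)]:
    pad = same_pad(size, kernel, stride)
    out = (size + pad - kernel) // stride + 1
    assert out == math.ceil(size / stride)
    print(size, kernel, stride, '-> pad', pad, 'output', out)
```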
pad) + return super(MaxPool3dSamePadding, self).forward(x) + + +class Unit3D(nn.Module): + + def __init__(self, in_channels, + output_channels, + kernel_shape=(1, 1, 1), + stride=(1, 1, 1), + padding=0, + activation_fn=F.relu, + use_batch_norm=True, + use_bias=False, + name='unit_3d', + dilation=1): + + """Initializes Unit3D module.""" + super(Unit3D, self).__init__() + + self._output_channels = output_channels + self._kernel_shape = kernel_shape + self._stride = stride + self._use_batch_norm = use_batch_norm + self._activation_fn = activation_fn + self._use_bias = use_bias + self.name = name + self.padding = padding + + self.conv3d = nn.Conv3d(in_channels=in_channels, + out_channels=self._output_channels, + kernel_size=self._kernel_shape, + stride=self._stride, + padding=0, # we always want padding to be 0 here. We will dynamically pad based on input size in forward function + bias=self._use_bias, + dilation=dilation) + + if self._use_batch_norm: + self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01) + + def compute_pad(self, dim, s): + if s % self._stride[dim] == 0: + return max(self._kernel_shape[dim] - self._stride[dim], 0) + else: + return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) + + + def forward(self, x): + # compute 'same' padding + (batch, channel, t, h, w) = x.size() + #print t,h,w + out_t = np.ceil(float(t) / float(self._stride[0])) + out_h = np.ceil(float(h) / float(self._stride[1])) + out_w = np.ceil(float(w) / float(self._stride[2])) + #print out_t, out_h, out_w + pad_t = self.compute_pad(0, t) + pad_h = self.compute_pad(1, h) + pad_w = self.compute_pad(2, w) + #print pad_t, pad_h, pad_w + + pad_t_f = pad_t // 2 + pad_t_b = pad_t - pad_t_f + pad_h_f = pad_h // 2 + pad_h_b = pad_h - pad_h_f + pad_w_f = pad_w // 2 + pad_w_b = pad_w - pad_w_f + + pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) + x = F.pad(x, pad) + + x = self.conv3d(x) + if self._use_batch_norm: + x = self.bn(x) + if self._activation_fn is not None: + x = self._activation_fn(x) + return x + + + +class InceptionModule(nn.Module): + def __init__(self, in_channels, out_channels, name): + super(InceptionModule, self).__init__() + + self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, + name=name+'/Branch_0/Conv3d_0a_1x1') + self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, + name=name+'/Branch_1/Conv3d_0a_1x1') + self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], + name=name+'/Branch_1/Conv3d_0b_3x3') + self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, + name=name+'/Branch_2/Conv3d_0a_1x1') + self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], + name=name+'/Branch_2/Conv3d_0b_3x3') + self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], + stride=(1, 1, 1), padding=0) + self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0, + name=name+'/Branch_3/Conv3d_0b_1x1') + self.name = name + + def forward(self, x): + b0 = self.b0(x) + b1 = self.b1b(self.b1a(x)) + b2 = self.b2b(self.b2a(x)) + b3 = self.b3b(self.b3a(x)) + return torch.cat([b0,b1,b2,b3], dim=1) + + +class I3D(nn.Module): + """Inception-v1 I3D architecture. + The model is introduced in: + Quo Vadis, Action Recognition? 
A New Model and the Kinetics Dataset + Joao Carreira, Andrew Zisserman + https://arxiv.org/pdf/1705.07750v1.pdf. + See also the Inception architecture, introduced in: + Going deeper with convolutions + Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, + Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. + http://arxiv.org/pdf/1409.4842v1.pdf. + """ + + # Endpoints of the model in order. During construction, all the endpoints up + # to a designated `final_endpoint` are returned in a dictionary as the + # second return value. + VALID_ENDPOINTS = ( + 'Conv3d_1a_7x7', + 'MaxPool3d_2a_3x3', + 'Conv3d_2b_1x1', + 'Conv3d_2c_3x3', + 'MaxPool3d_3a_3x3', + 'Mixed_3b', + 'Mixed_3c', + 'MaxPool3d_4a_3x3', + 'Mixed_4b', + 'Mixed_4c', + 'Mixed_4d', + 'Mixed_4e', + 'Mixed_4f', + 'MaxPool3d_5a_2x2', + 'Mixed_5b', + 'Mixed_5c', + 'Logits', + 'Predictions', + ) + + def __init__(self, spatial_squeeze=True, + final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5, **kwargs): + """Initializes I3D model instance. + Args: + num_classes: The number of outputs in the logit layer (default 400, which + matches the Kinetics dataset). + spatial_squeeze: Whether to squeeze the spatial dimensions for the logits + before returning (default True). + final_endpoint: The model contains many possible endpoints. + `final_endpoint` specifies the last endpoint for the model to be built + up to. In addition to the output at `final_endpoint`, all the outputs + at endpoints up to `final_endpoint` will also be returned, in a + dictionary. `final_endpoint` must be one of + InceptionI3d.VALID_ENDPOINTS (default 'Logits'). + name: A string (optional). The name of this module. + Raises: + ValueError: if `final_endpoint` is not recognized. 
+ """ + + if final_endpoint not in self.VALID_ENDPOINTS: + raise ValueError('Unknown final endpoint %s' % final_endpoint) + + super(I3D, self).__init__() + self._num_classes = kwargs['labels'] + self._spatial_squeeze = spatial_squeeze + self._final_endpoint = final_endpoint + self.logits = None + + self.train_transforms = PreprocessTrain(**kwargs) + self.test_transforms = PreprocessEval(**kwargs) + + + if self._final_endpoint not in self.VALID_ENDPOINTS: + raise ValueError('Unknown final endpoint %s' % self._final_endpoint) + + self.end_points = {} + end_point = 'Conv3d_1a_7x7' + self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], + stride=(2, 2, 2), padding=(3,3,3), name=name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'MaxPool3d_2a_3x3' + self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), + padding=0) + if self._final_endpoint == end_point: return + + end_point = 'Conv3d_2b_1x1' + self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, + name=name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Conv3d_2c_3x3' + self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, + name=name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'MaxPool3d_3a_3x3' + self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), + padding=0) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_3b' + self.end_points[end_point] = InceptionModule(192, [64,96,128,16,32,32], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_3c' + self.end_points[end_point] = InceptionModule(256, [128,128,192,32,96,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'MaxPool3d_4a_3x3' + self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), + padding=0) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4b' + self.end_points[end_point] = InceptionModule(128+192+96+64, [192,96,208,16,48,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4c' + self.end_points[end_point] = InceptionModule(192+208+48+64, [160,112,224,24,64,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4d' + self.end_points[end_point] = InceptionModule(160+224+64+64, [128,128,256,24,64,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4e' + self.end_points[end_point] = InceptionModule(128+256+64+64, [112,144,288,32,64,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4f' + self.end_points[end_point] = InceptionModule(112+288+64+64, [256,160,320,32,128,128], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'MaxPool3d_5a_2x2' + self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2), + padding=0) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_5b' + self.end_points[end_point] = InceptionModule(256+320+128+128, [256,160,320,32,128,128], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_5c' + self.end_points[end_point] = InceptionModule(256+320+128+128, [384,192,384,48,128,128], name+end_point) + if self._final_endpoint == end_point: return + + 
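Note: each `InceptionModule` concatenates its four branches along the channel axis, so the input channel count of every module above equals the sum of the previous module's branch outputs (indices 0, 2, 4 and 5 of its `out_channels` list). A quick check of that bookkeeping:

```python
# out_channels lists as passed to InceptionModule above; the concatenated
# output width is out[0] + out[2] + out[4] + out[5] (branches b0, b1b, b2b, b3b).
modules = {
    'Mixed_3b': [64, 96, 128, 16, 32, 32],
    'Mixed_3c': [128, 128, 192, 32, 96, 64],
    'Mixed_4f': [256, 160, 320, 32, 128, 128],
    'Mixed_5c': [384, 192, 384, 48, 128, 128],
}

def concat_channels(out):
    return out[0] + out[2] + out[4] + out[5]

print(concat_channels(modules['Mixed_3b']))  # 256  -> input of Mixed_3c
print(concat_channels(modules['Mixed_3c']))  # 480  -> input of Mixed_4b
print(concat_channels(modules['Mixed_4f']))  # 832  -> input of Mixed_5b
print(concat_channels(modules['Mixed_5c']))  # 1024 -> input of the Logits Unit3D
```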
end_point = 'Logits' + self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], + stride=(1, 1, 1)) + self.dropout = nn.Dropout(dropout_keep_prob) + self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes, + kernel_shape=[1, 1, 1], + padding=0, + activation_fn=None, + use_batch_norm=False, + use_bias=True, + name='logits') + + + + + + self.build() + + if 'pretrained' in kwargs.keys() and kwargs['pretrained']: + if 'i3d_pretrained' in kwargs.keys(): + self._load_checkpoint(kwargs['i3d_pretrained']) + else: + self._load_pretrained_weights() + + def _load_pretrained_weights(self): + p_dict = torch.load('weights/i3d_rgb_imagenet.pt') + s_dict = self.state_dict() + for name in p_dict: + if name in s_dict.keys(): + if p_dict[name].shape == s_dict[name].shape: + s_dict[name] = p_dict[name] + + self.load_state_dict(s_dict) + + def _load_checkpoint(self, saved_weights): + p_dict = torch.load(saved_weights)['state_dict'] + s_dict = self.state_dict() + for name in p_dict: + if name in s_dict.keys(): + if p_dict[name].shape == s_dict[name].shape: + s_dict[name] = p_dict[name] + + self.load_state_dict(s_dict) + + + + def replace_logits(self, num_classes): + self._num_classes = num_classes + self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes, + kernel_shape=[1, 1, 1], + padding=0, + activation_fn=None, + use_batch_norm=False, + use_bias=True, + name='logits') + + + def build(self): + for k in self.end_points.keys(): + self.add_module(k, self.end_points[k]) + + def forward(self, x): + for end_point in self.VALID_ENDPOINTS: + if end_point in self.end_points: + x = self._modules[end_point](x) # use _modules to work with dataparallel + + x = self.logits(self.dropout(self.avg_pool(x))) + + if self._spatial_squeeze: + logits = x.squeeze(3).squeeze(3) + # logits is batch X classes X time, which is what we want to work with + + logits = torch.mean(logits, dim=2) + return logits + + + def extract_features(self, x): + for end_point in self.VALID_ENDPOINTS: + if end_point in self.end_points: + x = self._modules[end_point](x) + return self.avg_pool(x) + +class PreprocessTrain(object): + """ + Container for all transforms used to preprocess clips for training in this dataset. + """ + def __init__(self, **kwargs): + """ + Initialize preprocessing class for training set + Args: + preprocess (String): Keyword to select different preprocessing types + crop_type (String): Select random or central crop + + Return: + None + """ + + self.transforms = [] + self.transforms1 = [] + self.preprocess = kwargs['preprocess'] + crop_type = kwargs['crop_type'] + + + self.transforms.append(pt.ResizeClip(**kwargs)) + + if crop_type == 'Random': + self.transforms.append(pt.RandomCropClip(**kwargs)) + + else: + self.transforms.append(pt.CenterCropClip(**kwargs)) + + self.transforms.append(pt.SubtractRGBMean(**kwargs)) + self.transforms.append(pt.RandomFlipClip(direction='h', p=0.5, **kwargs)) + self.transforms.append(pt.ToTensorClip(**kwargs)) + + def __call__(self, input_data): + for transform in self.transforms: + input_data = transform(input_data) + + return input_data + + +class PreprocessEval(object): + """ + Container for all transforms used to preprocess clips for training in this dataset. 
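Note: `_load_pretrained_weights` and `_load_checkpoint` copy only the parameters whose names and shapes match the current model, which is what lets a pretrained I3D be reused after `replace_logits` changes the number of output classes. A generic sketch of that pattern; the weight path is a placeholder:

```python
import torch

def load_matching_weights(model, weights_path):
    """Copy only parameters whose name and shape match the current model,
    mirroring I3D._load_pretrained_weights above (path is a placeholder)."""
    pretrained = torch.load(weights_path, map_location='cpu')
    own_state = model.state_dict()
    for name, tensor in pretrained.items():
        if name in own_state and own_state[name].shape == tensor.shape:
            own_state[name] = tensor
    model.load_state_dict(own_state)
    return model
```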
+ """ + def __init__(self, **kwargs): + """ + Initialize preprocessing class for training set + Args: + preprocess (String): Keyword to select different preprocessing types + crop_type (String): Select random or central crop + + Return: + None + """ + + self.transforms = [] + + self.transforms.append(pt.ResizeClip(**kwargs)) + self.transforms.append(pt.CenterCropClip(**kwargs)) + self.transforms.append(pt.SubtractRGBMean(**kwargs)) + self.transforms.append(pt.ToTensorClip(**kwargs)) + + + def __call__(self, input_data): + for transform in self.transforms: + input_data = transform(input_data) + + return input_data diff --git a/models/ssd/config_test.yaml b/models/ssd/config_test.yaml index a521b6d..2b630e2 100644 --- a/models/ssd/config_test.yaml +++ b/models/ssd/config_test.yaml @@ -1,7 +1,7 @@ # Preprocessing clip_length: 1 # Number of frames within a clip clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) -clip_stride: 0 # Frame offset between successive frames +clip_stride: 1 # Frame offset between successive frames crop_shape: [112,112] # (Height, Width) of frame crop_type: None # Type of cropping operation (Random, Central and None) final_shape: [300,300] # (Height, Width) of input to be given to CNN diff --git a/parse_args.py b/parse_args.py index 8c90b31..3e7c8fe 100644 --- a/parse_args.py +++ b/parse_args.py @@ -47,9 +47,12 @@ def __init__(self): parser.add_argument('--crop_shape', type=int, nargs=2, help='(Height, Width) of frame') parser.add_argument('--crop_type', type=str, help='Type of cropping operation (Random, Center and None)') parser.add_argument('--num_clips', type=int, help='Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips)') + parser.add_argument('--scale', type=float, nargs=2, help='[min scale, max scale] amounts to randomly scale videos for augmentation purposes. scale >1 zooms in and scale <1 zooms out. 
') + parser.add_argument('--debug', type=int, help='Run an experiment but do not save any data or create any folders') parser.add_argument('--seed', type=int, help='Seed for reproducibility') + parser.add_argument('--resume', type=int, help='Flag to resume training or switch to alternate objective after loading') # Default dict, anything not present is required to exist as an argument or in yaml file self.defaults = dict( @@ -76,7 +79,9 @@ def __init__(self): crop_type = None, num_clips = 1, debug = 0, - seed = 0) + seed = 0, + scale = [1,1], + resume = 0) @@ -109,4 +114,12 @@ def get_args(self): if k not in yaml_keys: self.cfg_args[k] = self.defaults[k] + + # Force clip_stride to be >= 1 when extracting clips from a video + # This represents the # of frames between successive clips + if self.cfg_args['clip_stride'] < 1: + self.cfg_args['clip_stride'] = 1 + + + return self.cfg_args diff --git a/requirements.txt b/requirements.txt index 5fa42ed..81fea3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy==1.17.0 opencv-python==4.1.0.25 -Pillow==6.1.0 +pillow>=6.2.0 protobuf==3.9.0 PyYAML==5.1.1 scipy==1.3.0 @@ -9,3 +9,6 @@ tensorboardX==1.8 scipy==1.3.0 torch==1.1.0 torchvision==0.3.0 + +torchtext==0.2.1 +spacy==2.1.8 diff --git a/train.py b/train.py index 1876c5c..d889d84 100644 --- a/train.py +++ b/train.py @@ -106,23 +106,26 @@ def train(**args): scheduler = MultiStepLR(optimizer, milestones=args['milestones'], gamma=args['gamma']) if isinstance(args['pretrained'], str): - ckpt = load_checkpoint(args['pretrained']) + ckpt = load_checkpoint(args['pretrained']) model.load_state_dict(ckpt) - start_epoch = load_checkpoint(args['pretrained'], key_name='epoch') + 1 - optimizer.load_state_dict(load_checkpoint(args['pretrained'], key_name='optimizer')) - for quick_looper in range(start_epoch): - scheduler.step() + if args['resume']: + start_epoch = load_checkpoint(args['pretrained'], key_name='epoch') + 1 - # END FOR + optimizer.load_state_dict(load_checkpoint(args['pretrained'], key_name='optimizer')) + scheduler.step(epoch=start_epoch) + + else: + start_epoch = 0 + + # END IF else: start_epoch = 0 # END IF - model_loss = Losses(device=device, **args) - acc_metric = Metrics(**args) + model_loss = Losses(device=device, **args) best_val_acc = 0.0 ############################################################################################################################################################################ @@ -139,20 +142,32 @@ def train(**args): for step, data in enumerate(train_loader): if step% args['pseudo_batch_loop'] == 0: loss = 0.0 + running_batch = 0 optimizer.zero_grad() # END IF - x_input = data['data'].to(device) - annotations = data['annots'] + x_input = data['data'] + annotations = data['annots'] + + if isinstance(x_input, torch.Tensor): + mini_batch_size = x_input.shape[0] + outputs = model(x_input.to(device)) + + assert args['final_shape']==list(x_input.size()[-2:]), "Input to model does not match final_shape argument" + else: #Model takes several inputs in forward function + mini_batch_size = x_input[0].shape[0] #Assuming the first element contains the true data input + for i, item in enumerate(x_input): + if isinstance(item, torch.Tensor): + x_input[i] = item.to(device) + outputs = model(*x_input) - assert args['final_shape']==list(x_input.size()[-2:]), "Input to model does not match final_shape argument" - outputs = model(x_input) loss = model_loss.loss(outputs, annotations) - loss = loss * args['batch_size'] + loss = loss * mini_batch_size 
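Note: the `pseudo_batch_loop` logic in `train()` scales each mini-batch loss by its size, accumulates gradients, and only calls `optimizer.step()` after rescaling by the total number of samples seen, which mimics a larger effective batch on limited GPU memory. A condensed sketch of that loop; the model, loader and hyper-parameters are placeholders:

```python
import torch
import torch.nn as nn

def pseudo_batch_train(model, loader, optimizer, loss_fn,
                       pseudo_batch_loop=10, grad_max_norm=100.0):
    # Condensed version of the accumulation pattern in train.py; names are placeholders.
    running_batch = 0
    optimizer.zero_grad()
    for step, (x, y) in enumerate(loader):
        out = model(x)
        bsz = x.shape[0]
        loss = loss_fn(out, y) * bsz          # undo the default mean reduction
        loss.backward()                        # gradients accumulate across steps
        running_batch += bsz
        if (step + 1) % pseudo_batch_loop == 0:
            for p in model.parameters():       # normalise by the true sample count
                if p.requires_grad and p.grad is not None:
                    p.grad /= float(running_batch)
            nn.utils.clip_grad_norm_(model.parameters(), grad_max_norm)
            optimizer.step()
            optimizer.zero_grad()
            running_batch = 0
```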
diff --git a/train.py b/train.py
index 1876c5c..d889d84 100644
--- a/train.py
+++ b/train.py
@@ -106,23 +106,26 @@ def train(**args):
     scheduler = MultiStepLR(optimizer, milestones=args['milestones'], gamma=args['gamma'])
 
     if isinstance(args['pretrained'], str):
-        ckpt = load_checkpoint(args['pretrained'])
+        ckpt        = load_checkpoint(args['pretrained'])
         model.load_state_dict(ckpt)
 
-        start_epoch = load_checkpoint(args['pretrained'], key_name='epoch') + 1
-        optimizer.load_state_dict(load_checkpoint(args['pretrained'], key_name='optimizer'))
-        for quick_looper in range(start_epoch):
-            scheduler.step()
+        if args['resume']:
+            start_epoch = load_checkpoint(args['pretrained'], key_name='epoch') + 1
 
-        # END FOR
+            optimizer.load_state_dict(load_checkpoint(args['pretrained'], key_name='optimizer'))
+            scheduler.step(epoch=start_epoch)
+
+        else:
+            start_epoch = 0
+
+        # END IF
 
     else:
         start_epoch = 0
 
     # END IF
 
-    model_loss = Losses(device=device, **args)
-    acc_metric = Metrics(**args)
+    model_loss   = Losses(device=device, **args)
     best_val_acc = 0.0
@@ -139,20 +142,32 @@
         for step, data in enumerate(train_loader):
             if step% args['pseudo_batch_loop'] == 0:
                 loss = 0.0
+                running_batch = 0
                 optimizer.zero_grad()
 
             # END IF
 
-            x_input     = data['data'].to(device)
-            annotations = data['annots']
+            x_input     = data['data']
+            annotations = data['annots']
+
+            if isinstance(x_input, torch.Tensor):
+                mini_batch_size = x_input.shape[0]
+                outputs = model(x_input.to(device))
+
+                assert args['final_shape']==list(x_input.size()[-2:]), "Input to model does not match final_shape argument"
+            else: #Model takes several inputs in forward function
+                mini_batch_size = x_input[0].shape[0] #Assuming the first element contains the true data input
+                for i, item in enumerate(x_input):
+                    if isinstance(item, torch.Tensor):
+                        x_input[i] = item.to(device)
+                outputs = model(*x_input)
 
-            assert args['final_shape']==list(x_input.size()[-2:]), "Input to model does not match final_shape argument"
-            outputs = model(x_input)
             loss = model_loss.loss(outputs, annotations)
-            loss = loss * args['batch_size']
+            loss = loss * mini_batch_size
             loss.backward()
 
-            running_loss += loss.item()
+            running_loss  += loss.item()
+            running_batch += mini_batch_size
 
             if np.isnan(running_loss):
                 import pdb; pdb.set_trace()
@@ -167,26 +182,37 @@
             # END FOR
 
             # Add Loss Element
-            writer.add_scalar(args['dataset']+'/'+args['model']+'/minibatch_loss', loss.item()/args['batch_size'], epoch*len(train_loader) + step)
+            writer.add_scalar(args['dataset']+'/'+args['model']+'/minibatch_loss', loss.item()/mini_batch_size, epoch*len(train_loader) + step)
 
             # END IF
 
             if ((epoch*len(train_loader) + step+1) % 100 == 0):
-                print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.format(epoch, args['epoch'], step+1, len(train_loader), running_loss/float(step+1)/args['batch_size']))
+                print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.format(epoch, args['epoch'], step+1, len(train_loader), running_loss/float(step+1)/mini_batch_size))
 
             # END IF
 
             if (epoch * len(train_loader) + (step+1)) % args['pseudo_batch_loop'] == 0 and step > 0:
                 # Apply large mini-batch normalization
                 for param in model.parameters():
-                    param.grad *= 1./float(args['pseudo_batch_loop']*args['batch_size'])
-                optimizer.step()
+                    if param.requires_grad:
+                        param.grad *= 1./float(running_batch)
+                # END FOR
+
+                # Apply gradient clipping
+                if ("grad_max_norm" in args) and float(args['grad_max_norm']) > 0:
+                    nn.utils.clip_grad_norm_(model.parameters(), float(args['grad_max_norm']))
+
+                optimizer.step()
+                running_batch = 0
 
             # END IF
 
         # END FOR: Epoch
+
+        scheduler.step(epoch=epoch)
+        print('Schedulers lr: %f' % scheduler.get_lr()[0])
 
         if not args['debug']:
             # Save Current Model
@@ -195,14 +221,13 @@ def train(**args):
 
         # END IF: Debug
 
-        scheduler.step(epoch=epoch)
-        print('Schedulers lr: %f', scheduler.get_lr()[0])
-
         ## START FOR: Validation Accuracy
         running_acc = []
-        running_acc = valid(valid_loader, running_acc, model, device, acc_metric)
+        running_acc = valid(valid_loader, running_acc, model, device)
+
         if not args['debug']:
-            writer.add_scalar(args['dataset']+'/'+args['model']+'/validation_accuracy', 100.*running_acc[-1], epoch*len(valid_loader) + step)
+            writer.add_scalar(args['dataset']+'/'+args['model']+'/validation_accuracy', 100.*running_acc[-1], epoch*len(train_loader) + step)
+
         print('Accuracy of the network on the validation set: %f %%\n' % (100.*running_acc[-1]))
 
         # Save Best Validation Accuracy Model Separately
@@ -226,16 +251,27 @@ def train(**args):
 
     # Close Tensorboard Element
     writer.close()
 
-def valid(valid_loader, running_acc, model, device, acc_metric):
+def valid(valid_loader, running_acc, model, device):
+    acc_metric = Metrics(**args)
     model.eval()
-
+
     with torch.no_grad():
         for step, data in enumerate(valid_loader):
-            x_input     = data['data'].to(device)
+            x_input     = data['data']
             annotations = data['annots']
-            outputs = model(x_input)
+
+            if isinstance(x_input, torch.Tensor):
+                outputs = model(x_input.to(device))
+            else:
+                for i, item in enumerate(x_input):
+                    if isinstance(item, torch.Tensor):
+                        x_input[i] = item.to(device)
+                outputs = model(*x_input)
 
             running_acc.append(acc_metric.get_accuracy(outputs, annotations))
+
+            if step % 100 == 0:
+                print('Step: {}/{} | validation acc: {:.4f}'.format(step, len(valid_loader), running_acc[-1]))
 
         # END FOR: Validation Accuracy
@@ -250,6 +286,8 @@ def valid(valid_loader, running_acc, model, device, acc_metric):
 
     # For reproducibility
     torch.backends.cudnn.deterministic = True
     torch.manual_seed(args['seed'])
-    #np.random.seed(args['seed']+1)
+
+    if not args['resume']:
+        np.random.seed(args['seed'])
 
     train(**args)
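The pseudo-batch update above accumulates gradients over several mini-batches, scales each loss by its true mini-batch size, rescales the accumulated gradients by the total number of samples seen (`running_batch`), optionally clips the global gradient norm when `grad_max_norm` is positive, and only then steps the optimizer. A compact sketch of that sequence on a toy model (the model, data, and the `pseudo_batch_loop`/`grad_max_norm` values below are illustrative assumptions, not ViP code):

```
# Gradient-accumulation ("pseudo batch") update mirroring the train.py changes:
# sum gradients over several mini-batches, divide by the number of samples seen,
# optionally clip the global norm, then apply a single optimizer step.
import torch
import torch.nn as nn

model     = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn   = nn.CrossEntropyLoss()

pseudo_batch_loop = 4     # mini-batches accumulated per optimizer step
grad_max_norm     = 1.0   # <= 0 would skip clipping, as in the check above

optimizer.zero_grad()
running_batch = 0
for step in range(pseudo_batch_loop):
    x = torch.randn(3, 8)                      # toy mini-batch of 3 samples
    y = torch.randint(0, 2, (3,))
    loss = loss_fn(model(x), y) * x.shape[0]   # scale loss by mini-batch size
    loss.backward()                            # gradients accumulate in .grad
    running_batch += x.shape[0]

# Normalize accumulated gradients by the total number of samples seen
for param in model.parameters():
    if param.requires_grad and param.grad is not None:
        param.grad *= 1.0 / float(running_batch)

# Apply gradient clipping before the single parameter update
if grad_max_norm > 0:
    nn.utils.clip_grad_norm_(model.parameters(), grad_max_norm)

optimizer.step()
```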
diff --git a/weights/download_weights.sh b/weights/download_weights.sh
index 3cbfec6..7fa3d64 100755
--- a/weights/download_weights.sh
+++ b/weights/download_weights.sh
@@ -2,9 +2,6 @@
 
 #wget -O [saved_file_name] [direct_download_link]
 
-#GoTurn
-wget -O ./weights/goturn.pth.tar https://umich.box.com/shared/static/src6rfm4lpn0v3t4l26d6u0v4ixdwem5.tar
-
 #SSD
 wget -O ./weights/ssd300_mAP_77.43_v2.pkl https://umich.box.com/shared/static/jszcnnwcvscfyqe3o81xy8qzfbsc20vo.pkl
 
@@ -13,3 +10,12 @@ wget -O ./weights/c3d-pretrained.pth https://umich.box.com/shared/static/znmyt8u
 
 #C3D Mean
 wget -O ./weights/sport1m_train16_128_mean.npy https://umich.box.com/shared/static/ppbnldsa5rty615osdjh2yi8fqcx0a3b.npy
+
+#YC2BB-Full model
+wget -O ./weights/yc2bb_full-model.pth https://umich.box.com/shared/static/5ukbdcawryzkkq4r789z0src6u6uvg3u.pth
+
+#I3D pretrained on ImageNet and then Kinetics by original authors
+wget -O ./weights/i3d_rgb_imagenet.pt https://umich.box.com/shared/static/5m6dwwepzdcw3kjhx7s0peb59lbcde0s.pt
+
+#I3D pretrained on ImageNet, Kinetics, then on HMDB51 in ViP
+wget -O ./weights/i3d_rgb_imagenet_then_HMDB51_30epochs.pkl https://umich.box.com/shared/static/x8x83sw4htidxsxgtus9nt00f383mmm7.pkl
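After the script finishes, it can be worth confirming that a downloaded checkpoint deserializes cleanly before pointing an experiment at it. A small optional check (only the file path comes from the script above; everything else is illustrative):

```
# Optional sanity check: make sure a downloaded weight file loads in PyTorch.
import torch

ckpt_path = './weights/i3d_rgb_imagenet.pt'        # path written by download_weights.sh
state = torch.load(ckpt_path, map_location='cpu')  # load on CPU so no GPU is required

if isinstance(state, dict):
    # state_dict-style checkpoints map parameter names to tensors
    print('Loaded {} entries, e.g. {}'.format(len(state), sorted(state)[:3]))
else:
    print('Loaded object of type: {}'.format(type(state)))
```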