From a5914b5a2de166903410da2b99b7a866307e39aa Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 9 Aug 2019 14:42:58 -0400 Subject: [PATCH 01/55] Add script to convert YC2-BB annotations --- datasets/scripts/gen_json_yc2bb.py | 90 ++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 datasets/scripts/gen_json_yc2bb.py diff --git a/datasets/scripts/gen_json_yc2bb.py b/datasets/scripts/gen_json_yc2bb.py new file mode 100644 index 0000000..337a156 --- /dev/null +++ b/datasets/scripts/gen_json_yc2bb.py @@ -0,0 +1,90 @@ +#Convert YC2-BB JSON annotation files to ViP JSON format + +import os +import json + +source_root = '$ANNOTATIONS_ROOT/annotations' #replace this value +target_root = '$JSON_TARGET_ROOT' #replace this value +#Link to videos sampled at 1 fps +frame_root = '$SAMPLED_FRAMES_ROOT' #replace this value +files = ['yc2_training_vid.json', 'yc2_bb_val_annotations.json', 'yc2_bb_public_test_annotations.json'] + +splits = ['train', 'val', 'test'] +ann_files = [os.path.join(source_root, f) for f in files] + +for split, ann_file in zip(splits, ann_files): + + + #YC2 split names, slightly different + if split == 'train': + split_name = 'training' + elif split == 'val': + split_name = 'validation' + else: + split_name = 'testing' + + with open(ann_file) as f: + ann_json_data = json.load(f) + + yc2_json_data = ann_json_data['database'] + json_data = [] + + for vid_name in yc2_json_data.keys(): + frm_height = yc2_json_data[vid_name]['rheight'] + frm_width = yc2_json_data[vid_name]['rwidth'] + recipe_type = yc2_json_data[vid_name]['recipe_type'] + yc2_segments = yc2_json_data[vid_name]['segments'] + + #Loop through segments, YC2 breaks down all each video into segment clips + for seg,item in sorted(yc2_segments.items()): + base_path = os.path.join(frame_root, split_name, recipe_type, vid_name, str(seg).zfill(2)) + frames = [] + if 'objects' in item: #validation or testing file + num_objs = len(item['objects']) + num_frames = len(item['objects'][0]['boxes']) + + #Loop through frames + for f in range(num_frames): + frame = {} + objs = [] + + #Loop through objects + for track_id in range(num_objs): + obj = item['objects'][track_id] + + cls_name = obj['label'] + box = obj['boxes'][f] + + if len(box) == 0: #No annotations + continue + + xmin = box['xtl'] + ymin = box['ytl'] + xmax = box['xbr'] + ymax = box['ybr'] + + outside = box['outside'] #outside or inside of frame + occluded = box['occluded'] + + objs.append({'trackid':track_id, 'c':cls_name, 'occ':occluded, 'outside':outside, 'bbox':[xmin, ymin, xmax, ymax]}) + + frame['img_path'] = os.path.join(base_path, str(seg).zfill(2), str(f).zfill(2)+'.jpg') + frame['objs'] = objs + frame['seg'] = seg + frames.append(frame) + else: #training annotation file + frame = {} + objs = [] + + frame['sentence'] = yc2_segments[seg]['sentence'] + frame['objs'] = objs + frame['seg'] = seg + frames.append(frame) + + json_data.append({'frames':frames, 'base_path':base_path, 'frame_size':[frm_width, frm_height], 'recipe_type':recipe_type}) + + target_file = os.path.join(target_root, split+'.json') + print('Writing out to: {}'.format(target_file)) + with open(target_file, 'w') as f: + json.dump(json_data, f) + From 551cdc22618ed9c0d61bf2fa480c463eff01f529 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sat, 10 Aug 2019 17:54:58 -0400 Subject: [PATCH 02/55] YC2BB dataset configured for validation and testing splits --- datasets/YC2BB.py | 230 +++++++++++++++++++++++++++++ datasets/scripts/gen_json_yc2bb.py | 8 +- 2 files changed, 232 
insertions(+), 6 deletions(-) create mode 100644 datasets/YC2BB.py diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py new file mode 100644 index 0000000..6a54ae0 --- /dev/null +++ b/datasets/YC2BB.py @@ -0,0 +1,230 @@ +#Adapted from: https://github.com/MichiganCOG/Video-Grounding-from-Text +import torch +from .abstract_datasets import DetectionDataset +from PIL import Image +import cv2 +import os +import csv +import numpy as np + +import torchtext + +class YC2BB(DetectionDataset): + ''' + YouCook2-Bounding Boxes dataset. Used in weakly-supervised video object grounding task + Paper: https://arxiv.org/pdf/1805.02834.pdf + ''' + def __init__(self, *args, **kwargs): + super(YC2BB, self).__init__(*args, **kwargs) + + #Define the following configuration parameters in your config_*.yaml file + #Or as a system arg + class_file = kwargs['yc2bb_class_file'] + num_proposals = kwargs['yc2bb_num_proposals'] + rpn_proposal_root = kwargs['yc2bb_rpn_proposal_root'] + roi_pooled_feat_root = kwargs['yc2bb_roi_pooled_feat_root'] + + self.load_type = kwargs['load_type'] + + self.max_objects = 15 + self.class_dict = _get_class_labels(class_file) + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + else: + self.transforms = kwargs['model_obj'].test_transforms + + sentences_proc, segments_tuple = _get_segments_and_sentences(self.samples, self.load_type) + + assert(len(sentences_proc) == len(segments_tuple)) + + #YC2 split names, slightly different + split_to_split = {'train':'training','val':'validation','test':'testing'} + self.yc2_split = split_to_split[self.load_type] + + # read rpn object proposals + self.rpn_dict = {} + self.rpn_chunk = [] + + total_num_proposals = 100 # always load all the proposals we have + rpn_lst_file = os.path.join(rpn_proposal_root, self.yc2_split+'-box-'+str(total_num_proposals)+'.txt') + rpn_chunk_file = os.path.join(rpn_proposal_root, self.yc2_split+'-box-'+str(total_num_proposals)+'.pth') + key_counter = len(self.rpn_dict) + with open(rpn_lst_file) as f: + rpn_lst = f.readline().split(',') + self.rpn_dict.update({r.strip():(i+key_counter) for i,r in enumerate(rpn_lst)}) + + self.rpn_chunk.append(torch.load(rpn_chunk_file)) + + self.rpn_chunk = torch.cat(self.rpn_chunk).cpu() + assert(self.rpn_chunk.size(0) == len(self.rpn_dict)) + assert(self.rpn_chunk.size(2) == 4) + + self.num_proposals = num_proposals + self.roi_pooled_feat_root = roi_pooled_feat_root + + ''' + with open(self.gt_box_file, 'r') as f: + self.data_all = json.load(f) + + # read gt bounding boxes O x T/25 x (id, ytl, xtl, ybr, xbr) + # coordinates are 0-indexed + for i, t in enumerate(segments_tuple): + vid = t[2] + seg = str(int(t[3])) + + # if video has no annotations, continue + if not vid in self.data_all['database']: + continue + + # check if ground truth bounding box exists for segment + if seg in self.data_all['database'][vid]['segments'].keys(): + s = sentences_proc[i] + inc_flag = 0 + obj_label = [] + for w in s: + if self.class_dict.get(w, -1) >= 0: + obj_label.append(self.class_dict[w]) + inc_flag = 1 + + if inc_flag: + self.sample_lst.append((t, obj_label)) + + print('# of segments for {}: {}, percentage in the raw data: {:.2f}'.format( + split_lst, len(self.sample_lst), len(self.sample_lst)/len(sentences_proc))) + ''' + + def __getitem__(self, idx): + vid_info = self.samples[idx] + + base_path = vid_info['base_path'] + width, height = vid_info['frame_size'] + num_frames_1fps = len(vid_info['frames']) + rec = base_path.split('/')[-3] + vid = base_path.split('/')[-2] 
+ seg = base_path.split('/')[-1] + + bbox_data = np.zeros((self.clip_length, num_frames_1fps, 5))-1 #[cls_label, xmin, ymin, xmax ymax] + labels = np.zeros((self.clip_length, self.max_objects))-1 + + for frame_ind in range(num_frames_1fps): + frame = vid_info['frames'][frame_ind] + #frame_path = frame['img_path'] + num_objs = len(frame['objs']) + obj_label = np.zeros((num_objs))-1 #List all unique class ids in entire segment + + # Extract bbox and label data from video info + for obj_ind, obj in enumerate(frame['objs']): + label = self.class_dict[obj['c']] + obj_label[obj_ind] = label + + if self.load_type == 'test' or self.load_type == 'train': #Annotations for test set not publicly available, train not annotated + bbox_data[frame_ind, trackid] = [label, -1, -1, -1, -1] + else: + trackid = obj['trackid'] + obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] + difficult = obj['difficult'] + + bbox_data[frame_ind, trackid, :] = [label] + obj_bbox + labels[frame_ind, trackid] = label + diff_labels[frame_ind, trackid] = difficult + + #input_data.append(cv2.imread(os.path.join(base_path, frame_path), cv2.IMREAD_COLOR)[:,:,(2,1,0)]) + + obj_label = torch.from_numpy(obj_label) + num_frames = num_frames_1fps * 25 #video sampled at 25 fps + + ''' + if self.vis_output: + image_path = os.path.join(self.image_root, split, rec, vid, seg) + img_notrans = [] + for i in range(num_frames): + img_notrans.append(self.spatial_transform_notrans(self.loader(os.path.join(image_path, '{:04d}.jpg'.format(i+1))))) + img_notrans = torch.stack(img_notrans, dim=1) # 3, T, H, W + else: + # no need to load raw images + img_notrans = torch.zeros(3, num_frames, 1, 1) # dummy + ''' + + # rpn object propoals + rpn = [] + x_rpn = [] + frm=1 + + feat_name = vid+'_'+seg+'.pth' + img_name = vid+'_'+seg+'_'+str(frm).zfill(4)+'.jpg' + x_rpn = torch.load(os.path.join(self.roi_pooled_feat_root, self.yc2_split, feat_name)) + while self.rpn_dict.get(img_name, -1) > -1: + ind = self.rpn_dict[img_name] + rpn.append(self.rpn_chunk[ind]) + frm+=1 + img_name = vid+'_'+seg+'_'+str(frm).zfill(4)+'.jpg' + + rpn = torch.stack(rpn) # number of frames x number of proposals per frame x 4 + rpn = rpn[:, :self.num_proposals, :] + + x_rpn = x_rpn.permute(2,0,1).contiguous() # encoding size x number of frames x number of proposals + x_rpn = x_rpn[:, :, :self.num_proposals] + + rpn_original = rpn-1 # convert to 1-indexed + + # normalize coordidates to 0-1 + # coordinates are 1-indexed: (x_tl, y_tl, x_br, y_br) + rpn[:, :, 0] = (rpn[:, :, 0]-0.5)/width + rpn[:, :, 2] = (rpn[:, :, 2]-0.5)/width + rpn[:, :, 1] = (rpn[:, :, 1]-0.5)/height + rpn[:, :, 3] = (rpn[:, :, 3]-0.5)/height + + assert(torch.max(rpn) <= 1) + + vis_name = '_-_'.join((self.yc2_split, rec, vid, seg)) + + ret_dict = dict() + ret_dict['data'] = (x_rpn, obj_label) + + annot_dict = dict() + annot_dict['box'] = bbox_data + annot_dict['box_label'] = labels + annot_dict['rpn'] = rpn + annot_dict['rpn_original'] = rpn_original + annot_dict['vis_name'] = vis_name + ret_dict['annots'] = annot_dict + + return ret_dict + +def _get_segments_and_sentences(data, split): + # build vocab and tokenized sentences + text_proc = torchtext.data.Field(sequential=True, tokenize='spacy', + lower=True, batch_first=True) + split_sentences = [] + split_segments = [] + + for dat in data: + rec = dat['base_path'].split('/')[-3] + vid = dat['base_path'].split('/')[-2] + seg = dat['base_path'].split('/')[-1] + frame = dat['frames'][0] + segment_labels = [] + if 'sentence' in frame: # for now, training json file only 
contains full sentence + segment_labels = frame['sentence'] + else: + for obj in frame['objs']: + segment_labels.append(obj['c']) + split_sentences.append(segment_labels) + split_segments.append((split, rec, vid, str(seg).zfill(2))) #tuple of id (split, vid, seg) + + sentences_proc = list(map(text_proc.preprocess, split_sentences)) # build vocab on train and val + + print('{} sentences in {} split'.format(len(sentences_proc), split)) + + return sentences_proc, split_segments + +def _get_class_labels(class_file): + class_dict = {} # both singular form & plural form are associated with the same label + with open(class_file) as f: + cls = csv.reader(f, delimiter=',') + for i, row in enumerate(cls): + for r in range(1, len(row)): + if row[r]: + class_dict[row[r]] = int(row[0]) + + return class_dict diff --git a/datasets/scripts/gen_json_yc2bb.py b/datasets/scripts/gen_json_yc2bb.py index 337a156..2d0baf6 100644 --- a/datasets/scripts/gen_json_yc2bb.py +++ b/datasets/scripts/gen_json_yc2bb.py @@ -16,12 +16,8 @@ #YC2 split names, slightly different - if split == 'train': - split_name = 'training' - elif split == 'val': - split_name = 'validation' - else: - split_name = 'testing' + split_to_split = {'train':'training','val':'validation','test':'testing'} + split_name = split_to_split[split] with open(ann_file) as f: ann_json_data = json.load(f) From 3745172ad40facb5825a0705c62c389c4ed6e5a6 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Tue, 13 Aug 2019 09:57:28 -0400 Subject: [PATCH 03/55] Update json template frame size, fix num_clip=0 bug --- datasets/abstract_datasets.py | 8 +++++++- datasets/templates/action_recognition_template.json | 4 ++-- datasets/templates/detection_template.json | 6 +++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/datasets/abstract_datasets.py b/datasets/abstract_datasets.py index 2d216f0..6e7fe2e 100644 --- a/datasets/abstract_datasets.py +++ b/datasets/abstract_datasets.py @@ -71,6 +71,7 @@ def _extractClips(self, video): if self.num_clips < 0: if len(video) >= self.clip_length: final_video = [video[_idx] for _idx in np.linspace(0, len(video)-1, self.clip_length, dtype='int32')] + final_video = [final_video] else: # Loop if insufficient elements @@ -80,6 +81,7 @@ def _extractClips(self, video): indices = indices[np.linspace(0, len(indices)-1, self.clip_length, dtype='int32')] final_video = [video[_idx] for _idx in indices] + final_video = [final_video] # END IF @@ -103,6 +105,7 @@ def _extractClips(self, video): indices = indices[:self.clip_length] final_video = [video[_idx] for _idx in indices] + final_video = [final_video] # END IF @@ -114,6 +117,7 @@ def _extractClips(self, video): indices = np.arange(indices, indices + self.clip_length).astype('int32') final_video = [video[_idx] for _idx in indices] + final_video = [final_video] else: indices = np.ceil(self.clip_length/float(len(video))) @@ -125,17 +129,19 @@ def _extractClips(self, video): indices = indices[index:index + self.clip_length] final_video = [video[_idx] for _idx in indices] + final_video = [final_video] # END IF else: final_video = video[:self.clip_length] + final_video = [final_video] # END IF # END IF - return [final_video] + return final_video diff --git a/datasets/templates/action_recognition_template.json b/datasets/templates/action_recognition_template.json index dc8acf9..6696368 100644 --- a/datasets/templates/action_recognition_template.json +++ b/datasets/templates/action_recognition_template.json @@ -2,7 +2,6 @@ { "frames (list)": [ { - "frame_size (int, int)": 
"(WIDTH,HEIGHT)", "img_path (str)": "FRAME_PATH", "actions (list)": [ { @@ -11,6 +10,7 @@ ] } ], + "frame_size (int, int)": "(WIDTH,HEIGHT)", "base_path (str)": "BASE_VID_PATH" } -] \ No newline at end of file +] diff --git a/datasets/templates/detection_template.json b/datasets/templates/detection_template.json index f4b8a42..b8ed90a 100644 --- a/datasets/templates/detection_template.json +++ b/datasets/templates/detection_template.json @@ -2,7 +2,6 @@ { "frames (list)": [ { - "frame_size (int, int)": "(WIDTH,HEIGHT)", "img_path (str)": "FRAME_PATH", "objs (list)": [ { @@ -14,6 +13,7 @@ ] } ], - "base_path (str)": "BASE_VID_PATH" + "base_path (str)": "BASE_VID_PATH", + "frame_size (int, int)": "(WIDTH,HEIGHT)" } -] \ No newline at end of file +] From a5b90e2b4d7e7ad47fd7a8c011b42cf124db5026 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Tue, 13 Aug 2019 11:30:28 -0400 Subject: [PATCH 04/55] Update README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 81c910f..f1d518a 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ A platform for quick and easy development of deep learning networks for recognit * [Development](#development) * [Add a Model](#add-a-model) * [Add a Dataset](#add-a-dataset) -* [Version History](#version-history) +* [FAQ](#faq) ## Configured Datasets | Dataset | Task(s) | @@ -114,3 +114,7 @@ To add a new dataset: * Complete `__init__` and `__getitem__` functions * Example skeleton dataset can be found [here](https://github.com/MichiganCOG/ViP/blob/master/datasets/templates/dataset_template.py) + +### FAQ + +A detailed FAQ can be found on our [wiki](https://github.com/MichiganCOG/ViP/wiki/FAQ). From 9d4bccf43d7f795570437a877f35fd934185f182 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Tue, 13 Aug 2019 11:32:31 -0400 Subject: [PATCH 05/55] Update README.md --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index f1d518a..3231468 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ A platform for quick and easy development of deep learning networks for recognition and detection in videos. Includes popular models like C3D and SSD. +Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) + ## Implemented Models and their performance ### Recognition @@ -88,6 +90,9 @@ Ex: From the root directory of ViP, train the action recognition network C3D on ``` python train.py --cfg_file models/c3d/config_train.yaml ``` + +Additional examples can be found on our [wiki.](https://github.com/MichiganCOG/ViP/wiki) + ## Development New models and datasets can be added without needing to rewrite any training, evaluation, or data loading code. @@ -103,6 +108,8 @@ To add a new model: Examples of previously implemented models can be found [here](https://github.com/MichiganCOG/ViP/tree/master/models). 
+Additional information can be found on our [wiki.](https://github.com/MichiganCOG/ViP/wiki) + ### Add a Dataset To add a new dataset: @@ -114,6 +121,7 @@ To add a new dataset: * Complete `__init__` and `__getitem__` functions * Example skeleton dataset can be found [here](https://github.com/MichiganCOG/ViP/blob/master/datasets/templates/dataset_template.py) +Additional information can be found on our [wiki.](https://github.com/MichiganCOG/ViP/wiki) ### FAQ From fbd0a83601589abf3795fc6a7514eae645164c95 Mon Sep 17 00:00:00 2001 From: Madan Date: Tue, 13 Aug 2019 16:01:14 -0400 Subject: [PATCH 06/55] feature extraction feature trial --- datasets/KTH.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++ eval.py | 27 +++++++++++++++-- models/c3d/c3d.py | 5 ++- 3 files changed, 106 insertions(+), 3 deletions(-) create mode 100644 datasets/KTH.py diff --git a/datasets/KTH.py b/datasets/KTH.py new file mode 100644 index 0000000..95995ef --- /dev/null +++ b/datasets/KTH.py @@ -0,0 +1,77 @@ +import torch +from .abstract_datasets import RecognitionDataset +from PIL import Image +import cv2 +import os +import numpy as np +from torchvision import transforms + +class KTH(RecognitionDataset): + def __init__(self, *args, **kwargs): + """ + Initialize KTH class + Args: + load_type (String): Select training or testing set + resize_shape (Int): [Int, Int] Array indicating desired height and width to resize input + crop_shape (Int): [Int, Int] Array indicating desired height and width to crop input + final_shape (Int): [Int, Int] Array indicating desired height and width of input to deep network + preprocess (String): Keyword to select different preprocessing types + + Return: + None + """ + super(KTH, self).__init__(*args, **kwargs) + + self.load_type = kwargs['load_type'] + self.resize_shape = kwargs['resize_shape'] + self.crop_shape = kwargs['crop_shape'] + self.final_shape = kwargs['final_shape'] + self.preprocess = kwargs['preprocess'] + + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + + else: + self.transforms = kwargs['model_obj'].test_transforms + + + def __getitem__(self, idx): + vid_info = self.samples[idx] + base_path = vid_info['base_path'] + + input_data = [] + vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 + labels = np.zeros((self.clip_length))-1 + input_data = [] + + for frame_ind in range(len(vid_info['frames'])): + frame_path = os.path.join(base_path, vid_info['frames'][frame_ind]['img_path']) + + for frame_labels in vid_info['frames'][frame_ind]['actions']: + labels[frame_ind] = frame_labels['action_class'] + + # Load frame image data and preprocess image accordingly + input_data.append(cv2.imread(frame_path)[...,::-1]/1.) 
+ + + # Preprocess data + vid_data = self.transforms(input_data) + labels = torch.from_numpy(labels).float() + + # Permute the PIL dimensions (Frame, Height, Width, Chan) to pytorch (Chan, frame, height, width) + vid_data = vid_data.permute(3, 0, 1, 2) + + ret_dict = dict() + ret_dict['data'] = vid_data + + annot_dict = dict() + annot_dict['labels'] = labels + + ret_dict['annots'] = annot_dict + + return ret_dict + + +#dataset = HMDB51(json_path='/z/dat/HMDB51', dataset_type='train', clip_length=100, num_clips=0) +#dat = dataset.__getitem__(0) +#import pdb; pdb.set_trace() diff --git a/eval.py b/eval.py index 2d5ed2e..41ab417 100644 --- a/eval.py +++ b/eval.py @@ -67,6 +67,9 @@ def eval(**args): if args['load_type'] == 'train_val': eval_loader = loader['valid'] + elif args['load_type'] == 'train': + eval_loader = loader['train'] + elif args['load_type'] == 'test': eval_loader = loader['test'] @@ -88,14 +91,28 @@ def eval(**args): # Setup Model To Evaluate model.eval() + ret_data = None + ret_labels = None + with torch.no_grad(): for step, data in enumerate(eval_loader): x_input = data['data'].to(device) annotations = data['annots'] - outputs = model(x_input) + outputs = model(x_input, features=True) + + if ret_data is None: + ret_data = outputs.cpu().numpy() + ret_labels = annotations['labels'].cpu().numpy()[:, 0] + + else: + ret_data = np.vstack((ret_data, outputs.cpu().numpy())) + ret_labels = np.hstack((ret_labels, annotations['labels'].cpu().numpy()[:, 0])) - acc = acc_metric.get_accuracy(outputs, annotations) + # END IF + + + #acc = acc_metric.get_accuracy(outputs, annotations) if step % 100 == 0: print('Step: {}/{} | {} acc: {:.4f}'.format(step, len(eval_loader), args['load_type'], acc)) @@ -103,6 +120,12 @@ def eval(**args): print('Accuracy of the network on the {} set: {:.3f} %\n'.format(args['load_type'], 100.*acc)) if not args['debug']: + ret_dict = {} + ret_dict['data'] = ret_data + ret_dict['labels'] = ret_labels + import scipy.io as sio + sio.savemat(args['load_type']+'_'+args['dataset']+'.mat', ret_dict) + writer.add_scalar(args['dataset']+'/'+args['model']+'/'+args['load_type']+'_accuracy', 100.*acc) # Close Tensorboard Element writer.close() diff --git a/models/c3d/c3d.py b/models/c3d/c3d.py index 69e4a74..387ffd7 100644 --- a/models/c3d/c3d.py +++ b/models/c3d/c3d.py @@ -55,7 +55,7 @@ def __init__(self, **kwargs): if isinstance(kwargs['pretrained'], int) and kwargs['pretrained']: self.__load_pretrained_weights() - def forward(self, x, labels=False): + def forward(self, x, labels=False, features=False): x = self.relu(self.conv1(x)) x = self.pool1(x) @@ -78,6 +78,9 @@ def forward(self, x, labels=False): x = self.relu(self.fc6(x)) + if features: + return x + x = self.dropout(x) x = self.relu(self.fc7(x)) x = self.dropout(x) From 252021ce6f9ab13c1ddd9746647df6a709031a82 Mon Sep 17 00:00:00 2001 From: Stephan Lemmer Date: Thu, 15 Aug 2019 09:43:09 -0400 Subject: [PATCH 07/55] Adding gradient clipping (issue #5) --- config_default_example.yaml | 1 + train.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/config_default_example.yaml b/config_default_example.yaml index 6094268..52a0f28 100644 --- a/config_default_example.yaml +++ b/config_default_example.yaml @@ -37,3 +37,4 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay +grad_max_norm: 0.1 # Norm for gradient clipping diff --git a/train.py b/train.py index 1876c5c..e4845bb 100644 --- a/train.py 
+++ b/train.py @@ -180,6 +180,10 @@ def train(**args): # Apply large mini-batch normalization for param in model.parameters(): param.grad *= 1./float(args['pseudo_batch_loop']*args['batch_size']) + + # Apply gradient clipping + if ("grad_max_norm" in args) and float(args['grad_max_norm'] > 0): + nn.utils.clip_grad_norm_(model.parameters(),float(args['grad_max_norm'])) optimizer.step() From 1638e50ab8e2de4f763d10644c9f39923bb0d720 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 15 Aug 2019 14:51:28 -0400 Subject: [PATCH 08/55] Do not scale non-existent gradients --- train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/train.py b/train.py index e4845bb..eab0b2c 100644 --- a/train.py +++ b/train.py @@ -179,7 +179,8 @@ def train(**args): if (epoch * len(train_loader) + (step+1)) % args['pseudo_batch_loop'] == 0 and step > 0: # Apply large mini-batch normalization for param in model.parameters(): - param.grad *= 1./float(args['pseudo_batch_loop']*args['batch_size']) + if param.requires_grad: + param.grad *= 1./float(args['pseudo_batch_loop']*args['batch_size']) # Apply gradient clipping if ("grad_max_norm" in args) and float(args['grad_max_norm'] > 0): From 2860dd39d1ad7ec6bd84786e7f08b00f7ea64009 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 15 Aug 2019 14:53:13 -0400 Subject: [PATCH 09/55] set default grad_max_norm to 0 --- config_default_example.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config_default_example.yaml b/config_default_example.yaml index 52a0f28..4dad7e4 100644 --- a/config_default_example.yaml +++ b/config_default_example.yaml @@ -20,6 +20,7 @@ debug: 0 # If True, do not plot, save, or create epoch: 30 # Total number of epochs exp: exp # Experiment name gamma: 0.1 # Multiplier with which to change learning rate +grad_max_norm: 0 # Norm for gradient clipping json_path: /z/dat/HMDB51/ # Path to the json file for the given dataset labels: 51 # Number of total classes in the dataset load_type: train # Environment selection, to include only training/training and validation/testing dataset @@ -37,4 +38,3 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay -grad_max_norm: 0.1 # Norm for gradient clipping From a54b374c8e88cd717c328274bc7f54c97643a2ac Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 16 Aug 2019 18:04:28 -0400 Subject: [PATCH 10/55] Tweak preprocessing functions to operate on point coordinates & add hand keypoint dataset --- datasets/Manual_Hands.py | 125 +++++++++++++++++++++++++++ datasets/preprocessing_transforms.py | 113 ++++++++++++++++++++---- 2 files changed, 220 insertions(+), 18 deletions(-) create mode 100644 datasets/Manual_Hands.py diff --git a/datasets/Manual_Hands.py b/datasets/Manual_Hands.py new file mode 100644 index 0000000..d5c1a00 --- /dev/null +++ b/datasets/Manual_Hands.py @@ -0,0 +1,125 @@ +import torch +import torchvision +from .abstract_datasets import DetectionDataset +import cv2 +import os +import numpy as np +import json + +class Manual_Hands(DetectionDataset): + """ + Manually-annotated keypoints on hands for pose estimation. 
+ Includes images from The MPII Human Pose and New Zealand Sign Language (NZSL) datasets + + Source: https://arxiv.org/1704.07809 + """ + def __init__(self, *args, **kwargs): + super(Manual_Hands, self).__init__(*args, **kwargs) + + self.load_type = kwargs['load_type'] + self.json_path = kwargs['json_path'] + + # Maximum number of annotated object present in a single frame in entire dataset + # Dictates the return size of annotations in __getitem__ + self.max_objects = 1 + self.sigma = 3.0 + self.stride = 8 #effective stride of the entire network + + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + + else: + self.transforms = kwargs['model_obj'].test_transforms + + #Adapted from: https://github.com/namedBen/Convolutional-Pose-Machines-Pytorch + def gaussian_kernel(self, size_w, size_h, center_x, center_y, sigma): + #Outputs a gaussian heat map on defined point + gridy, gridx = torch.meshgrid(torch.arange(0,size_h,dtype=torch.float), torch.arange(0,size_w,dtype=torch.float)) + D2 = (gridx - center_x)**2 + (gridy - center_y)**2 + + return torch.exp(-0.5 * D2 / sigma**2) + + def __getitem__(self, idx): + vid_info = self.samples[idx] + + base_path = vid_info['base_path'] + vid_size = vid_info['frame_size'] + + input_data = [] + vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1 + hand_pts_data = np.zeros((self.clip_length, self.max_objects, 21, 3))-1 + labels = np.zeros((self.clip_length, self.max_objects))-1 + occlusions = np.zeros((self.clip_length, self.max_objects, 22), dtype=np.int32)-1 #21 keypoints + background = 22 points + + for frame_ind in range(len(vid_info['frames'])): + frame = vid_info['frames'][frame_ind] + width, height = vid_info['frame_size'] + frame_path = frame['img_path'] + + # Extract bbox and label data from video info + for obj in frame['objs']: + #trackid = obj['trackid'] #Let's ignore trackid for now, only one annotation per image + trackid = 0 + label = 1 if obj['c'] == 'left' else 0 #1: left hand, 0: right hand + occluded = obj['occ'] + obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] + body_pts = obj['body_pts'] #16 points (x,y,valid) + hand_pts = obj['hand_pts'] #21 points (x,y,valid) + head_box = obj['head_box'] + head_size = obj['head_size'] #max dim of tightest box around head + hand_ctr = obj['hand_ctr'] + mpii = obj['mpii'] + + #During training square patch is 2.2*B where B is max(obj_bbox) + if self.load_type == 'train': + B = max(obj_bbox[2]-obj_bbox[0], obj_bbox[3]-obj_bbox[1]) + else: #During testing B is 0.7*head_size + B = 0.7*head_size + + hand_size = 2.2*B + xtl = np.clip(int(hand_ctr[0]-hand_size/2), 0, width) + ytl = np.clip(int(hand_ctr[1]-hand_size/2), 0, height) + xbr = np.clip(int(hand_ctr[0]+hand_size/2), 0, width) + ybr = np.clip(int(hand_ctr[1]+hand_size/2), 0, height) + + hand_crop = [xtl, ytl, xbr, ybr] + bbox_data[frame_ind, trackid, :] = obj_bbox + labels[frame_ind, trackid] = label + hand_pts_data[frame_ind, trackid, :] = hand_pts + occlusions[frame_ind, trackid] = occluded + [0] #Add element for background + + # Load frame, convert to RGB from BGR and normalize from 0 to 1 + input_data.append(cv2.imread(os.path.join(base_path, frame_path))[...,::-1]) + + #Crop hand and resize, perform same transforms to ground truth keypoints + vid_data, hand_pts_coords = self.transforms(input_data, hand_pts_data[:,:,:,:2], hand_crop, labels) + + h_width = int(self.final_shape[1]/self.stride) + h_height = 
int(self.final_shape[0]/self.stride) + heatmaps = torch.zeros((22, h_width, h_height), dtype=torch.float) #heatmaps for 21 keypoints + background + for i,pts in enumerate(hand_pts_coords[0][0]): + x = pts[0] / self.stride + y = pts[1] / self.stride + heatmaps[i,:,:] = self.gaussian_kernel(h_width, h_height, x, y, self.sigma) + + heatmaps[-1,:,:] = 1 - torch.max(heatmaps[:-1,:,:], dim=0)[0] #Last layer is background + + vid_data = vid_data/255 + # Permute the PIL dimensions (Frame, Height, Width, Chan) to pytorch (Chan, frame, height, width) + vid_data = vid_data.permute(3, 0, 1, 2) + vid_data = vid_data.squeeze(1) #Remove frame dimension, b/c this is an image dataset + + ret_dict = dict() + ret_dict['data'] = vid_data + annot_dict = dict() + annot_dict['head_size'] = head_size + annot_dict['hand_pts'] = hand_pts_coords + annot_dict['heatmaps'] = heatmaps + annot_dict['labels'] = labels + annot_dict['occ'] = occlusions + annot_dict['frame_path'] = frame_path + annot_dict['frame_size'] = vid_size #width, height + ret_dict['annots'] = annot_dict + + return ret_dict diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 3f5cc10..34513c5 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -123,6 +123,22 @@ def resize_bbox(self, xmin, ymin, xmax, ymax, img_shape, resize_shape): return xmin_new, ymin_new, xmax_new, ymax_new + def resize_pt_coords(self, x, y, img_shape, resize_shape): + # Get relative position for point coords within a frame, after it's resized + + img_h = img_shape[0] + img_w = img_shape[1] + + res_h = resize_shape[0] + res_w = resize_shape[1] + + frac_h = res_h/float(img_h) + frac_w = res_w/float(img_w) + + x_new = (x * frac_w).astype(int) + y_new = (y * frac_h).astype(int) + + return x_new, y_new def __call__(self, clip, bbox=[]): @@ -136,11 +152,16 @@ def __call__(self, clip, bbox=[]): out_clip.append(proc_frame) if bbox!=[]: temp_bbox = np.zeros(bbox[frame_ind].shape)-1 - for class_ind in range(len(bbox[frame_ind])): - if np.array_equal(bbox[frame_ind,class_ind],-1*np.ones(4)): #only annotated objects + for class_ind, box in enumerate(bbox[frame_ind]): + if np.array_equal(box,-1*np.ones(box.shape)): #only annotated objects continue - xmin, ymin, xmax, ymax = bbox[frame_ind, class_ind] - proc_bbox = self.resize_bbox(xmin, ymin, xmax, ymax, frame.shape, (self.size_h, self.size_w)) + + if box.shape[-1] == 2: #Operate on point coordinates + proc_bbox = np.stack(self.resize_pt_coords(box[:,0], box[:,1], frame.shape, (self.size_h, self.size_w)),1) + else: #Operate on bounding box + xmin, ymin, xmax, ymax = box + proc_bbox = self.resize_bbox(xmin, ymin, xmax, ymax, frame.shape, (self.size_h, self.size_w)) + temp_bbox[class_ind,:] = proc_bbox out_bbox.append(temp_bbox) @@ -155,7 +176,7 @@ def __call__(self, clip, bbox=[]): class CropClip(PreprocTransform): - def __init__(self, xmin, xmax, ymin, ymax, *args, **kwargs): + def __init__(self, xmin=None, xmax=None, ymin=None, ymax=None, *args, **kwargs): super(CropClip, self).__init__(*args, **kwargs) self.crop_xmin = xmin self.crop_xmax = xmax @@ -165,12 +186,26 @@ def __init__(self, xmin, xmax, ymin, ymax, *args, **kwargs): self.crop_h, self.crop_w = kwargs['crop_shape'] - def _update_bbox(self, xmin, xmax, ymin, ymax): + def _update_bbox(self, xmin, xmax, ymin, ymax, update_crop_shape=False): + ''' + Args: + xmin (Float, shape []): + xmax (Float, shape []): + ymin (Float, shape []): + ymax (Float, shape []): + update_crop_shape (Boolean): Update 
expected crop shape along with bbox update call + ''' self.crop_xmin = xmin self.crop_xmax = xmax self.crop_ymin = ymin self.crop_ymax = ymax + if update_crop_shape: + self.crop_h = ymax - ymin + self.crop_w = xmax - xmin + + def update_crop_shape(self, crop_h, crop_w): + pass def crop_bbox(self, xmin, ymin, xmax, ymax, crop_xmin, crop_ymin, crop_xmax, crop_ymax): if (xmin >= crop_xmax) or (xmax <= crop_xmin) or (ymin >= crop_ymax) or (ymax <= crop_ymin): @@ -198,8 +233,15 @@ def crop_bbox(self, xmin, ymin, xmax, ymax, crop_xmin, crop_ymin, crop_xmax, cro return xmin_new-crop_xmin, ymin_new-crop_ymin, xmax_new-crop_xmin, ymax_new-crop_ymin + def crop_coords(self, x, y, crop_xmin, crop_ymin, crop_xmax, crop_ymax): + if np.any(x >= crop_xmax) or np.any(x <= crop_xmin) or np.any(y >= crop_ymax) or np.any(y <= crop_ymin): + return -1*np.ones(x.shape), -1*np.ones(y.shape) - + x_new = np.clip(x, crop_xmin, crop_xmax) + y_new = np.clip(x, crop_xmin, crop_xmax) + + return x_new-crop_xmin, y_new-crop_xmin + def __call__(self, clip, bbox=[]): out_clip = [] out_bbox = [] @@ -213,11 +255,15 @@ def __call__(self, clip, bbox=[]): if bbox!=[]: temp_bbox = np.zeros(bbox[frame_ind].shape)-1 - for class_ind in range(len(bbox)): - if np.array_equal(bbox[frame_ind,class_ind],-1*np.ones(4)): #only annotated objects + for class_ind, box in enumerate(bbox[frame_ind]): + if np.array_equal(box,-1*np.ones(box.shape)): #only annotated objects continue - xmin, ymin, xmax, ymax = bbox[frame_ind, class_ind] - proc_bbox = self.crop_bbox(xmin, ymin, xmax, ymax, self.crop_xmin, self.crop_ymin, self.crop_xmax, self.crop_ymax) + + if box.shape[-1] == 2: #Operate on point coordinates + proc_bbox = np.stack(self.crop_coords(box[:,0], box[:,1], self.crop_xmin, self.crop_ymin, self.crop_xmax, self.crop_ymax), 1) + else: #Operate on bounding box + xmin, ymin, xmax, ymax = box + proc_bbox = self.crop_bbox(xmin, ymin, xmax, ymax, self.crop_xmin, self.crop_ymin, self.crop_xmax, self.crop_ymax) temp_bbox[class_ind,:] = proc_bbox out_bbox.append(temp_bbox) @@ -305,6 +351,9 @@ def __init__(self, direction='h', p=0.5, *args, **kwargs): super(RandomFlipClip, self).__init__(*args, **kwargs) self.direction = direction self.p = p + + def _update_p(self, p): + self.p = p def _random_flip(self): flip_prob = np.random.random() @@ -314,17 +363,45 @@ def _random_flip(self): return 1 def _h_flip(self, bbox, frame_size): + width = frame_size[1] bbox_shape = bbox.shape output_bbox = np.zeros(bbox_shape)-1 - for bbox_ind in range(bbox_shape[0]): - xmin, ymin, xmax, ymax = bbox[bbox_ind] - width = frame_size[1] - xmax_new = width - xmin - xmin_new = width - xmax - output_bbox[bbox_ind] = xmin_new, ymin, xmax_new, ymax - return output_bbox + for bbox_ind, box in enumerate(bbox): + if np.array_equal(box,-1*np.ones(box.shape)): #only annotated objects + continue + + if box.shape[-1] == 2: #Operate on point coordinates + x = box[:,0] + x_new = width - x + + output_bbox[bbox_ind] = np.stack((x_new,box[:,1]),1) + else: #Operate on bounding box + xmin, ymin, xmax, ymax = box + xmax_new = width - xmin + xmin_new = width - xmax + output_bbox[bbox_ind] = xmin_new, ymin, xmax_new, ymax + return output_bbox def _v_flip(self, bbox, frame_size): + height = frame_size[0] + bbox_shape = bbox.shape + output_bbox = np.zeros(bbox_shape)-1 + for bbox_ind, box in enumerate(bbox): + if np.array_equal(box,-1*np.ones(box.shape)): #only annotated objects + continue + + if box.shape[-1] == 2: #Operate on point coordinates + y = box[:,1] + y_new = height - y + + 
output_bbox[bbox_ind] = np.stack((box[:,0],y_new),1) + else: #Operate on bounding box + xmin, ymin, xmax, ymax = box + ymax_new = height - ymin + ymin_new = height - ymax + output_bbox[bbox_ind] = xmin, ymin_new, xmax, ymax_new + return output_bbox + bbox_shape = bbox.shape output_bbox = np.zeros(bbox_shape)-1 for bbox_ind in range(bbox_shape[0]): From e3744aba7fecb021eb6150c18f1df961b7913869 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 16 Aug 2019 19:24:56 -0400 Subject: [PATCH 11/55] Add preprocessing test for point coords --- datasets/preprocessing_transforms.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 34513c5..7d674a7 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -738,6 +738,14 @@ def resize_test(self): exp_bbox = np.array([[[0,0,1,2]]]) assert (False not in np.isclose(bbox_out, exp_bbox)) + coord_pts = np.array([[[[1,1], [7,5], [9,6]]]]).astype(float) + _, bbox_out = self.resize(inp, coord_pts) + exp_bbox = np.array([[[[0., 0.], + [3., 3.], + [4., 4.]]]]) + assert (False not in np.isclose(bbox_out, exp_bbox)) + + def crop_test(self): inp = np.array([[[.1,.2,.3],[.4,.5,.6],[.7,.8,.9]]]).astype(float) self.crop._update_bbox(1, 3, 1, 3) From 81045eeef10fa3e975a41612bbd8fc7205b44b0b Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sat, 17 Aug 2019 18:18:49 -0400 Subject: [PATCH 12/55] Add rotations for coordinate points + one small fix --- datasets/preprocessing_transforms.py | 44 ++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 7d674a7..f80a7b6 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -204,9 +204,6 @@ def _update_bbox(self, xmin, xmax, ymin, ymax, update_crop_shape=False): self.crop_h = ymax - ymin self.crop_w = xmax - xmin - def update_crop_shape(self, crop_h, crop_w): - pass - def crop_bbox(self, xmin, ymin, xmax, ymax, crop_xmin, crop_ymin, crop_xmax, crop_ymax): if (xmin >= crop_xmax) or (xmax <= crop_xmin) or (ymin >= crop_ymax) or (ymax <= crop_ymin): return -1, -1, -1, -1 @@ -238,9 +235,9 @@ def crop_coords(self, x, y, crop_xmin, crop_ymin, crop_xmax, crop_ymax): return -1*np.ones(x.shape), -1*np.ones(y.shape) x_new = np.clip(x, crop_xmin, crop_xmax) - y_new = np.clip(x, crop_xmin, crop_xmax) + y_new = np.clip(y, crop_ymin, crop_ymax) - return x_new-crop_xmin, y_new-crop_xmin + return x_new-crop_xmin, y_new-crop_ymin def __call__(self, clip, bbox=[]): out_clip = [] @@ -547,6 +544,38 @@ def _rotate_bbox(self, bboxes, frame_shape, angle): return output_bboxes + def _rotate_coords(self, bboxes, frame_shape, angle): + angle = np.deg2rad(angle) + bboxes_shape = bboxes.shape + output_bboxes = np.zeros(bboxes_shape)-1 + frame_h, frame_w = frame_shape[0], frame_shape[1] + half_h = frame_h/2. + half_w = frame_w/2. 
+ + for bbox_ind in range(bboxes_shape[0]): + x, y = bboxes[bbox_ind].transpose() + ''' + import pdb; pdb.set_trace() + x = np.array([1]) + y = np.array([0]) + half_w = 1 + half_h = 1 + ''' + + pts = (x-half_w, y-half_h) + + pts = self._cart2pol(pts) + + pts = (pts[0], pts[1]-angle) + + pts = self._pol2cart(pts) + + pts = (pts[0]+half_w, pts[1]+half_h) + + output_bboxes[bbox_ind,:,0] = (np.clip(pts[0], 0, frame_w-1)) + output_bboxes[bbox_ind,:,1] = (np.clip(pts[1], 0, frame_h-1)) + + return output_bboxes def __call__(self, clip, bbox=[]): angle = np.random.choice(self.angles) @@ -559,7 +588,10 @@ def __call__(self, clip, bbox=[]): bbox = np.array(bbox) output_bboxes = np.zeros(bbox.shape)-1 for bbox_ind in range(bbox.shape[0]): - output_bboxes[bbox_ind] = self._rotate_bbox(bbox[bbox_ind], clip[0].shape, angle) + if bbox.shape[-1] == 2: + output_bboxes[bbox_ind] = self._rotate_coords(bbox[bbox_ind], clip[0].shape, angle) + else: + output_bboxes[bbox_ind] = self._rotate_bbox(bbox[bbox_ind], clip[0].shape, angle) return output_clip, output_bboxes From 07fb961c96126211a4a1bab7701199299f9ed42a Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 21 Aug 2019 22:17:54 -0400 Subject: [PATCH 13/55] Clean up comments --- datasets/preprocessing_transforms.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index f80a7b6..e98a403 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -554,13 +554,6 @@ def _rotate_coords(self, bboxes, frame_shape, angle): for bbox_ind in range(bboxes_shape[0]): x, y = bboxes[bbox_ind].transpose() - ''' - import pdb; pdb.set_trace() - x = np.array([1]) - y = np.array([0]) - half_w = 1 - half_h = 1 - ''' pts = (x-half_w, y-half_h) From 4a495e9f4f3f44b21c2b06dd6e826ef625981e77 Mon Sep 17 00:00:00 2001 From: Madan Date: Fri, 23 Aug 2019 14:09:15 -0400 Subject: [PATCH 14/55] Added functionality of resume option after loading model, adjusted batch size to match true batch size, numpy seed ajustment and learning rate decay --- config_default_example.yaml | 3 ++- parse_args.py | 1 + train.py | 39 ++++++++++++++++++++++++------------- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/config_default_example.yaml b/config_default_example.yaml index 6094268..49dc68b 100644 --- a/config_default_example.yaml +++ b/config_default_example.yaml @@ -14,7 +14,7 @@ subtract_mean: '' # Subtract mean (R,G,B) from all frames # Experiment Setup acc_metric: Accuracy # Accuracy metric -batch_size: 3 # Numbers of videos in a mini-batch +batch_size: 15 # Numbers of videos in a mini-batch dataset: HMDB51 # Name of dataset debug: 0 # If True, do not plot, save, or create data files epoch: 30 # Total number of epochs @@ -37,3 +37,4 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay +resume: 0 # Flag to resume training or switch to alternate objective after loading diff --git a/parse_args.py b/parse_args.py index 9601473..4501622 100644 --- a/parse_args.py +++ b/parse_args.py @@ -50,6 +50,7 @@ def __init__(self): parser.add_argument('--debug', type=int, help='Run an experiment but do not save any data or create any folders') parser.add_argument('--seed', type=int, help='Seed for reproducibility') + parser.add_argument('--resume', type=int, help='Flag to resume training or switch to alternate objective after loading') # Default dict, anything 
not present is required to exist as an argument or in yaml file self.defaults = dict( diff --git a/train.py b/train.py index 1876c5c..9082c71 100644 --- a/train.py +++ b/train.py @@ -106,15 +106,16 @@ def train(**args): scheduler = MultiStepLR(optimizer, milestones=args['milestones'], gamma=args['gamma']) if isinstance(args['pretrained'], str): - ckpt = load_checkpoint(args['pretrained']) + ckpt = load_checkpoint(args['pretrained']) model.load_state_dict(ckpt) - start_epoch = load_checkpoint(args['pretrained'], key_name='epoch') + 1 - optimizer.load_state_dict(load_checkpoint(args['pretrained'], key_name='optimizer')) - for quick_looper in range(start_epoch): - scheduler.step() + if args['resume']: + start_epoch = load_checkpoint(args['pretrained'], key_name='epoch') + 1 - # END FOR + optimizer.load_state_dict(load_checkpoint(args['pretrained'], key_name='optimizer')) + scheduler.step(epoch=start_epoch) + + # END IF else: start_epoch = 0 @@ -139,6 +140,7 @@ def train(**args): for step, data in enumerate(train_loader): if step% args['pseudo_batch_loop'] == 0: loss = 0.0 + running_batch = 0 optimizer.zero_grad() # END IF @@ -149,10 +151,11 @@ def train(**args): assert args['final_shape']==list(x_input.size()[-2:]), "Input to model does not match final_shape argument" outputs = model(x_input) loss = model_loss.loss(outputs, annotations) - loss = loss * args['batch_size'] + loss = loss * outputs.shape[0] loss.backward() - running_loss += loss.item() + running_loss += loss.item() + running_batch += outputs.shape[0] if np.isnan(running_loss): import pdb; pdb.set_trace() @@ -167,21 +170,24 @@ def train(**args): # END FOR # Add Loss Element - writer.add_scalar(args['dataset']+'/'+args['model']+'/minibatch_loss', loss.item()/args['batch_size'], epoch*len(train_loader) + step) + writer.add_scalar(args['dataset']+'/'+args['model']+'/minibatch_loss', loss.item()/outputs.shape[0], epoch*len(train_loader) + step) # END IF if ((epoch*len(train_loader) + step+1) % 100 == 0): - print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.format(epoch, args['epoch'], step+1, len(train_loader), running_loss/float(step+1)/args['batch_size'])) + print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.format(epoch, args['epoch'], step+1, len(train_loader), running_loss/float(step+1)/outputs.shape[0])) # END IF if (epoch * len(train_loader) + (step+1)) % args['pseudo_batch_loop'] == 0 and step > 0: # Apply large mini-batch normalization for param in model.parameters(): - param.grad *= 1./float(args['pseudo_batch_loop']*args['batch_size']) - optimizer.step() + param.grad *= 1./float(running_batch) + + # END FOR + optimizer.step() + running_batch = 0 # END IF @@ -201,8 +207,10 @@ def train(**args): ## START FOR: Validation Accuracy running_acc = [] running_acc = valid(valid_loader, running_acc, model, device, acc_metric) + if not args['debug']: - writer.add_scalar(args['dataset']+'/'+args['model']+'/validation_accuracy', 100.*running_acc[-1], epoch*len(valid_loader) + step) + writer.add_scalar(args['dataset']+'/'+args['model']+'/validation_accuracy', 100.*running_acc[-1], epoch*len(train_loader) + step) + print('Accuracy of the network on the validation set: %f %%\n' % (100.*running_acc[-1])) # Save Best Validation Accuracy Model Separately @@ -246,10 +254,13 @@ def valid(valid_loader, running_acc, model, device, acc_metric): parse = Parse() args = parse.get_args() + import pdb; pdb.set_trace() # For reproducibility torch.backends.cudnn.deterministic = True torch.manual_seed(args['seed']) - 
#np.random.seed(args['seed']+1) + + if not args['resume']: + np.random.seed(args['seed']) train(**args) From 95bf52e1332429c6243a7e493344516420ff284d Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 23 Aug 2019 19:54:39 -0400 Subject: [PATCH 15/55] Added affine translation transform on images, bbox, and coordinate points --- datasets/preprocessing_transforms.py | 77 ++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index e98a403..7dd7414 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -590,7 +590,84 @@ def __call__(self, clip, bbox=[]): return output_clip +class RandomTranslateClip(PreprocTransform): + """ + Random horizontal and/or vertical shift on frames in a clip + Shift will be bounded by object bounding box (if given). Meaning, object will always be in view + + Args: + - translate (Tuple) + - max_x (float): maximum absolute fraction for horizontal shift + - max_y (float): maximum absolute fraction for vertical shift + """ + def __init__(self, translate, **kwargs): + super(RandomTranslateClip, self).__init__(**kwargs) + + self.max_x, self.max_y = translate + + assert(self.max_x >= 0.0 and self.max_y >= 0.0) + assert(self.max_x < 1.0 and self.max_y < 1.0) #Cannot shift pass image bounds + + def _shift_frame(self, bbox, frame, tx, ty): + M = np.array([[1, 0, tx],[0, 1, ty]], dtype=np.float) # 2 x 3 transformation matrix + out_frame = cv2.warpAffine(frame, M, (frame.shape[1], frame.shape[0])) + + if bbox is not None: + if box.shape[-1] == 2: #Operate on point coordinates + bbox_h = np.concatenate((box, np.ones((box.shape[0],1))), axis=1).transpose() #homography coords + out_box = M @ bbox_h + else: #Operate on bounding box + bbox_h = np.reshape(bbox, (-1,2)) #x-y coords + bbox_h = np.concatenate((bbox_h, np.ones((bbox_h.shape[0],1))), axis=1).transpose() #homography coords + + out_box = M @ bbox_h + out_box = np.reshape(out_box.transpose(), (-1,4)) + + return out_frame, out_box + else: + return out_frame + + def __call__(self, clip, bbox=[]): + out_clip = [] + clip = self._to_numpy(clip) + + frac_x = np.random.rand()*(2*self.max_x)-self.max_x + frac_y = np.random.rand()*(2*self.max_y)-self.max_y + + if bbox != []: + out_bbox = [] + + for frame, box in zip(clip,bbox): + mask = box[:,0] != -1 + img_h, img_w, _ = frame.shape + tx = int(img_w * frac_x) + ty = int(img_h * frac_y) + + #Bound translation amount so all objects remain in scene + if box.shape[-1] == 2: #Operate on point coordinates + tx = np.clip(tx, np.max(-1*box[mask,0]), np.min(img_w-box[mask,0])) + ty = np.clip(ty, np.max(-1*box[mask,1]), np.min(img_h-box[mask,1])) + out_frame, out_box = self._shift_frame(box, frame, tx, ty) + out_box[~mask] = -1*np.ones(2) + + else: #Operate on bounding box + #bbox is bounding box object + tx = np.clip(tx, np.max(-1*box[mask,0]), np.min(img_w-box[mask,2])) + ty = np.clip(ty, np.max(-1*box[mask,1]), np.min(img_h-box[mask,3])) + out_frame, out_box = self._shift_frame(box, frame, tx, ty) + out_box[~mask] = -1*np.ones(4) + + out_clip.append(out_frame) + out_bbox.append(out_box) + + return out_clip, out_bbox + else: + for frame in clip: + img_h, img_w, _ = frame.shape + tx = int(img_w * frac_x) + ty = int(img_h * frac_y) + out_clip.append(self._shift_frame(None, frame, tx, ty)) class SubtractMeanClip(PreprocTransform): def __init__(self, **kwargs): From 5a2c8024e9eb4ed9eda9e47e63e6c92808f7de5b Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: 
Mon, 26 Aug 2019 19:31:23 -0400 Subject: [PATCH 16/55] Tested on coordinate points --- datasets/preprocessing_transforms.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 7dd7414..6eff344 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -613,14 +613,14 @@ def _shift_frame(self, bbox, frame, tx, ty): out_frame = cv2.warpAffine(frame, M, (frame.shape[1], frame.shape[0])) if bbox is not None: - if box.shape[-1] == 2: #Operate on point coordinates - bbox_h = np.concatenate((box, np.ones((box.shape[0],1))), axis=1).transpose() #homography coords - out_box = M @ bbox_h - else: #Operate on bounding box - bbox_h = np.reshape(bbox, (-1,2)) #x-y coords - bbox_h = np.concatenate((bbox_h, np.ones((bbox_h.shape[0],1))), axis=1).transpose() #homography coords + bbox_h = np.reshape(bbox, (-1,2)) #x-y coords + bbox_h = np.concatenate((bbox_h, np.ones((bbox_h.shape[0],1))), axis=1).transpose() #homography coords - out_box = M @ bbox_h + out_box = M @ bbox_h + + if bbox.shape[-1] == 2: #Operate on point coordinates + out_box = np.reshape(out_box.transpose(), (bbox.shape[0], bbox.shape[1],2)) + else: #Operate on bounding box out_box = np.reshape(out_box.transpose(), (-1,4)) return out_frame, out_box @@ -638,13 +638,13 @@ def __call__(self, clip, bbox=[]): out_bbox = [] for frame, box in zip(clip,bbox): - mask = box[:,0] != -1 img_h, img_w, _ = frame.shape tx = int(img_w * frac_x) ty = int(img_h * frac_y) #Bound translation amount so all objects remain in scene if box.shape[-1] == 2: #Operate on point coordinates + mask = box[:,:,0] != -1 tx = np.clip(tx, np.max(-1*box[mask,0]), np.min(img_w-box[mask,0])) ty = np.clip(ty, np.max(-1*box[mask,1]), np.min(img_h-box[mask,1])) out_frame, out_box = self._shift_frame(box, frame, tx, ty) @@ -652,6 +652,7 @@ def __call__(self, clip, bbox=[]): else: #Operate on bounding box #bbox is bounding box object + mask = box[:,0] != -1 tx = np.clip(tx, np.max(-1*box[mask,0]), np.min(img_w-box[mask,2])) ty = np.clip(ty, np.max(-1*box[mask,1]), np.min(img_h-box[mask,3])) out_frame, out_box = self._shift_frame(box, frame, tx, ty) From 7547e26967b3b25f1dc4c4adbc7e13ba9260e97a Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Mon, 26 Aug 2019 19:35:44 -0400 Subject: [PATCH 17/55] Convert objects back to numpy after ApplyToPIL --- datasets/preprocessing_transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 6eff344..23c5196 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -739,7 +739,7 @@ def __call__(self, clip, bbox=[]): clip = self._to_pil(clip) output_clip = [] for frame in clip: - output_clip.append(self.transform(frame)) + output_clip.append(np.array(self.transform(frame))) #Apply transform and convert back to Numpy if bbox!=[]: return output_clip, bbox From 1b62a43e3253e76801b9bc12491dfdbaa1e72cf0 Mon Sep 17 00:00:00 2001 From: Madan Date: Thu, 29 Aug 2019 17:31:10 -0400 Subject: [PATCH 18/55] Added extra case for start epoch under resume condition --- train.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/train.py b/train.py index 9082c71..9089435 100644 --- a/train.py +++ b/train.py @@ -115,6 +115,9 @@ def train(**args): optimizer.load_state_dict(load_checkpoint(args['pretrained'], key_name='optimizer')) 
scheduler.step(epoch=start_epoch) + else: + start_epoch = 0 + # END IF else: @@ -194,6 +197,9 @@ def train(**args): # END FOR: Epoch + scheduler.step(epoch=epoch) + print('Schedulers lr: %f', scheduler.get_lr()[0]) + if not args['debug']: # Save Current Model save_path = os.path.join(save_dir, args['dataset']+'_epoch'+str(epoch)+'.pkl') @@ -201,9 +207,6 @@ def train(**args): # END IF: Debug - scheduler.step(epoch=epoch) - print('Schedulers lr: %f', scheduler.get_lr()[0]) - ## START FOR: Validation Accuracy running_acc = [] running_acc = valid(valid_loader, running_acc, model, device, acc_metric) @@ -254,7 +257,6 @@ def valid(valid_loader, running_acc, model, device, acc_metric): parse = Parse() args = parse.get_args() - import pdb; pdb.set_trace() # For reproducibility torch.backends.cudnn.deterministic = True From e3207b0b6c3205e9d63dda10d4643eb964b1b804 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 30 Aug 2019 09:23:56 -0400 Subject: [PATCH 19/55] Corrected a few things for loading testing & validation data --- datasets/YC2BB.py | 37 ++++++++++++++++-------------- datasets/scripts/gen_json_yc2bb.py | 1 + 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py index 6a54ae0..fb195a3 100644 --- a/datasets/YC2BB.py +++ b/datasets/YC2BB.py @@ -103,33 +103,36 @@ def __getitem__(self, idx): vid = base_path.split('/')[-2] seg = base_path.split('/')[-1] - bbox_data = np.zeros((self.clip_length, num_frames_1fps, 5))-1 #[cls_label, xmin, ymin, xmax ymax] - labels = np.zeros((self.clip_length, self.max_objects))-1 + bbox_data = np.zeros((self.clip_length, self.max_objects, 5))-1 #[cls_label, xmin, ymin, xmax ymax] + labels = np.zeros(self.max_objects)-1 - for frame_ind in range(num_frames_1fps): + for frame_ind in range(self.clip_length): frame = vid_info['frames'][frame_ind] #frame_path = frame['img_path'] - num_objs = len(frame['objs']) + num_objs = len(frame['objs']) obj_label = np.zeros((num_objs))-1 #List all unique class ids in entire segment # Extract bbox and label data from video info for obj_ind, obj in enumerate(frame['objs']): - label = self.class_dict[obj['c']] - obj_label[obj_ind] = label + label = self.class_dict[obj['c']] + trackid = obj['trackid'] if self.load_type == 'test' or self.load_type == 'train': #Annotations for test set not publicly available, train not annotated - bbox_data[frame_ind, trackid] = [label, -1, -1, -1, -1] + bbox_data[frame_ind, trackid] = -1*np.ones(5) else: - trackid = obj['trackid'] - obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] - difficult = obj['difficult'] - - bbox_data[frame_ind, trackid, :] = [label] + obj_bbox - labels[frame_ind, trackid] = label - diff_labels[frame_ind, trackid] = difficult - - #input_data.append(cv2.imread(os.path.join(base_path, frame_path), cv2.IMREAD_COLOR)[:,:,(2,1,0)]) - + if obj['occ'] or obj['outside']: + bbox_data[frame_ind, trackid] = -1*np.ones(5) + else: + obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] + bbox_data[frame_ind, trackid, :] = [label] + obj_bbox + + obj_label[obj_ind] = label + labels[trackid] = label + + #Only keep annotations for valid objects + bbox_data = bbox_data[:, :num_objs] + labels = labels[:num_objs] + obj_label = torch.from_numpy(obj_label) num_frames = num_frames_1fps * 25 #video sampled at 25 fps diff --git a/datasets/scripts/gen_json_yc2bb.py b/datasets/scripts/gen_json_yc2bb.py index 2d0baf6..4f4b8f2 100644 --- a/datasets/scripts/gen_json_yc2bb.py +++ b/datasets/scripts/gen_json_yc2bb.py @@ -52,6 +52,7 @@ box = obj['boxes'][f] if 
len(box) == 0: #No annotations + objs.append({'trackid':track_id, 'c':cls_name}) continue xmin = box['xtl'] From 26eb766239b392477f32be083ebd34d98ed90c87 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 30 Aug 2019 17:34:26 -0400 Subject: [PATCH 20/55] Reproduce exact results as paper --- datasets/YC2BB.py | 38 ++++-- metrics.py | 122 +++++++++++++++++ models/dvsa/dvsa.py | 147 ++++++++++++++++++++ models/dvsa/dvsa_utils/transformer.py | 190 ++++++++++++++++++++++++++ 4 files changed, 484 insertions(+), 13 deletions(-) create mode 100644 models/dvsa/dvsa.py create mode 100644 models/dvsa/dvsa_utils/transformer.py diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py index fb195a3..0f511f8 100644 --- a/datasets/YC2BB.py +++ b/datasets/YC2BB.py @@ -28,10 +28,12 @@ def __init__(self, *args, **kwargs): self.max_objects = 15 self.class_dict = _get_class_labels(class_file) + ''' if self.load_type=='train': self.transforms = kwargs['model_obj'].train_transforms else: self.transforms = kwargs['model_obj'].test_transforms + ''' sentences_proc, segments_tuple = _get_segments_and_sentences(self.samples, self.load_type) @@ -93,6 +95,10 @@ def __init__(self, *args, **kwargs): split_lst, len(self.sample_lst), len(self.sample_lst)/len(sentences_proc))) ''' + #Reverse-mapping between class index to canonical label name + def _get_class_labels_reverse(self): + return {v:k for k,v in self.class_dict.items()} + def __getitem__(self, idx): vid_info = self.samples[idx] @@ -103,10 +109,10 @@ def __getitem__(self, idx): vid = base_path.split('/')[-2] seg = base_path.split('/')[-1] - bbox_data = np.zeros((self.clip_length, self.max_objects, 5))-1 #[cls_label, xmin, ymin, xmax ymax] + bbox_data = np.zeros((self.max_objects, num_frames_1fps, 5))-1 #[cls_label, xmin, ymin, xmax ymax] labels = np.zeros(self.max_objects)-1 - for frame_ind in range(self.clip_length): + for frame_ind in range(num_frames_1fps): frame = vid_info['frames'][frame_ind] #frame_path = frame['img_path'] num_objs = len(frame['objs']) @@ -118,22 +124,26 @@ def __getitem__(self, idx): trackid = obj['trackid'] if self.load_type == 'test' or self.load_type == 'train': #Annotations for test set not publicly available, train not annotated - bbox_data[frame_ind, trackid] = -1*np.ones(5) + bbox_data[trackid, frame_ind] = -1*np.ones(5) else: if obj['occ'] or obj['outside']: - bbox_data[frame_ind, trackid] = -1*np.ones(5) + bbox_data[trackid, frame_ind] = -1*np.ones(5) else: obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] - bbox_data[frame_ind, trackid, :] = [label] + obj_bbox + + #re-order to [ymin, xmin, ymax, xmax], rpn proposals are this way I believe + new_order = [1,0,3,2] + obj_bbox = [obj_bbox[i] for i in new_order] + bbox_data[trackid, frame_ind, :] = [label] + obj_bbox obj_label[obj_ind] = label labels[trackid] = label #Only keep annotations for valid objects - bbox_data = bbox_data[:, :num_objs] + bbox_data = bbox_data[:num_objs, :] labels = labels[:num_objs] - obj_label = torch.from_numpy(obj_label) + obj_label = torch.from_numpy(obj_label).long() num_frames = num_frames_1fps * 25 #video sampled at 25 fps ''' @@ -182,14 +192,15 @@ def __getitem__(self, idx): vis_name = '_-_'.join((self.yc2_split, rec, vid, seg)) ret_dict = dict() - ret_dict['data'] = (x_rpn, obj_label) + ret_dict['data'] = [x_rpn, obj_label, self.load_type] annot_dict = dict() - annot_dict['box'] = bbox_data - annot_dict['box_label'] = labels - annot_dict['rpn'] = rpn - annot_dict['rpn_original'] = rpn_original - annot_dict['vis_name'] = vis_name + annot_dict['box'] = 
bbox_data + annot_dict['box_label'] = obj_label + annot_dict['rpn'] = rpn + annot_dict['rpn_original'] = rpn_original + annot_dict['vis_name'] = vis_name + annot_dict['class_labels_dict'] = self._get_class_labels_reverse() ret_dict['annots'] = annot_dict return ret_dict @@ -231,3 +242,4 @@ def _get_class_labels(class_file): class_dict[row[r]] = int(row[0]) return class_dict + diff --git a/metrics.py b/metrics.py index f4e22b3..0b6b6bd 100644 --- a/metrics.py +++ b/metrics.py @@ -21,6 +21,8 @@ def __init__(self, *args, **kwargs): self.metric_object = MAP(*args, **kwargs) elif self.metric_type == 'SSD_AP': self.metric_object = SSD_AP(*args, **kwargs) + elif self.metric_type == 'Box_Accuracy': + self.metric_object = Box_Accuracy(*args, **kwargs) else: self.metric_type = None @@ -513,3 +515,123 @@ def get_accuracy(self, detections, data): return self.get_AP(self.predictions, self.targets) +class Box_Accuracy(): + """ + Box accuracy computation + + """ + def __init__(self, *args, **kwargs): + from collections import defaultdict + + self.thresh = kwargs['accu_thresh'] + self.fps = kwargs['fps'] + self.test_mode = 1 if kwargs['load_type'] == 'test' else 0 + self.IOU = IOU() + self.ba_score = defaultdict(list) #box accuracy metric + + self.ndata = kwargs['ndata'] + self.count = 0 + + def get_accuracy(self, predictions, data): + attn_weights = predictions + + N = attn_weights.shape[0] + + rpn_batch = data['rpn_original'] + box_batch = data['box'] + obj_batch = data['box_label'] + box_label_batch = obj_batch + vis_name = data['vis_name'] + class_labels_dict = data['class_labels_dict'] + + # fps is the frame rate of the attention map + # both rpn_batch and box_batch have fps=1 + _, T_rp, num_proposals, _ = rpn_batch.size() + _, O, T_gt, _ = box_batch.size() + T_attn = attn_weights.size(2) + + assert(T_rp == T_gt) # both sampled at 1fps + #print('# of frames in gt: {}, # of frames in resampled attn. 
map: {}'.format(T_gt, np.rint(T_attn/self.fps))) + + hits, misses = [0 for o in range(O)], [0 for o in range(O)] + + results = [] + pos_counter = 0 + neg_counter = 0 + segment_dict = {} #segment dictionary - to output results to JSON file + all_objects = [] + + for o in range(O): + object_dict = {} + if box_label_batch[0, o] not in obj_batch[0, :]: + print('object {} is not grounded!'.format(box_label_batch[0, o])) + continue # don't compute score if the object is not grounded + obj_ind_in_attn = (obj_batch[0, :] == box_label_batch[0, o]).nonzero().squeeze() + if obj_ind_in_attn.numel() > 1: + obj_ind_in_attn = obj_ind_in_attn[0] + else: + obj_ind_in_attn = obj_ind_in_attn.item() + + new_attn_weights = attn_weights[0, obj_ind_in_attn] + _, max_attn_ind = torch.max(new_attn_weights, dim=1) + + # uncomment this for the random baseline + # max_attn_ind = torch.floor(torch.rand(T_attn)*num_proposals).long() + label = class_labels_dict[box_label_batch[0,o].item()] + object_dict = {'label':label} + + boxes = [] + for t in range(T_gt): + if box_batch[0,o,t,0] == -1: # object is outside/non-exist/occlusion + boxes.append({'xtl':-1, 'ytl':-1, 'xbr':-1, 'ybr':-1, 'outside':1, 'occluded':1}) #object is either occluded or outside of frame + neg_counter += 1 + continue + pos_counter += 1 + box_ind = max_attn_ind[int(min(np.rint(t*self.fps), T_attn-1))] + box_coord = rpn_batch[0, t, box_ind, :].view(4) # x_tl, y_tl, x_br, y_br + gt_box = box_batch[0,o,t][torch.Tensor([2,1,4,3]).type(box_batch.type()).long()].view(1,4) # inverse x and y + + if self.IOU.get_accuracy(box_coord, gt_box.float())[0].item() > self.thresh: + hits[o] += 1 + else: + misses[o] += 1 + + xtl = box_coord[0].item() + ytl = box_coord[1].item() + xbr = box_coord[2].item() + ybr = box_coord[3].item() + boxes.append({'xtl':xtl, 'ytl':ytl, 'xbr':xbr, 'ybr':ybr, 'outside':0, 'occluded':0}) + + object_dict['boxes'] = boxes + all_objects.append(object_dict) + + results.append((box_label_batch[0, o].item(), hits[o], misses[o])) + + segment_dict['objects'] = all_objects + #print('percentage of frames with box: {}'.format(pos_counter/(pos_counter+neg_counter))) + + for (i,h,m) in results: + self.ba_score[i].append((h,m)) + + + self.count += N + if self.count < self.ndata: + return -1 + + if self.test_mode: #Annotations for the testing split are not publicly available + return -1 + + ba_final = [] + for k, r in self.ba_score.items(): + cur_hit = 0 + cur_miss = 0 + for v in r: + cur_hit += v[0] + cur_miss += v[1] + + if cur_hit+cur_miss != 0: + #print('BA for {}(...): {:.4f}'.format(k, cur_hit/(cur_hit+cur_miss))) + ba_final.append(cur_hit/(cur_hit+cur_miss)) + + #print('The overall BA is: {:.4f}'.format(np.mean(ba_final))) + return np.mean(ba_final) diff --git a/models/dvsa/dvsa.py b/models/dvsa/dvsa.py new file mode 100644 index 0000000..0543036 --- /dev/null +++ b/models/dvsa/dvsa.py @@ -0,0 +1,147 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +import math +import numpy as np +from functools import partial +import os + +from models.dvsa.dvsa_utils.transformer import Transformer + +class DVSA(nn.Module): + +# def __init__(self, num_class, input_size=2048, enc_size=128, dropout=0.2, hidden_size=256, n_layers=1, n_heads=4, attn_drop=0.2, num_frm=5, has_loss_weighting=False): + def __init__(self, **kwargs): + super().__init__() + num_class = kwargs['num_class'] + input_size = kwargs['input_size'] + enc_size = kwargs['enc_size'] + dropout = kwargs['dropout'] + hidden_size = 
kwargs['hidden_size'] + n_layers = kwargs['n_layers'] + n_heads = kwargs['n_heads'] + attn_drop = kwargs['attn_drop'] + num_frm = kwargs['num_frm'] + has_loss_weighting = kwargs['has_loss_weighting'] + + # encode the region feature + self.feat_enc = nn.Sequential( + nn.Linear(input_size, enc_size), + nn.Dropout(p=dropout), + nn.ReLU() + ) + + self.sigmoid = nn.Sigmoid() + + # lookup table for object label embedding + self.obj_emb = nn.Embedding(num_class+1, enc_size) # +1 for the dummy paddings + self.num_class = num_class + + self.obj_interact = Transformer(enc_size, 0, 0, + d_hidden=hidden_size, + n_layers=n_layers, + n_heads=n_heads, + drop_ratio=attn_drop) + + self.obj_interact_fc = nn.Sequential( + nn.Linear(enc_size*2, int(enc_size/2)), + nn.ReLU(), + nn.Linear(int(enc_size/2), 5), # object interaction guidance (always 5 snippets) + nn.Sigmoid() + ) + + self.num_frm = num_frm + self.has_loss_weighting = has_loss_weighting + + if isinstance(kwargs['pretrained'], int) and kwargs['pretrained']: + self._load_pretrained_weights() + + def forward(self, x_o, obj, load_type): + is_evaluate = 1 if load_type[0] == 'test' or load_type[0] == 'val' else 0 + if is_evaluate: + return self.output_attn(x_o, obj) + + x_o = self.feat_enc(x_o.permute(0,2,3,1).contiguous()).permute(0,3,1,2).contiguous() + + x_o = torch.stack([x_o[0], x_o[1], x_o[0]]) + obj = torch.stack([obj[0], obj[0], obj[1]]) + + N, C_out, T, num_proposals = x_o.size() + assert(N == 3) # two pos samples and one neg sample + + # attention + O = obj.size(1) + attn_key = self.obj_emb(obj) + + num_pos_obj = torch.sum(obj[0]0: + tmp.append(obj_attn_emb[:, :, i:(i+1)].expand(1, num_pos_obj, l)) + obj_attn_emb = torch.cat(tmp, 2).squeeze(0) + assert(obj_attn_emb.size(1) == self.num_frm) + + loss_weigh = torch.mean(obj_attn_emb, dim=0) + loss_weigh = torch.cat((loss_weigh, loss_weigh)).unsqueeze(1) + + if self.has_loss_weighting: + # dot-product attention + x_o = x_o.view(N, 1, C_out, T, num_proposals) + attn_weights = self.sigmoid((x_o*attn_key.view(N, O, C_out, 1, 1)).sum(2)/math.sqrt(C_out)) + + pos_weights = attn_weights[0, :num_pos_obj, :, :] + neg1_weights = attn_weights[1, :num_pos_obj, :, :] + neg2_weights = attn_weights[2, :num_neg_obj, :, :] + + return torch.cat((torch.stack((torch.mean(torch.max(pos_weights, dim=2)[0], dim=0), torch.mean(torch.max(neg1_weights, dim=2)[0], dim=0)), dim=1), + torch.stack((torch.mean(torch.max(pos_weights, dim=2)[0], dim=0), torch.mean(torch.max(neg2_weights, dim=2)[0], dim=0)), dim=1))), loss_weigh + else: + # dot-product attention + x_o = x_o.view(N, 1, C_out, T*num_proposals) + attn_weights = self.sigmoid((x_o*attn_key.view(N, O, C_out, 1)).sum(2)/math.sqrt(C_out)) + + pos_weights = attn_weights[0, :num_pos_obj, :] + neg1_weights = attn_weights[1, :num_pos_obj, :] + neg2_weights = attn_weights[2, :num_neg_obj, :] + + return torch.stack((torch.stack((torch.mean(torch.max(pos_weights, dim=1)[0]), torch.mean(torch.max(neg1_weights, dim=1)[0]))), + torch.stack((torch.mean(torch.max(pos_weights, dim=1)[0]), torch.mean(torch.max(neg2_weights, dim=1)[0]))))), loss_weigh + + def output_attn(self, x_o, obj): + x_o = self.feat_enc(x_o.permute(0,2,3,1).contiguous()).permute(0,3,1,2).contiguous() + + N, C_out, T, num_proposals = x_o.size() + assert(N == 1) # two pos samples and one neg sample + + # attention + O = obj.size(1) + attn_key = self.obj_emb(obj) + + # dot-product attention + x_o = x_o.view(N, 1, C_out, T*num_proposals) + attn_weights = self.sigmoid((x_o*attn_key.view(N, O, C_out, 
1)).sum(2)/math.sqrt(C_out)) + # attn_weights = self.sigmoid((x_e*attn_key.view(N, O, C_out, 1).expand(N, O, C_out, T*num_proposals)).sum(2)) # N, O, T, H*W + + # additive attention + # x_e = x_o.view(N, 1, C_out, T, H*W).contiguous().expand(N, O, C_out, T, H*W) + # attn_e = attn_key.view(N, O, C_out, 1, 1).expand(N, O, C_out, T, H*W) + # attn_weights = self.attn_mlp(torch.cat((x_e, attn_e), dim=2).permute(0,1,3,4,2).contiguous()).squeeze(4) # N, O, T, H*W + + return attn_weights.view(N, O, T, num_proposals) + + def _load_pretrained_weights(self): + state_dict = torch.load('weights/yc2bb_full-model.pth', map_location=lambda storage, location: storage) + + self.load_state_dict(state_dict) + diff --git a/models/dvsa/dvsa_utils/transformer.py b/models/dvsa/dvsa_utils/transformer.py new file mode 100644 index 0000000..38f9e3a --- /dev/null +++ b/models/dvsa/dvsa_utils/transformer.py @@ -0,0 +1,190 @@ +# Originally from https://github.com/salesforce/densecap +""" + Copyright (c) 2018, salesforce.com, inc. + All rights reserved. + SPDX-License-Identifier: BSD-3-Clause + For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause +""" +# Last modified by Luowei Zhou on 07/01/2018 + +import torch +from torch import nn +from torch.nn import functional as F +from torch.autograd import Variable + +import random +import string +import sys +import math +import uuid +import numpy as np + +INF = 1e10 + +def positional_encodings_like(x, t=None): + if t is None: + positions = torch.arange(0, x.size(1)) + if x.is_cuda: + positions = positions.cuda(x.get_device()) + else: + positions = t + encodings = x.new(*x.size()[1:]).fill_(0) + if x.is_cuda: + encodings = encodings.cuda(x.get_device()) + + + for channel in range(x.size(-1)): + if channel % 2 == 0: + encodings[:, channel] = torch.sin( + positions.float() / 10000 ** (channel / x.size(2))) + else: + encodings[:, channel] = torch.cos( + positions.float() / 10000 ** ((channel - 1) / x.size(2))) + return Variable(encodings) + +class Linear(nn.Linear): + + def forward(self, x): + size = x.size() + return super().forward( + x.contiguous().view(-1, size[-1])).view(*size[:-1], -1) + +# F.softmax has strange default behavior, normalizing over dim 0 for 3D inputs +# deprecated since PyTorch 0.3 +# def softmax(x): +# if x.dim() == 3: +# return F.softmax(x.transpose(0, 2)).transpose(0, 2) +# return F.softmax(x) + +# torch.matmul can't do (4, 3, 2) @ (4, 2) -> (4, 3) +def matmul(x, y): + if x.dim() == y.dim(): + return x @ y + if x.dim() == y.dim() - 1: + return (x.unsqueeze(-2) @ y).squeeze(-2) + return (x @ y.unsqueeze(-2)).squeeze(-2) + +class LayerNorm(nn.Module): + + def __init__(self, d_model, eps=1e-6): + super().__init__() + self.gamma = nn.Parameter(torch.ones(d_model)) + self.beta = nn.Parameter(torch.zeros(d_model)) + self.eps = eps + + def forward(self, x): + mean = x.mean(-1, keepdim=True) + std = x.std(-1, keepdim=True) + return self.gamma * (x - mean) / (std + self.eps) + self.beta + +class ResidualBlock(nn.Module): + + def __init__(self, layer, d_model, drop_ratio): + super().__init__() + self.layer = layer + self.dropout = nn.Dropout(drop_ratio) + self.layernorm = LayerNorm(d_model) + + def forward(self, *x): + return self.layernorm(x[0] + self.dropout(self.layer(*x))) + +class Attention(nn.Module): + + def __init__(self, d_key, drop_ratio, causal): + super().__init__() + self.scale = math.sqrt(d_key) + self.dropout = nn.Dropout(drop_ratio) + self.causal = causal + + def forward(self, query, key, 
value): + dot_products = matmul(query, key.transpose(1, 2)) + if query.dim() == 3 and (self is None or self.causal): + tri = torch.ones(key.size(1), key.size(1)).triu(1) * INF + if key.is_cuda: + tri = tri.cuda(key.get_device()) + dot_products.data.sub_(tri.unsqueeze(0)) + return matmul(self.dropout(F.softmax(dot_products / self.scale, dim=2)), value) + +class MultiHead(nn.Module): + + def __init__(self, d_key, d_value, n_heads, drop_ratio, causal=False): + super().__init__() + self.attention = Attention(d_key, drop_ratio, causal=causal) + self.wq = Linear(d_key, d_key, bias=False) + self.wk = Linear(d_key, d_key, bias=False) + self.wv = Linear(d_value, d_value, bias=False) + self.wo = Linear(d_value, d_key, bias=False) + self.n_heads = n_heads + + def forward(self, query, key, value): + query, key, value = self.wq(query), self.wk(key), self.wv(value) + query, key, value = ( + x.chunk(self.n_heads, -1) for x in (query, key, value)) + return self.wo(torch.cat([self.attention(q, k, v) + for q, k, v in zip(query, key, value)], -1)) + +class FeedForward(nn.Module): + + def __init__(self, d_model, d_hidden): + super().__init__() + self.linear1 = Linear(d_model, d_hidden) + self.linear2 = Linear(d_hidden, d_model) + + def forward(self, x): + return self.linear2(F.relu(self.linear1(x))) + +class EncoderLayer(nn.Module): + + def __init__(self, d_model, d_hidden, n_heads, drop_ratio): + super().__init__() + self.selfattn = ResidualBlock( + MultiHead(d_model, d_model, n_heads, drop_ratio), + d_model, drop_ratio) + self.feedforward = ResidualBlock(FeedForward(d_model, d_hidden), + d_model, drop_ratio) + + def forward(self, x): + return self.feedforward(self.selfattn(x, x, x)) + +class Encoder(nn.Module): + + def __init__(self, d_model, d_hidden, n_vocab, n_layers, n_heads, + drop_ratio): + super().__init__() + # self.linear = nn.Linear(d_model*2, d_model) + self.layers = nn.ModuleList( + [EncoderLayer(d_model, d_hidden, n_heads, drop_ratio) + for i in range(n_layers)]) + self.dropout = nn.Dropout(drop_ratio) + + def forward(self, x, mask=None): + # x = self.linear(x) + x = x+positional_encodings_like(x) + x = self.dropout(x) + if mask is not None: + x = x*mask + encoding = [] + for layer in self.layers: + x = layer(x) + if mask is not None: + x = x*mask + encoding.append(x) + return encoding + +class Transformer(nn.Module): + + def __init__(self, d_model, n_vocab_src, vocab_trg, d_hidden=2048, + n_layers=6, n_heads=8, drop_ratio=0.1): + super().__init__() + self.encoder = Encoder(d_model, d_hidden, n_vocab_src, n_layers, + n_heads, drop_ratio) + + def denum(self, data): + return ' '.join(self.decoder.vocab.itos[i] for i in data).replace( + ' ', '#').replace(' ', '') + + def forward(self, x): + encoding = self.encoder(x) + + return encoding[-1], encoding + From 7d9c2e2a8fbd1996079cf408a1fef686a13a38d9 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sat, 31 Aug 2019 14:52:42 -0400 Subject: [PATCH 21/55] Big change in eval, plus some comments --- eval.py | 8 +++++--- metrics.py | 43 ++++++++++++++++++++++++++++++++----------- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/eval.py b/eval.py index 2d5ed2e..4ea504b 100644 --- a/eval.py +++ b/eval.py @@ -90,10 +90,12 @@ def eval(**args): with torch.no_grad(): for step, data in enumerate(eval_loader): - x_input = data['data'].to(device) + x_input = data['data'] + for i, item in enumerate(x_input): + if isinstance(item, torch.Tensor): + x_input[i] = item.to(device) annotations = data['annots'] - - outputs = model(x_input) + outputs = 
model(*x_input) acc = acc_metric.get_accuracy(outputs, annotations) diff --git a/metrics.py b/metrics.py index 0b6b6bd..1f4daf9 100644 --- a/metrics.py +++ b/metrics.py @@ -161,8 +161,8 @@ def __init__(self, threshold=0.5, num_points=101, *args, **kwargs): """ Compute Average Precision (AP) Args: - threshold (scalar): iou threshold - num_points (scalar): number of points to average for the interpolated AP calculation + threshold (float): iou threshold + num_points (int): number of points to average for the interpolated AP calculation Return: None @@ -188,10 +188,10 @@ def compute_class_ap(self, tp, fp, npos): Args: tp (Tensor, shape [N*D]): cumulative sum of true positive detections fp (Tensor, shape [N*D]): cumulative sum of false positive detections - npos (Tensor, scalar): actual positives (from ground truth) + npos (Tensor, int): actual positives (from ground truth) Return: - ap (Tensor, scalar): average precision calculation + ap (Tensor, float): average precision calculation """ #Values for precision-recall curve @@ -220,7 +220,7 @@ def get_AP(self, predictions, targets): D_: ground truth detections Return: - avg_ap (Tensor, scalar): mean ap across all classes + avg_ap (Tensor, float): mean ap across all classes """ N,C,D,_ = predictions.shape @@ -331,7 +331,7 @@ def __init__(self, threshold=torch.linspace(0.5,0.95,10), num_points=101, *args, Args: threshold (Tensor, shape[10]): Calculate AP at each of these threshold values - num_points (scalar): number of points to average for the interpolated AP calculation + num_points (float): number of points to average for the interpolated AP calculation """ self.threshold = threshold @@ -413,7 +413,7 @@ def __init__(self, threshold=0.5, det=None, *args, **kwargs): Compute Average Recall (AR) Args: - threshold: (scalar) + threshold: (float) det: max number of detections per image (optional) """ @@ -459,11 +459,11 @@ def __init__(self, threshold=0.5, num_points=11, *args, **kwargs): """ Compute Average Precision (AP) Args: - threshold (scalar): iou threshold - num_points (scalar): number of points to average for the interpolated AP calculation + threshold (float): iou threshold + num_points (int): number of points to average for the interpolated AP calculation final_shape (list) : [height, width] of input given to CNN result_dir (String): save detections to this location - ndata (scalar): total number of datapoints in dataset + ndata (int): total number of datapoints in dataset Return: None @@ -517,7 +517,14 @@ def get_accuracy(self, detections, data): class Box_Accuracy(): """ - Box accuracy computation + Box accuracy computation for YC2-BB model. 
+ Adapted from: https://github.com/MichiganCOG/Video-Grounding-from-Text/blob/master/tools/test_util.py + + Args: + accu_thres: (float) iou threshold + fps: (int) frames per second video annotations were sampled at + load_type: (String) data split, only validation has publicly available annotations + ndata (int): total number of datapoints in dataset """ def __init__(self, *args, **kwargs): @@ -533,6 +540,20 @@ def __init__(self, *args, **kwargs): self.count = 0 def get_accuracy(self, predictions, data): + """ + Args: + predictions: (Tensor, shape [N,W,T,D]), attention weight output from model + data: (dictionary) + - rpn_original (Tensor, shape [N,T,D,4]) + - box (Tensor, shape [N,T,D,5]), [cls_label, ytl, xtl, ybr, xbr] (note order in coordinates is different) + - box_label (Tensor, shape [N,W]) + - vis_name (List, shape [N]), unique segment identifier + - class_labels_dict (dict, length 67) class index to class label mapping + + W: unique word in segment (from YC2BB class dictionary) + Return: + Box accuracy score + """ attn_weights = predictions N = attn_weights.shape[0] From e71ed17283e1d28240c37dbb95dff4582b223447 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Wed, 4 Sep 2019 14:08:00 -0400 Subject: [PATCH 22/55] Updated ApplyToPIL and added TranslateClip tests --- datasets/preprocessing_transforms.py | 52 +++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 23c5196..6ec22b2 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -594,6 +594,7 @@ class RandomTranslateClip(PreprocTransform): """ Random horizontal and/or vertical shift on frames in a clip Shift will be bounded by object bounding box (if given). 
Meaning, object will always be in view + Input numpy array must be of type np.uint8 Args: - translate (Tuple) @@ -606,7 +607,7 @@ def __init__(self, translate, **kwargs): self.max_x, self.max_y = translate assert(self.max_x >= 0.0 and self.max_y >= 0.0) - assert(self.max_x < 1.0 and self.max_y < 1.0) #Cannot shift pass image bounds + assert(self.max_x < 1.0 and self.max_y < 1.0) #Cannot shift past image bounds def _shift_frame(self, bbox, frame, tx, ty): M = np.array([[1, 0, tx],[0, 1, ty]], dtype=np.float) # 2 x 3 transformation matrix @@ -638,7 +639,7 @@ def __call__(self, clip, bbox=[]): out_bbox = [] for frame, box in zip(clip,bbox): - img_h, img_w, _ = frame.shape + img_h, img_w = frame.shape[:2] tx = int(img_w * frac_x) ty = int(img_h * frac_y) @@ -664,11 +665,12 @@ def __call__(self, clip, bbox=[]): return out_clip, out_bbox else: for frame in clip: - img_h, img_w, _ = frame.shape + img_h, img_w = frame.shape[:2] tx = int(img_w * frac_x) ty = int(img_h * frac_y) out_clip.append(self._shift_frame(None, frame, tx, ty)) + return out_clip class SubtractMeanClip(PreprocTransform): def __init__(self, **kwargs): @@ -735,11 +737,31 @@ def __init__(self, **kwargs): self.transform = kwargs['transform'](**self.class_kwargs) def __call__(self, clip, bbox=[]): + input_pil = True + output_clip = [] + if not isinstance(clip[0], Image.Image): clip = self._to_pil(clip) - output_clip = [] - for frame in clip: - output_clip.append(np.array(self.transform(frame))) #Apply transform and convert back to Numpy + clip = [frame.convert('RGB') for frame in clip] + input_pil = False + + if input_pil: + for frame in clip: + transformed_frame = self.transform(frame) + if isinstance(transformed_frame, tuple) or isinstance(transformed_frame, list): + for tf in transformed_frame: + output_clip.append(tf) + else: + output_clip.append(self.transform(frame)) #Apply transform and convert back to Numpy + + else: + for frame in clip: + transformed_frame = self.transform(frame) + if isinstance(transformed_frame, tuple) or isinstance(transformed_frame, list): + for tf in transformed_frame: + output_clip.append(np.array(tf)) + else: + output_clip.append(np.array(self.transform(frame))) #Apply transform and convert back to Numpy if bbox!=[]: return output_clip, bbox @@ -820,6 +842,7 @@ def __init__(self): self.rand_flip_h = RandomFlipClip(direction='h', p=1.0) self.rand_flip_v = RandomFlipClip(direction='v', p=1.0) self.rand_rot = RandomRotateClip(angles=[90]) + self.rand_trans = RandomTranslateClip(translate=(0.5,0.5)) self.sub_mean = SubtractMeanClip(clip_mean=np.zeros(1)) self.applypil = ApplyToPIL(transform=torchvision.transforms.ColorJitter, class_kwargs=dict(brightness=1)) self.applypil2 = ApplyToPIL(transform=torchvision.transforms.FiveCrop, class_kwargs=dict(size=(64,64))) @@ -916,6 +939,14 @@ def rand_rot_test(self): out_bbox = self.rand_rot([inp2], np.array([bbox]))[1][0].tolist() assert (False not in np.isclose(out, exp_out)) and (False not in np.isclose(exp_bbox, out_bbox)) + + def rand_trans_test(self): + x = np.arange(112*112).reshape(112,112).astype(np.uint8) + out = self.rand_trans([x]) + out2 = self.rand_trans([x], bbox=[np.array([[32,32,96,96]])]) + + assert (out2[1][0].min() >= 0) and (out[0].shape==(112,112)) and (out2[0][0].shape==(112,112)) + def rand_rot_vis(self): import matplotlib.pyplot as plt self.rand_rot._update_angles([20]) @@ -942,11 +973,13 @@ def rand_rot_vis(self): def applypil_test(self): inp = np.arange(112*112).reshape(112,112) + np_inp = [inp, inp] inp = self.applypil._to_pil([inp, inp]) 
inp = [inp[0].convert('RGB'), inp[1].convert('RGB')] - out1 = self.applypil(inp) - out = self.applypil2(out1) - assert (len(out)==2) and (len(out[0])==5) and (out[0][0].size==(64,64)) and (isinstance(out[0][0], Image.Image)) + out = self.applypil(inp) + out2 = self.applypil2(out) + out3 = self.applypil(np_inp) + assert (len(out2)==2*5) and (out2[0].size==(64,64)) and (isinstance(out2[0], Image.Image)) and (isinstance(out3[0], np.ndarray)) def applytensor_test(self): inp = np.arange(112*112*3).reshape(3,112,112).astype('float32') @@ -996,6 +1029,7 @@ def run_tests(self): self.rand_crop_test() self.rand_flip_test() self.rand_rot_test() + self.rand_trans_test() self.applypil_test() self.applytensor_test() self.applycv_test() From b3e943538328812b807328268ca651ff847130d9 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 5 Sep 2019 10:53:25 -0400 Subject: [PATCH 23/55] Add scaling on clips - need to select anchor for scaling --- datasets/preprocessing_transforms.py | 95 +++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 23c5196..b85557c 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -592,7 +592,7 @@ def __call__(self, clip, bbox=[]): class RandomTranslateClip(PreprocTransform): """ - Random horizontal and/or vertical shift on frames in a clip + Random horizontal and/or vertical shift on frames in a clip. All frames receive same shifting Shift will be bounded by object bounding box (if given). Meaning, object will always be in view Args: @@ -669,6 +669,99 @@ def __call__(self, clip, bbox=[]): ty = int(img_h * frac_y) out_clip.append(self._shift_frame(None, frame, tx, ty)) + return out_clip + +class RandomScaleClip(PreprocTransform): + """ + Random scaling on all frames in a clip. All frames receive same scaling + Shift will be bounded by object bounding box (if given). 
Meaning, object will always be in view + + Args: + - scale (Tuple) + - min_scale (float): minimum scaling on frame + - max_scale (float): maximum scaling on frame + """ + def __init__(self, scale, **kwargs): + super(RandomScaleClip, self).__init__(**kwargs) + + self.min_scale, self.max_scale = scale + + assert(self.min_scale <= self.max_scale) + + def _scale_frame(self, bbox, frame, sx, sy): + M = np.array([[sx, 0, 1],[0, sy, 1]], dtype=np.float) # 2 x 3 transformation matrix + out_frame = cv2.warpAffine(frame, M, (frame.shape[1], frame.shape[0])) + + if bbox is not None: + bbox_h = np.reshape(bbox, (-1,2)) #x-y coords + bbox_h = np.concatenate((bbox_h, np.ones((bbox_h.shape[0],1))), axis=1).transpose() #homography coords + + out_box = M @ bbox_h + + if bbox.shape[-1] == 2: #Operate on point coordinates + out_box = np.reshape(out_box.transpose(), (bbox.shape[0], bbox.shape[1],2)) + else: #Operate on bounding box + out_box = np.reshape(out_box.transpose(), (-1,4)) + + return out_frame, out_box + else: + return out_frame + + def __call__(self, clip, bbox=[]): + out_clip = [] + clip = self._to_numpy(clip) + + sc = np.random.uniform(self.min_scale, self.max_scale) + print('Randomly selected scale: {}'.format(sc)) + + if bbox != []: + out_bbox = [] + + for frame, box in zip(clip,bbox): + img_h, img_w, _ = frame.shape + sx = np.ceil(img_w * sc) + sy = np.ceil(img_h * sc) + + #Bound scaling so all objects remain in scene + if box.shape[-1] == 2: #Operate on point coordinates + mask = box[:,:,0] != -1 + sx = min(img_w, np.max(sc*box[mask,0])) + sy = min(img_h, np.max(sc*box[mask,1])) + + if sx == img_w or sy == img_h: + sc = min(sx/np.max(box[mask,0]), sy/np.max(box[mask,1])) + else: + sc = min(sx/np.min(box[mask,0]), sy/np.min(box[mask,1])) + + out_frame, out_box = self._scale_frame(box, frame, sc, sc) + out_box[~mask] = -1*np.ones(2) + + else: #Operate on bounding box + #bbox is bounding box object + mask = box[:,0] != -1 + sx = min(img_w, np.max(sc*box[mask,2])) + sy = min(img_h, np.max(sc*box[mask,3])) + + if sx == img_w or sy == img_h: + sc = min(sx/np.max(box[mask,2]), sy/np.max(box[mask,3])) + else: + sc = min(sx/np.min(box[mask,2]), sy/np.min(box[mask,3])) + + out_frame, out_box = self._scale_frame(box, frame, sc, sc) + out_box[~mask] = -1*np.ones(4) + + out_clip.append(out_frame) + out_bbox.append(out_box) + + return out_clip, out_bbox + else: + for frame in clip: + img_h, img_w, _ = frame.shape + sx = int(img_w * sc) + sy = int(img_h * sc) + + out_clip.append(self._scale_frame(None, frame, sc, sc)) + return out_clip class SubtractMeanClip(PreprocTransform): def __init__(self, **kwargs): From d5b01037857d04ac545056197d3341ec8d868cc0 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 5 Sep 2019 17:39:14 -0400 Subject: [PATCH 24/55] Anchor scaling to center of image, changed name to RandomZoomClip --- datasets/preprocessing_transforms.py | 71 ++++++++++++++++------------ 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index e76e76f..44d8542 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -673,14 +673,14 @@ def __call__(self, clip, bbox=[]): return out_clip -class RandomScaleClip(PreprocTransform): +class RandomZoomClip(PreprocTransform): """ - Random scaling on all frames in a clip. All frames receive same scaling - Shift will be bounded by object bounding box (if given). Meaning, object will always be in view + Random zoom on all frames in a clip. 
All frames receive same scaling + Scale will be bounded by object bounding box (if given). Meaning, object will always be in view Args: - scale (Tuple) - - min_scale (float): minimum scaling on frame + - min_scale (float): minimum scaling on frame - max_scale (float): maximum scaling on frame """ def __init__(self, scale, **kwargs): @@ -688,10 +688,10 @@ def __init__(self, scale, **kwargs): self.min_scale, self.max_scale = scale - assert(self.min_scale <= self.max_scale) + assert(self.min_scale > 0 and self.min_scale <= self.max_scale) - def _scale_frame(self, bbox, frame, sx, sy): - M = np.array([[sx, 0, 1],[0, sy, 1]], dtype=np.float) # 2 x 3 transformation matrix + def _scale_frame(self, bbox, frame, sc): + M = cv2.getRotationMatrix2D((frame.shape[1]/2, frame.shape[0]/2), 0, sc) # 2 x 3 rotation matrix out_frame = cv2.warpAffine(frame, M, (frame.shape[1], frame.shape[0])) if bbox is not None: @@ -714,42 +714,51 @@ def __call__(self, clip, bbox=[]): clip = self._to_numpy(clip) sc = np.random.uniform(self.min_scale, self.max_scale) - print('Randomly selected scale: {}'.format(sc)) if bbox != []: out_bbox = [] for frame, box in zip(clip,bbox): - img_h, img_w, _ = frame.shape - sx = np.ceil(img_w * sc) - sy = np.ceil(img_h * sc) + img_h, img_w = frame.shape[:2] + cx, cy = (img_w/2, img_h/2) #Bound scaling so all objects remain in scene if box.shape[-1] == 2: #Operate on point coordinates mask = box[:,:,0] != -1 - sx = min(img_w, np.max(sc*box[mask,0])) - sy = min(img_h, np.max(sc*box[mask,1])) - - if sx == img_w or sy == img_h: - sc = min(sx/np.max(box[mask,0]), sy/np.max(box[mask,1])) - else: - sc = min(sx/np.min(box[mask,0]), sy/np.min(box[mask,1])) - out_frame, out_box = self._scale_frame(box, frame, sc, sc) + max_x = min(img_w, np.max(cx + sc * (box[mask,0] - cx))) + min_x = max(0, np.min(cx + sc * (box[mask,0] - cx))) + sx = (max_x - cx) / np.max(box[mask,0] - cx) + if min_x == 0: + sx = min(sx, (min_x - cx) / np.min(box[mask,0] - cx)) + + max_y = min(img_h, np.max(cy + sc * (box[mask,1] - cy))) + min_y = max(0, np.min(cy + sc * (box[mask,1] - cy))) + sy = (max_y - cy) / np.max(box[mask,1] - cy) + if min_y == 0: + sy = min(sy, (min_y - cy) / np.min(box[mask,1] - cy)) + + sc = min(sx, sy) + out_frame, out_box = self._scale_frame(box, frame, sc) out_box[~mask] = -1*np.ones(2) else: #Operate on bounding box - #bbox is bounding box object mask = box[:,0] != -1 - sx = min(img_w, np.max(sc*box[mask,2])) - sy = min(img_h, np.max(sc*box[mask,3])) - - if sx == img_w or sy == img_h: - sc = min(sx/np.max(box[mask,2]), sy/np.max(box[mask,3])) - else: - sc = min(sx/np.min(box[mask,2]), sy/np.min(box[mask,3])) - - out_frame, out_box = self._scale_frame(box, frame, sc, sc) + + max_x = min(img_w, np.max(cx + sc * (box[mask,2] - cx))) + min_x = max(0, np.min(cx + sc * (box[mask,0] - cx))) + sx = (max_x - cx) / np.max(box[mask,2] - cx) + if min_x == 0: + sx = min(sx, (min_x - cx) / np.min(box[mask,0] - cx)) + + max_y = min(img_h, np.max(cy + sc * (box[mask,3] - cy))) + min_y = max(0, np.min(cy + sc * (box[mask,1] - cy))) + sy = (max_y - cy) / np.max(box[mask,3] - cy) + if min_y == 0: + sy = min(sy, (min_y - cy) / np.min(box[mask,1] - cy)) + + sc = min(sx, sy) + out_frame, out_box = self._scale_frame(box, frame, sc) out_box[~mask] = -1*np.ones(4) out_clip.append(out_frame) @@ -758,11 +767,11 @@ def __call__(self, clip, bbox=[]): return out_clip, out_bbox else: for frame in clip: - img_h, img_w, _ = frame.shape + img_h, img_w = frame.shape[:2] sx = int(img_w * sc) sy = int(img_h * sc) - 
out_clip.append(self._scale_frame(None, frame, sc, sc)) + out_clip.append(self._scale_frame(None, frame, sc)) return out_clip class SubtractMeanClip(PreprocTransform): From c24d3eeb26d42eb9c998fcdf303768acab9450a3 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 11 Sep 2019 09:51:04 -0400 Subject: [PATCH 25/55] small fix --- datasets/preprocessing_transforms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 44d8542..b780073 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -684,7 +684,7 @@ class RandomZoomClip(PreprocTransform): - max_scale (float): maximum scaling on frame """ def __init__(self, scale, **kwargs): - super(RandomScaleClip, self).__init__(**kwargs) + super(RandomZoomClip, self).__init__(**kwargs) self.min_scale, self.max_scale = scale From 3dccbfb91c0d7deb788165f479d6bb72dcf356a9 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sat, 14 Sep 2019 16:27:41 -0400 Subject: [PATCH 26/55] Add default arg for resume, save predictions in result folder, remove features arg from eval.py --- eval.py | 6 +++--- models/c3d/c3d.py | 6 ++++-- models/c3d/config_test.yaml | 2 ++ models/c3d/config_train.yaml | 2 ++ parse_args.py | 3 ++- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/eval.py b/eval.py index 41ab417..8b1a57f 100644 --- a/eval.py +++ b/eval.py @@ -99,7 +99,7 @@ def eval(**args): x_input = data['data'].to(device) annotations = data['annots'] - outputs = model(x_input, features=True) + outputs = model(x_input) if ret_data is None: ret_data = outputs.cpu().numpy() @@ -112,7 +112,7 @@ def eval(**args): # END IF - #acc = acc_metric.get_accuracy(outputs, annotations) + acc = acc_metric.get_accuracy(outputs, annotations) if step % 100 == 0: print('Step: {}/{} | {} acc: {:.4f}'.format(step, len(eval_loader), args['load_type'], acc)) @@ -124,7 +124,7 @@ def eval(**args): ret_dict['data'] = ret_data ret_dict['labels'] = ret_labels import scipy.io as sio - sio.savemat(args['load_type']+'_'+args['dataset']+'.mat', ret_dict) + sio.savemat(os.path.join(result_dir,args['load_type']+'_'+args['dataset']+'.mat'), ret_dict) writer.add_scalar(args['dataset']+'/'+args['model']+'/'+args['load_type']+'_accuracy', 100.*acc) # Close Tensorboard Element diff --git a/models/c3d/c3d.py b/models/c3d/c3d.py index 387ffd7..dfd78fc 100644 --- a/models/c3d/c3d.py +++ b/models/c3d/c3d.py @@ -55,7 +55,9 @@ def __init__(self, **kwargs): if isinstance(kwargs['pretrained'], int) and kwargs['pretrained']: self.__load_pretrained_weights() - def forward(self, x, labels=False, features=False): + self.features = kwargs['model_features'] + + def forward(self, x, labels=False): x = self.relu(self.conv1(x)) x = self.pool1(x) @@ -78,7 +80,7 @@ def forward(self, x, labels=False, features=False): x = self.relu(self.fc6(x)) - if features: + if self.features: return x x = self.dropout(x) diff --git a/models/c3d/config_test.yaml b/models/c3d/config_test.yaml index cf69c79..1d6d39d 100644 --- a/models/c3d/config_test.yaml +++ b/models/c3d/config_test.yaml @@ -36,3 +36,5 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay + +model_features: 0 # 1 - return model features (before prediction), 0 - return model prediction output diff --git a/models/c3d/config_train.yaml b/models/c3d/config_train.yaml index 969982d..cf6f6e6 100644 --- 
a/models/c3d/config_train.yaml +++ b/models/c3d/config_train.yaml @@ -36,3 +36,5 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay + +model_features: 0 # 1 - return model features (before prediction), 0 - return model prediction output diff --git a/parse_args.py b/parse_args.py index 76b7ad4..7ac4994 100644 --- a/parse_args.py +++ b/parse_args.py @@ -77,7 +77,8 @@ def __init__(self): crop_type = None, num_clips = 1, debug = 0, - seed = 0) + seed = 0, + resume = 0) From 830850107f63356abc1988ce31c13d8c8d5e5ae6 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 26 Sep 2019 16:57:39 -0400 Subject: [PATCH 27/55] Fix MSCOCO loading and gen_json script now works with Python3 --- datasets/MSCOCO.py | 4 ++-- datasets/VOC2007.py | 1 - datasets/scripts/gen_json_mscoco.py | 19 ++++++++++--------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/datasets/MSCOCO.py b/datasets/MSCOCO.py index 5d9868f..5805a47 100644 --- a/datasets/MSCOCO.py +++ b/datasets/MSCOCO.py @@ -1,6 +1,6 @@ import torch from .abstract_datasets import DetectionDataset -from PIL import Image +import cv2 import os import numpy as np import datasets.preprocessing_transforms as pt @@ -62,7 +62,7 @@ def __getitem__(self, idx): iscrowds[frame_ind, trackid] = iscrowd - input_data.append(Image.open(os.path.join(base_path, frame_path))) + input_data.append(cv2.imread(os.path.join(base_path, frame_path))[...,::-1]) vid_data, bbox_data = self.transforms(input_data, bbox_data) diff --git a/datasets/VOC2007.py b/datasets/VOC2007.py index d4764d8..b78c3c8 100644 --- a/datasets/VOC2007.py +++ b/datasets/VOC2007.py @@ -1,6 +1,5 @@ import torch from .abstract_datasets import DetectionDataset -from PIL import Image import cv2 import os import numpy as np diff --git a/datasets/scripts/gen_json_mscoco.py b/datasets/scripts/gen_json_mscoco.py index 1a6409a..c7db705 100755 --- a/datasets/scripts/gen_json_mscoco.py +++ b/datasets/scripts/gen_json_mscoco.py @@ -2,22 +2,25 @@ import os -def save_json(load_type): +year = '2014' +def save_json(load_type): + # Define path to mscoco images data base_img_path = '/path/to/mscoco/images/' ###### REPLACE with path to dataset base_annot_path = '/path/to/mscoco/annotations/'###### REPLACE with path to dataset - f = open(os.path.join(base_annot_path,'instances_'+load_type+'2014.json'),'r') - x = json.load(f) - f.close() + save_location = '/path/to/save/location' ######### REPLACE with save path + + with open(os.path.join(base_annot_path,'instances_'+load_type+year+'.json'),'r') as f: + x = json.load(f) imgids = [[idx['id'], idx['file_name'], idx['width'], idx['height']] for idx in x['images']] dd = {} for idx in imgids: frame_dict = dict(objs=[], img_path=idx[1]) - dd[idx[0]] = dict(frames=[frame_dict], base_path=os.path.join(base_img_path,load_type+'2014'), frame_size=[idx[2],idx[3]]) + dd[idx[0]] = dict(frames=[frame_dict], base_path=os.path.join(base_img_path,load_type+year), frame_size=[idx[2],idx[3]]) print('finished imgids') @@ -36,10 +39,8 @@ def save_json(load_type): if count%1000==0: print(count) - writef = open('mscoco_'+load_type+'.json', 'w') - json.dump(dd.values(), writef) - writef.close() - + with open(os.path.join(save_location,load_type+'.json'), 'w') as f: + json.dump(list(dd.values()), f) save_json('train') From 53421ae01e9a9ffc22c952f1093ecfdc87503ad9 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 26 Sep 2019 23:11:27 -0400 Subject: [PATCH 28/55] 
Add preprocessing test for zoom clip. Add/edit visualizations --- datasets/preprocessing_transforms.py | 120 +++++++++++++++++++++------ 1 file changed, 93 insertions(+), 27 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index b780073..63e1820 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -678,6 +678,10 @@ class RandomZoomClip(PreprocTransform): Random zoom on all frames in a clip. All frames receive same scaling Scale will be bounded by object bounding box (if given). Meaning, object will always be in view + >1: Zoom in + <1: Zoom out + =1: Same size + Args: - scale (Tuple) - min_scale (float): minimum scaling on frame @@ -945,6 +949,7 @@ def __init__(self): self.rand_flip_v = RandomFlipClip(direction='v', p=1.0) self.rand_rot = RandomRotateClip(angles=[90]) self.rand_trans = RandomTranslateClip(translate=(0.5,0.5)) + self.rand_zoom = RandomZoomClip(scale=(1.25,1.25)) self.sub_mean = SubtractMeanClip(clip_mean=np.zeros(1)) self.applypil = ApplyToPIL(transform=torchvision.transforms.ColorJitter, class_kwargs=dict(brightness=1)) self.applypil2 = ApplyToPIL(transform=torchvision.transforms.FiveCrop, class_kwargs=dict(size=(64,64))) @@ -1020,12 +1025,14 @@ def rand_flip_vis(self): x[:, 50] = 5000 x[10, :] = 5000 x[50, :] = 10000 - plt.imshow(x); plt.show() + + plt.subplot(1,3,1); plt.imshow(x); plt.title('Original image') h = self.rand_flip_h([x]) - plt.imshow(h[0]); plt.show() + plt.subplot(1,3,2); plt.imshow(h[0]); plt.title('Flip Horizontal') v = self.rand_flip_v([x]) - plt.imshow(v[0]); plt.show() - + plt.subplot(1,3,3); plt.imshow(v[0]); plt.title('Flip Vertical') + + plt.show() def rand_rot_test(self): inp = np.array([[[.1,.2,.3],[.4,.5,.6],[.7,.8,.9]]]).astype(float) @@ -1051,27 +1058,86 @@ def rand_trans_test(self): def rand_rot_vis(self): import matplotlib.pyplot as plt - self.rand_rot._update_angles([20]) + import matplotlib.patches as patches + self.rand_rot._update_angles([45]) x = np.arange(112*112).reshape(112,112) - #x = np.arange(6*6).reshape(6,6) - #bbox = [51,51,61,61] + bbox = [30,40,50,100] - bbox = [30,40,50,110] - #bbox = [2,2,4,4] - plt1 = x[:] - plt1[bbox[1]:bbox[3], bbox[0]] = 0 - plt1[bbox[1]:bbox[3], bbox[2]-1] = 0 - plt1[bbox[1], bbox[0]:bbox[2]] = 0 - plt1[bbox[3]-1, bbox[0]:bbox[2]] = 0 - plt.imshow(plt1); plt.show() + pts = np.array([[30,40],[30,80]]) + fig = plt.figure() + ax1 = fig.add_subplot(121) + x[bbox[1]:bbox[3], bbox[0]] = 0 + x[bbox[1]:bbox[3], bbox[2]-1] = 0 + x[bbox[1], bbox[0]:bbox[2]] = 0 + x[bbox[3]-1, bbox[0]:bbox[2]] = 0 + + ax1.imshow(x); ax1.set_title('Original image') + rect = patches.Rectangle((bbox[0],bbox[1]), bbox[2]-bbox[0],\ + bbox[3]-bbox[1], linewidth=1, edgecolor='k', facecolor='none') + #ax1.add_patch(rect) + ax1.scatter(pts[:,0], pts[:,1], c='r') + out2 = self.rand_rot([x], np.array([[bbox]])) - plt2 = out2[0][0] - bbox = out2[1][0][0].astype(int) - plt2[bbox[1]:bbox[3], bbox[0]] = 0 - plt2[bbox[1]:bbox[3], bbox[2]] = 0 - plt2[bbox[1], bbox[0]:bbox[2]] = 0 - plt2[bbox[3], bbox[0]:bbox[2]] = 0 - plt.imshow(plt2); plt.show() + x_rot = out2[0][0] + bbox_rot = out2[1][0,0] + + out2 = self.rand_rot([x], np.array([[pts]])) + pts_rot = out2[1][0,0] + + ax2 = fig.add_subplot(122) + rect = patches.Rectangle((bbox_rot[0],bbox_rot[1]), bbox_rot[2]-bbox_rot[0],\ + bbox_rot[3]-bbox_rot[1], linewidth=1, edgecolor='k', facecolor='none') + ax2.add_patch(rect) + ax2.imshow(x_rot); ax2.set_title('Rotation') + 
ax2.scatter(pts_rot[:,0],pts_rot[:,1], c='r') + plt.show() + + def rand_zoom_test(self): + inp = np.array([[[.1,.2,.3],[.4,.5,.6],[.7,.8,.9]]]).astype(float) + exp_out = np.array([[0.225 , 0.303125, 0.384375], + [0.459375, 0.5375 , 0.61875 ], + [0.703125, 0.78125 , 0.8625 ]]).astype(float) + out = self.rand_zoom(inp) + + inp2 = np.arange(6*6, dtype=np.uint8).reshape(6,6) + bbox = [[2,2,4,4]] + exp_bbox = [1.75,1.75,4.25,4.25] + _,out_bbox = self.rand_zoom([inp2], np.array([bbox])) + + assert (False not in np.isclose(out, exp_out)) and (False not in np.isclose(exp_bbox, out_bbox)) + + def rand_zoom_vis(self): + import matplotlib.pyplot as plt + import matplotlib.patches as patches + x = np.arange(112*112, dtype=np.uint8).reshape(112,112) + + bbox = [30,40,50,100] + pts = np.array([[30,40],[30,80]]) + fig = plt.figure() + ax1 = fig.add_subplot(121) + + x[bbox[1]:bbox[3], bbox[0]] = 0 + x[bbox[1]:bbox[3], bbox[2]-1] = 0 + x[bbox[1], bbox[0]:bbox[2]] = 0 + x[bbox[3]-1, bbox[0]:bbox[2]] = 0 + ax1.imshow(x); ax1.set_title('Original image') + ax1.scatter(pts[:,0], pts[:,1], c='r') + + out = self.rand_zoom([x], np.array([[pts]])) + pts_zoom = out[1][0][0] + + out = self.rand_zoom([x], np.array([[bbox]])) + x_zoom = out[0][0] + bbox_zoom = out[1][0][0] + + ax2 = fig.add_subplot(122) + rect = patches.Rectangle((bbox_zoom[0],bbox_zoom[1]), bbox_zoom[2]-bbox_zoom[0],\ + bbox_zoom[3]-bbox_zoom[1], linewidth=1, edgecolor='k', facecolor='none') + ax2.add_patch(rect) + ax2.imshow(x_zoom); ax2.set_title('Zoomed image') + ax2.scatter(pts_zoom[:,0],pts_zoom[:,1], c='r') + + plt.show() def applypil_test(self): inp = np.arange(112*112).reshape(112,112) @@ -1132,6 +1198,7 @@ def run_tests(self): self.rand_flip_test() self.rand_rot_test() self.rand_trans_test() + self.rand_zoom_test() self.applypil_test() self.applytensor_test() self.applycv_test() @@ -1139,12 +1206,11 @@ def run_tests(self): self.to_pil_test() self.to_numpy_test() print("Tests passed") - #self.rand_flip_vis() - #self.rand_rot_vis() - - - + self.rand_flip_vis() + self.rand_rot_vis() + self.rand_zoom_vis() + if __name__=='__main__': test = TestPreproc() test.run_tests() From 1fae0424f36531e6c477fadc8305f981afa44761 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 4 Oct 2019 10:13:32 -0400 Subject: [PATCH 29/55] Add numclips>0, clip stride, and clip_length=-1 options to extract clips. Also update all current datasets to allow for variable length video loading. 
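
As a rough guide to how these options interact, here is a simplified sketch, assuming clip_stride counts the frames between successive clip start indices; the function name sample_clips is illustrative and this is not the exact _extractClips code added below (clip_length=-1, which returns the whole video, and random_offset are handled separately in the patch and omitted here):

import numpy as np

def sample_clips(video, num_clips, clip_length, clip_stride):
    # Frames needed for num_clips clips of clip_length frames, clip_stride apart
    required = (num_clips - 1) * clip_stride + clip_length
    frame_idx = np.arange(len(video))
    if len(video) < required:
        # Loop the frame indices until one full set of clips fits
        reps = int(np.ceil(required / float(len(video))))
        frame_idx = np.tile(frame_idx, reps)
    starts = np.arange(0, len(frame_idx), clip_stride)[:num_clips]
    return [[video[i] for i in frame_idx[s:s + clip_length]] for s in starts]

For example, sample_clips(list(range(20)), num_clips=3, clip_length=8, clip_stride=4) yields clips starting at frames 0, 4, and 8, while a 10-frame video with the same settings loops its frame indices so that three full 8-frame clips can still be formed.
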
--- config_default_example.yaml | 4 +-- datasets/HMDB51.py | 5 +-- datasets/ImageNetVID.py | 10 +++--- datasets/KTH.py | 6 ++-- datasets/MSCOCO.py | 9 ++--- datasets/Manual_Hands.py | 12 ++++--- datasets/UCF101.py | 5 +-- datasets/VOC2007.py | 9 ++--- datasets/abstract_datasets.py | 63 ++++++++++++++++++++++------------- models/c3d/config_test.yaml | 2 +- models/c3d/config_train.yaml | 2 +- models/ssd/config_test.yaml | 2 +- parse_args.py | 8 +++++ 13 files changed, 85 insertions(+), 52 deletions(-) diff --git a/config_default_example.yaml b/config_default_example.yaml index 30b6ee9..a523504 100644 --- a/config_default_example.yaml +++ b/config_default_example.yaml @@ -1,15 +1,13 @@ # Preprocessing clip_length: 16 # Number of frames within a clip clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) -clip_stride: 0 # Frame offset between successive frames +clip_stride: 1 # Frame offset between successive clips, must be >= 1 crop_shape: [112,112] # (Height, Width) of frame crop_type: Random # Type of cropping operation (Random, Central and None) final_shape: [112,112] # (Height, Width) of input to be given to CNN num_clips: -1 # Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips) random_offset: 0 # Boolean switch to generate a clip length sized clip from a video resize_shape: [128,171] # (Height, Width) to resize original data -sample_duration: 16 # Temporal size of video to be provided as input to the model -sample_size: 112 # Height of frame to be provided as input to the model subtract_mean: '' # Subtract mean (R,G,B) from all frames during preprocessing # Experiment Setup diff --git a/datasets/HMDB51.py b/datasets/HMDB51.py index 3d93bc3..6eb83a9 100644 --- a/datasets/HMDB51.py +++ b/datasets/HMDB51.py @@ -40,8 +40,9 @@ def __getitem__(self, idx): base_path = vid_info['base_path'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - labels = np.zeros((self.clip_length))-1 + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + labels = np.zeros((vid_length))-1 input_data = [] for frame_ind in range(len(vid_info['frames'])): diff --git a/datasets/ImageNetVID.py b/datasets/ImageNetVID.py index 1965c8c..ea180dc 100644 --- a/datasets/ImageNetVID.py +++ b/datasets/ImageNetVID.py @@ -42,10 +42,12 @@ def __getitem__(self, idx): vid_size = vid_info['frame_size'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1 - labels = np.zeros((self.clip_length, self.max_objects))-1 - occlusions = np.zeros((self.clip_length, self.max_objects))-1 + + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((vid_length, self.max_objects, 4))-1 + labels = np.zeros((vid_length, self.max_objects))-1 + occlusions = np.zeros((vid_length, self.max_objects))-1 diff --git a/datasets/KTH.py b/datasets/KTH.py index 95995ef..41c14a0 100644 --- a/datasets/KTH.py +++ b/datasets/KTH.py @@ -40,8 +40,10 @@ def __getitem__(self, idx): base_path = vid_info['base_path'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - labels = np.zeros((self.clip_length))-1 + + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, 
self.final_shape[0], self.final_shape[1], 3))-1 + labels = np.zeros((vid_length))-1 input_data = [] for frame_ind in range(len(vid_info['frames'])): diff --git a/datasets/MSCOCO.py b/datasets/MSCOCO.py index 5d9868f..5b80cbf 100644 --- a/datasets/MSCOCO.py +++ b/datasets/MSCOCO.py @@ -34,10 +34,11 @@ def __getitem__(self, idx): vid_size = vid_info['frame_size'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1 - labels = np.zeros((self.clip_length, self.max_objects))-1 - iscrowds = np.zeros((self.clip_length, self.max_objects))-1 + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((vid_length, self.max_objects, 4))-1 + labels = np.zeros((vid_length, self.max_objects))-1 + iscrowds = np.zeros((vid_length, self.max_objects))-1 diff --git a/datasets/Manual_Hands.py b/datasets/Manual_Hands.py index d5c1a00..21916ef 100644 --- a/datasets/Manual_Hands.py +++ b/datasets/Manual_Hands.py @@ -46,11 +46,13 @@ def __getitem__(self, idx): vid_size = vid_info['frame_size'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1 - hand_pts_data = np.zeros((self.clip_length, self.max_objects, 21, 3))-1 - labels = np.zeros((self.clip_length, self.max_objects))-1 - occlusions = np.zeros((self.clip_length, self.max_objects, 22), dtype=np.int32)-1 #21 keypoints + background = 22 points + + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((vid_length, self.max_objects, 4))-1 + hand_pts_data = np.zeros((vid_length, self.max_objects, 21, 3))-1 + labels = np.zeros((vid_length, self.max_objects))-1 + occlusions = np.zeros((vid_length, self.max_objects, 22), dtype=np.int32)-1 #21 keypoints + background = 22 points for frame_ind in range(len(vid_info['frames'])): frame = vid_info['frames'][frame_ind] diff --git a/datasets/UCF101.py b/datasets/UCF101.py index 28ef78d..40c8b58 100644 --- a/datasets/UCF101.py +++ b/datasets/UCF101.py @@ -41,8 +41,9 @@ def __getitem__(self, idx): base_path = vid_info['base_path'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - labels = np.zeros((self.clip_length))-1 + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + labels = np.zeros((vid_length))-1 input_data = [] for frame_ind in range(len(vid_info['frames'])): diff --git a/datasets/VOC2007.py b/datasets/VOC2007.py index d4764d8..7d90145 100644 --- a/datasets/VOC2007.py +++ b/datasets/VOC2007.py @@ -53,10 +53,11 @@ def __getitem__(self, idx): vid_size = vid_info['frame_size'] input_data = [] - vid_data = np.zeros((self.clip_length, self.final_shape[0], self.final_shape[1], 3))-1 - bbox_data = np.zeros((self.clip_length, self.max_objects, 4))-1 - labels = np.zeros((self.clip_length, self.max_objects))-1 - diff_labels = np.zeros((self.clip_length, self.max_objects)) #difficult object labels + vid_length = len(vid_info['frames']) + vid_data = np.zeros((vid_length, self.final_shape[0], self.final_shape[1], 3))-1 + bbox_data = np.zeros((vid_length, self.max_objects, 4))-1 + labels = np.zeros((vid_length, self.max_objects))-1 + diff_labels = np.zeros((vid_length, self.max_objects)) #difficult 
object labels for frame_ind in range(len(vid_info['frames'])): frame = vid_info['frames'][frame_ind] diff --git a/datasets/abstract_datasets.py b/datasets/abstract_datasets.py index 6e7fe2e..c34fc7f 100644 --- a/datasets/abstract_datasets.py +++ b/datasets/abstract_datasets.py @@ -68,17 +68,22 @@ def _extractClips(self, video): self.clip_stride: Number of frames between clips when extracting them from videos self.random_offset: Randomly select a clip_length sized clip from a video """ + if self.clip_offset > 0: + if len(video)-self.clip_offset >= self.clip_length: + video = video[self.clip_offset:] + if self.num_clips < 0: if len(video) >= self.clip_length: + # Uniformly sample one clip from the video final_video = [video[_idx] for _idx in np.linspace(0, len(video)-1, self.clip_length, dtype='int32')] final_video = [final_video] else: # Loop if insufficient elements - indices = np.ceil(self.clip_length/float(len(video))) + indices = np.ceil(self.clip_length/float(len(video))) # Number of times to repeat the video to exceed one clip_length indices = indices.astype('int32') - indices = np.tile(np.arange(0, len(video), 1, dtype='int32'), indices) - indices = indices[np.linspace(0, len(indices)-1, self.clip_length, dtype='int32')] + indices = np.tile(np.arange(0, len(video), 1, dtype='int32'), indices) # Repeat the video indices until it exceeds a clip_length + indices = indices[np.linspace(0, len(indices)-1, self.clip_length, dtype='int32')] # Uniformly sample clip_length frames from the looped video final_video = [video[_idx] for _idx in indices] final_video = [final_video] @@ -87,8 +92,9 @@ def _extractClips(self, video): # END IF elif self.num_clips == 0: + # Divide entire video into the max number of clip_length segments if len(video) >= self.clip_length: - indices = np.arange(start=0, stop=len(video), step=self.clip_length) + indices = np.arange(start=0, stop=len(video)-self.clip_length+1, step=self.clip_stride) final_video = [] for _idx in indices: @@ -110,33 +116,44 @@ def _extractClips(self, video): # END IF else: - if self.random_offset: - if len(video) >= self.clip_length: - indices = np.random.choice(np.arange(len(video) - self.clip_length + 1), 1) - indices = indices.astype('int32') - indices = np.arange(indices, indices + self.clip_length).astype('int32') + # num_clips > 0, select exactly num_clips from a video - final_video = [video[_idx] for _idx in indices] - final_video = [final_video] + if self.clip_length == -1: + # This is a special case where we will return the entire video + # This setting can only be used when the batch size is set to 1 + return [video] - else: - indices = np.ceil(self.clip_length/float(len(video))) - indices = indices.astype('int32') - indices = np.tile(np.arange(0, len(video), 1, dtype='int32'), indices) - index = np.random.choice(np.arange(len(indices) - self.clip_length + 1), 1)[0] - index = index.astype('int32') - indices = indices[index:index + self.clip_length] + required_length = (self.num_clips-1)*(self.clip_stride)+self.clip_length + + + if self.random_offset: + if len(video) >= required_length: + vid_start = np.random.choice(np.arange(len(video) - required_length + 1), 1) + video = video[int(vid_start):] - final_video = [video[_idx] for _idx in indices] - final_video = [final_video] + if len(video) >= required_length: + # Get indices of sequential clips overlapped by a clip_stride number of frames + indices = np.arange(0, len(video), self.clip_stride) - # END IF + # Select only the first num clips + indices = 
indices.astype('int32')[:self.num_clips] + + video = np.array(video) + final_video = [video[np.arange(_idx, _idx+self.clip_length).astype('int32')].tolist() for _idx in indices] else: - final_video = video[:self.clip_length] - final_video = [final_video] + # If the video is too small to get num_clips given the clip_length and clip_stride, loop it until you can + indices = np.ceil(required_length /float(len(video))) + indices = indices.astype('int32') + indices = np.tile(np.arange(0, len(video), 1, dtype='int32'), indices) + + # Starting index of each clip + clip_starts = np.arange(0, len(indices), self.clip_stride).astype('int32')[:self.num_clips] + video = np.array(video) + final_video = [video[indices[_idx:_idx+self.clip_length]].tolist() for _idx in clip_starts] + # END IF # END IF diff --git a/models/c3d/config_test.yaml b/models/c3d/config_test.yaml index 1d6d39d..08f4e1c 100644 --- a/models/c3d/config_test.yaml +++ b/models/c3d/config_test.yaml @@ -1,7 +1,7 @@ # Preprocessing clip_length: 16 # Number of frames within a clip clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) -clip_stride: 0 # Frame offset between successive frames +clip_stride: 1 # Frame offset between successive frames crop_shape: [112,112] # (Height, Width) of frame crop_type: Random # Type of cropping operation (Random, Central and None) final_shape: [112,112] # (Height, Width) of input to be given to CNN diff --git a/models/c3d/config_train.yaml b/models/c3d/config_train.yaml index cf6f6e6..65a0cb2 100644 --- a/models/c3d/config_train.yaml +++ b/models/c3d/config_train.yaml @@ -1,7 +1,7 @@ # Preprocessing clip_length: 16 # Number of frames within a clip clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) -clip_stride: 0 # Frame offset between successive frames +clip_stride: 1 # Frame offset between successive frames crop_shape: [112,112] # (Height, Width) of frame crop_type: Random # Type of cropping operation (Random, Central and None) final_shape: [112,112] # (Height, Width) of input to be given to CNN diff --git a/models/ssd/config_test.yaml b/models/ssd/config_test.yaml index a521b6d..2b630e2 100644 --- a/models/ssd/config_test.yaml +++ b/models/ssd/config_test.yaml @@ -1,7 +1,7 @@ # Preprocessing clip_length: 1 # Number of frames within a clip clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) -clip_stride: 0 # Frame offset between successive frames +clip_stride: 1 # Frame offset between successive frames crop_shape: [112,112] # (Height, Width) of frame crop_type: None # Type of cropping operation (Random, Central and None) final_shape: [300,300] # (Height, Width) of input to be given to CNN diff --git a/parse_args.py b/parse_args.py index 7ac4994..9a2500d 100644 --- a/parse_args.py +++ b/parse_args.py @@ -111,4 +111,12 @@ def get_args(self): if k not in yaml_keys: self.cfg_args[k] = self.defaults[k] + + # Force clip_stride to be >= 1 when extracting clips from a video + # This represents the # of frames between successive clips + if self.cfg_args['clip_stride'] < 1: + self.cfg_args['clip_stride'] = 1 + + + return self.cfg_args From fb7262a2a32e60aa56cae9de27ac28162ebd6e0a Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Fri, 4 Oct 2019 17:18:44 -0400 Subject: [PATCH 30/55] Add note of rotation angle --- datasets/preprocessing_transforms.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 63e1820..78d0f4a 100644 --- 
a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -1059,7 +1059,8 @@ def rand_trans_test(self): def rand_rot_vis(self): import matplotlib.pyplot as plt import matplotlib.patches as patches - self.rand_rot._update_angles([45]) + angle = 45 + self.rand_rot._update_angles([angle]) x = np.arange(112*112).reshape(112,112) bbox = [30,40,50,100] @@ -1088,7 +1089,7 @@ def rand_rot_vis(self): rect = patches.Rectangle((bbox_rot[0],bbox_rot[1]), bbox_rot[2]-bbox_rot[0],\ bbox_rot[3]-bbox_rot[1], linewidth=1, edgecolor='k', facecolor='none') ax2.add_patch(rect) - ax2.imshow(x_rot); ax2.set_title('Rotation') + ax2.imshow(x_rot); ax2.set_title('Rotation: {} degrees'.format(angle)) ax2.scatter(pts_rot[:,0],pts_rot[:,1], c='r') plt.show() From 4dd4ea9c3061adbf13b0f31a4a7c4758d5d4afe0 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sat, 5 Oct 2019 20:31:33 -0400 Subject: [PATCH 31/55] most general change for multiple inputs to model forward functions --- eval.py | 12 ++++++++---- metrics.py | 5 ++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/eval.py b/eval.py index 4ea504b..d575799 100644 --- a/eval.py +++ b/eval.py @@ -91,11 +91,15 @@ def eval(**args): with torch.no_grad(): for step, data in enumerate(eval_loader): x_input = data['data'] - for i, item in enumerate(x_input): - if isinstance(item, torch.Tensor): - x_input[i] = item.to(device) annotations = data['annots'] - outputs = model(*x_input) + + if isinstance(x_input, torch.Tensor): + outputs = model(x_input.to(device)) + else: + for i, item in enumerate(x_input): + if isinstance(item, torch.Tensor): + x_input[i] = item.to(device) + outputs = model(*x_input) acc = acc_metric.get_accuracy(outputs, annotations) diff --git a/metrics.py b/metrics.py index 1f4daf9..0c137c4 100644 --- a/metrics.py +++ b/metrics.py @@ -545,11 +545,14 @@ def get_accuracy(self, predictions, data): predictions: (Tensor, shape [N,W,T,D]), attention weight output from model data: (dictionary) - rpn_original (Tensor, shape [N,T,D,4]) - - box (Tensor, shape [N,T,D,5]), [cls_label, ytl, xtl, ybr, xbr] (note order in coordinates is different) + - box (Tensor, shape [N,O,T,5]), [cls_label, ytl, xtl, ybr, xbr] (note order in coordinates is different) - box_label (Tensor, shape [N,W]) - vis_name (List, shape [N]), unique segment identifier - class_labels_dict (dict, length 67) class index to class label mapping + T: number of frames + D: dimension of features + O: number of objects to ground W: unique word in segment (from YC2BB class dictionary) Return: Box accuracy score From 7a87f4da20087ed6412425a27db8d8bf3e57e7d8 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sun, 6 Oct 2019 21:15:23 -0400 Subject: [PATCH 32/55] Added assertion for batch size of 1 and clip length -1 --- datasets/abstract_datasets.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/datasets/abstract_datasets.py b/datasets/abstract_datasets.py index c34fc7f..a159b25 100644 --- a/datasets/abstract_datasets.py +++ b/datasets/abstract_datasets.py @@ -40,6 +40,9 @@ def __init__(self, *args, **kwargs): self.crop_type = kwargs['crop_type'] self.final_shape = kwargs['final_shape'] + #Experiment arguments + self.batch_size = kwargs['batch_size'] + # Creates the self.samples list which will be indexed by each __getitem__ call self._getClips() @@ -120,7 +123,10 @@ def _extractClips(self, video): if self.clip_length == -1: # This is a special case where we will return the entire video # This setting can only be used when the batch size is
set to 1 + + # Batch size must equal one or dataloader items may have varying lengths + # and can't be stacked i.e. throws an error + assert(self.batch_size == 1) return [video] From 3b25314ba7343bfe44636e13f1ad26f44fd0c7dc Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Sun, 6 Oct 2019 21:45:02 -0400 Subject: [PATCH 33/55] Computes accuracy on validation, needs to write to JSON file for testing --- eval.py | 14 -------------- metrics.py | 6 ------ 2 files changed, 20 deletions(-) diff --git a/eval.py b/eval.py index ed9e8bb..0df0b8b 100644 --- a/eval.py +++ b/eval.py @@ -107,14 +107,6 @@ def eval(**args): x_input[i] = item.to(device) outputs = model(*x_input) - if ret_data is None: - ret_data = outputs.cpu().numpy() - ret_labels = annotations['labels'].cpu().numpy()[:, 0] - - else: - ret_data = np.vstack((ret_data, outputs.cpu().numpy())) - ret_labels = np.hstack((ret_labels, annotations['labels'].cpu().numpy()[:, 0])) - # END IF @@ -126,12 +118,6 @@ def eval(**args): print('Accuracy of the network on the {} set: {:.3f} %\n'.format(args['load_type'], 100.*acc)) if not args['debug']: - ret_dict = {} - ret_dict['data'] = ret_data - ret_dict['labels'] = ret_labels - import scipy.io as sio - sio.savemat(os.path.join(result_dir,args['load_type']+'_'+args['dataset']+'.mat'), ret_dict) - writer.add_scalar(args['dataset']+'/'+args['model']+'/'+args['load_type']+'_accuracy', 100.*acc) # Close Tensorboard Element writer.close() diff --git a/metrics.py b/metrics.py index 0c137c4..3b56bd9 100644 --- a/metrics.py +++ b/metrics.py @@ -637,11 +637,6 @@ def get_accuracy(self, predictions, data): for (i,h,m) in results: self.ba_score[i].append((h,m)) - - self.count += N - if self.count < self.ndata: - return -1 - if self.test_mode: #Annotations for the testing split are not publicly available return -1 @@ -657,5 +652,4 @@ def get_accuracy(self, predictions, data): #print('BA for {}(...): {:.4f}'.format(k, cur_hit/(cur_hit+cur_miss))) ba_final.append(cur_hit/(cur_hit+cur_miss)) - #print('The overall BA is: {:.4f}'.format(np.mean(ba_final))) return np.mean(ba_final) From ba2edb33566a62bebe91d225ec83520cf5cf46dd Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Tue, 8 Oct 2019 17:42:25 -0400 Subject: [PATCH 34/55] Generate JSON submissions file for test split to submit to eval server --- datasets/YC2BB.py | 6 +++--- metrics.py | 52 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 47 insertions(+), 11 deletions(-) diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py index 0f511f8..0123a0e 100644 --- a/datasets/YC2BB.py +++ b/datasets/YC2BB.py @@ -26,7 +26,7 @@ def __init__(self, *args, **kwargs): self.load_type = kwargs['load_type'] - self.max_objects = 15 + self.max_objects = 20 self.class_dict = _get_class_labels(class_file) ''' if self.load_type=='train': @@ -124,10 +124,10 @@ def __getitem__(self, idx): trackid = obj['trackid'] if self.load_type == 'test' or self.load_type == 'train': #Annotations for test set not publicly available, train not annotated - bbox_data[trackid, frame_ind] = -1*np.ones(5) + bbox_data[trackid, frame_ind] = [label, -1, -1, -1, -1] else: if obj['occ'] or obj['outside']: - bbox_data[trackid, frame_ind] = -1*np.ones(5) + bbox_data[trackid, frame_ind] = [label, -1, -1, -1, -1] else: obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] diff --git a/metrics.py b/metrics.py index 3b56bd9..d051bd7 100644 --- a/metrics.py +++ b/metrics.py @@ -1,6 +1,9 @@ -import torch +import os +import json import numpy as np +import torch + class Metrics(object): def __init__(self, *args, 
**kwargs): """ @@ -530,14 +533,26 @@ class Box_Accuracy(): def __init__(self, *args, **kwargs): from collections import defaultdict - self.thresh = kwargs['accu_thresh'] - self.fps = kwargs['fps'] - self.test_mode = 1 if kwargs['load_type'] == 'test' else 0 - self.IOU = IOU() - self.ba_score = defaultdict(list) #box accuracy metric + self.result_dir = os.path.join(kwargs['result_dir'], 'submission_yc2_bb.json') + self.thresh = kwargs['accu_thresh'] + self.fps = kwargs['fps'] + self.debug = kwargs['debug'] + self.test_mode = 1 if kwargs['load_type'] == 'test' else 0 + self.IOU = IOU() + self.ba_score = defaultdict(list) #box accuracy metric + + if self.test_mode: + print('*'*62) + print('* [WARNING] Eval unavailable for the test set! *\ + \n* Results will be saved to: '+self.result_dir+' *\ + \n* Please submit your results to the eval server! *') + print('*'*62) self.ndata = kwargs['ndata'] self.count = 0 + + self.json_data = {} + self.database = {} def get_accuracy(self, predictions, data): """ @@ -560,6 +575,7 @@ def get_accuracy(self, predictions, data): attn_weights = predictions N = attn_weights.shape[0] + self.count += N rpn_batch = data['rpn_original'] box_batch = data['box'] @@ -636,8 +652,28 @@ def get_accuracy(self, predictions, data): for (i,h,m) in results: self.ba_score[i].append((h,m)) - - if self.test_mode: #Annotations for the testing split are not publicly available + + #Annotations for the testing split are not publicly available + if self.test_mode: + split, rec, video_name, segment = vis_name[0].split('_-_') + + if video_name not in self.database: + self.database[video_name] = {} + self.database[video_name]['recipe_type'] = rec + if 'segments' not in self.database[video_name]: + self.database[video_name]['segments'] = {} + + self.database[video_name]['segments'][int(segment)] = segment_dict + + #Predictions will be saved to JSON file (if not in debug mode) + if self.count >= self.ndata and not self.debug: + self.json_data['database'] = self.database + + with open(self.result_dir, 'w') as f: + json.dump(self.json_data, f) + + print('Saved submission file to: {}'.format(self.result_dir)) + return -1 ba_final = [] From b89da503804463e337fb59ab67aa4142be83c960 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 9 Oct 2019 11:17:16 -0400 Subject: [PATCH 35/55] Add training for YC2BB model, need to clean-up and add comments --- datasets/YC2BB.py | 144 ++++++++++++++++++++++++++++++++------------ losses.py | 56 ++++++++++++++++- models/dvsa/dvsa.py | 9 ++- train.py | 30 ++++++--- 4 files changed, 186 insertions(+), 53 deletions(-) diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py index 0123a0e..d15d5d4 100644 --- a/datasets/YC2BB.py +++ b/datasets/YC2BB.py @@ -23,17 +23,13 @@ def __init__(self, *args, **kwargs): num_proposals = kwargs['yc2bb_num_proposals'] rpn_proposal_root = kwargs['yc2bb_rpn_proposal_root'] roi_pooled_feat_root = kwargs['yc2bb_roi_pooled_feat_root'] + self.num_frm = kwargs['yc2bb_num_frm'] self.load_type = kwargs['load_type'] self.max_objects = 20 + self.num_class = kwargs['labels'] self.class_dict = _get_class_labels(class_file) - ''' - if self.load_type=='train': - self.transforms = kwargs['model_obj'].train_transforms - else: - self.transforms = kwargs['model_obj'].test_transforms - ''' sentences_proc, segments_tuple = _get_segments_and_sentences(self.samples, self.load_type) @@ -64,35 +60,38 @@ def __init__(self, *args, **kwargs): self.num_proposals = num_proposals self.roi_pooled_feat_root = roi_pooled_feat_root - ''' - with open(self.gt_box_file, 
'r') as f: - self.data_all = json.load(f) - - # read gt bounding boxes O x T/25 x (id, ytl, xtl, ybr, xbr) - # coordinates are 0-indexed - for i, t in enumerate(segments_tuple): - vid = t[2] - seg = str(int(t[3])) - - # if video has no annotations, continue - if not vid in self.data_all['database']: - continue - - # check if ground truth bounding box exists for segment - if seg in self.data_all['database'][vid]['segments'].keys(): - s = sentences_proc[i] - inc_flag = 0 + #Extract all dictionary words from each input sentence + #Only for the training set b/c it's un-annotated + self.sample_obj_labels = [] + idx_to_remove = [] + if self.load_type == 'train': + total_seg = len(self.samples) + for idx, sample in enumerate(self.samples): + sentence = sample['frames'][0]['sentence'].split(' ') obj_label = [] - for w in s: - if self.class_dict.get(w, -1) >= 0: - obj_label.append(self.class_dict[w]) + inc_flag = 0 + for w in sentence: + if self.class_dict.get(w,-1) >= 0: + obj_label.append(self.class_dict[w]) inc_flag = 1 if inc_flag: - self.sample_lst.append((t, obj_label)) + self.sample_obj_labels.append(obj_label) + else: + idx_to_remove.append(idx) + + #Remove segments without object from dictionay + self.samples[:] = [s for idx,s in enumerate(self.samples) if idx not in idx_to_remove] + + assert(len(self.samples) == len(self.sample_obj_labels)) + + print('{}/{} valid segments in {} split'.format(len(self.samples), total_seg, self.load_type)) - print('# of segments for {}: {}, percentage in the raw data: {:.2f}'.format( - split_lst, len(self.sample_lst), len(self.sample_lst)/len(sentences_proc))) + ''' + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + else: + self.transforms = kwargs['model_obj'].test_transforms ''' #Reverse-mapping between class index to canonical label name @@ -192,15 +191,84 @@ def __getitem__(self, idx): vis_name = '_-_'.join((self.yc2_split, rec, vid, seg)) ret_dict = dict() - ret_dict['data'] = [x_rpn, obj_label, self.load_type] - annot_dict = dict() - annot_dict['box'] = bbox_data - annot_dict['box_label'] = obj_label - annot_dict['rpn'] = rpn - annot_dict['rpn_original'] = rpn_original - annot_dict['vis_name'] = vis_name - annot_dict['class_labels_dict'] = self._get_class_labels_reverse() + + if self.load_type == 'train': #Training input data is generated differently + # randomly sample 5 frames from 5 uniform intervals + T = x_rpn.size(1) + itv = T*1./self.num_frm + ind = [min(T-1, int((i+np.random.rand())*itv)) for i in range(self.num_frm)] + x_rpn = x_rpn[:, ind, :] + + obj_label = self.sample_obj_labels[idx] + + #Generate positive example + obj_tensor = torch.tensor(obj_label, dtype=torch.long) + obj_tensor = torch.cat((obj_tensor, torch.LongTensor(self.max_objects - len(obj_label)).fill_(self.num_class))) #padding + pos_sample = [x_rpn, obj_tensor] + + #Sample negative example + total_s = len(self.samples) + neg_index = np.random.randint(total_s) + #Shouldn't include any overlapping object + while len(set(obj_label).intersection(set(self.sample_obj_labels[neg_index]))) != 0: + neg_index = np.random.randint(total_s) + + vid_info = self.samples[neg_index] + + base_path = vid_info['base_path'] + width, height = vid_info['frame_size'] + num_frames_1fps = len(vid_info['frames']) + rec = base_path.split('/')[-3] + vid = base_path.split('/')[-2] + seg = base_path.split('/')[-1] + + # rpn object propoals + rpn = [] + x_rpn = [] + frm=1 + + feat_name = vid+'_'+seg+'.pth' + img_name = vid+'_'+seg+'_'+str(frm).zfill(4)+'.jpg' + x_rpn = 
torch.load(os.path.join(self.roi_pooled_feat_root, self.yc2_split, feat_name)) + while self.rpn_dict.get(img_name, -1) > -1: + ind = self.rpn_dict[img_name] + rpn.append(self.rpn_chunk[ind]) + frm+=1 + img_name = vid+'_'+seg+'_'+str(frm).zfill(4)+'.jpg' + + rpn = torch.stack(rpn) # number of frames x number of proposals per frame x 4 + rpn = rpn[:, :self.num_proposals, :] + + x_rpn = x_rpn.permute(2,0,1).contiguous() # encoding size x number of frames x number of proposals + x_rpn = x_rpn[:, :, :self.num_proposals] + + # randomly sample 5 frames from 5 uniform intervals + T = x_rpn.size(1) + itv = T*1./self.num_frm + ind = [min(T-1, int((i+np.random.rand())*itv)) for i in range(self.num_frm)] + x_rpn = x_rpn[:, ind, :] + + #Generate negative example + neg_obj_label = self.sample_obj_labels[neg_index] + obj_tensor = torch.tensor(neg_obj_label, dtype=torch.long) + obj_tensor = torch.cat((obj_tensor, torch.LongTensor(self.max_objects - len(neg_obj_label)).fill_(self.num_class))) #padding + neg_sample = [x_rpn, obj_tensor] + + output = [torch.stack(i) for i in zip(pos_sample, neg_sample)] + output.append(self.load_type) + ret_dict['data'] = output + + else: #Validation or Testing set + ret_dict['data'] = [x_rpn, obj_label, self.load_type] + + annot_dict['box'] = bbox_data + annot_dict['box_label'] = obj_label + annot_dict['rpn'] = rpn + annot_dict['rpn_original'] = rpn_original + annot_dict['vis_name'] = vis_name + annot_dict['class_labels_dict'] = self._get_class_labels_reverse() + ret_dict['annots'] = annot_dict return ret_dict diff --git a/losses.py b/losses.py index c11d14d..2b2a3cb 100644 --- a/losses.py +++ b/losses.py @@ -1,10 +1,12 @@ -import torch -import torch.nn as nn import numpy as np from scipy import ndimage import os import cv2 +import torch +import torch.nn as nn +import torch.nn.functional as F + class Losses(object): def __init__(self, *args, **kwargs): #loss_type, size_average=None, reduce=None, reduction='mean', *args, **kwargs): @@ -27,6 +29,9 @@ def __init__(self, *args, **kwargs): #loss_type, size_average=None, reduce=None, elif self.loss_type == 'M_XENTROPY': self.loss_object = M_XENTROPY(*args, **kwargs) + elif self.loss_type == 'YC2BB_Attention_Loss': + self.loss_object = YC2BB_Attention_Loss(*args, **kwargs) + else: print('Invalid loss type selected. Quitting!') exit(1) @@ -107,3 +112,50 @@ def loss(self, predictions, data): one_hot = torch.Tensor(one_hot).cuda() return torch.mean(torch.sum(-one_hot * self.logsoftmax(predictions), dim=1)) + +class YC2BB_Attention_Loss(object): + def __init__(self, *args, **kwargs): + """ + Frame-wise attention loss used in: + + Weakly-supervised, no groundtruth labels are used. + """ + + self.loss_weighting = kwargs['has_loss_weighting'] + self.obj_interact = kwargs['obj_interact'] + self.ranking_margin = kwargs['ranking_margin'] + self.loss_factor = kwargs['loss_factor'] + + def loss(self, predictions, data): + """ + Args: + predictions (List): + - output (): + - loss weighting(): + data (NoneType) + + Return: + Frame-wise weighting loss + """ + output, loss_weigh = predictions + + if self.loss_weighting or self.obj_interact: + rank_batch = F.margin_ranking_loss(output[:,0:1], output[:,1:2], + torch.ones(output.size()).type(output.data.type()), margin=self.ranking_margin, reduction='none') + if self.loss_weighting and self.obj_interact: + loss_weigh = (output[:, 0:1]+loss_weigh)/2. 
# avg + elif self.loss_weighting: + loss_weigh = output[:,0:1] + else: + loss_weigh = loss_weigh.unsqueeze(1) + # ranking loss + cls_loss = self.loss_factor*(rank_batch*loss_weigh).mean()+ \ + (1-self.loss_factor)*-torch.log(2*loss_weigh).mean() + else: + # ranking loss + cls_loss = F.margin_ranking_loss(output[:,0:1], output[:,1:2], + torch.Tensor([[1],[1]]).type(output.data.type()), margin=self.ranking_margin) + + + return cls_loss + diff --git a/models/dvsa/dvsa.py b/models/dvsa/dvsa.py index 0543036..2092345 100644 --- a/models/dvsa/dvsa.py +++ b/models/dvsa/dvsa.py @@ -1,7 +1,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch.autograd import Variable import math import numpy as np from functools import partial @@ -14,7 +13,7 @@ class DVSA(nn.Module): # def __init__(self, num_class, input_size=2048, enc_size=128, dropout=0.2, hidden_size=256, n_layers=1, n_heads=4, attn_drop=0.2, num_frm=5, has_loss_weighting=False): def __init__(self, **kwargs): super().__init__() - num_class = kwargs['num_class'] + num_class = kwargs['labels'] input_size = kwargs['input_size'] enc_size = kwargs['enc_size'] dropout = kwargs['dropout'] @@ -22,7 +21,7 @@ def __init__(self, **kwargs): n_layers = kwargs['n_layers'] n_heads = kwargs['n_heads'] attn_drop = kwargs['attn_drop'] - num_frm = kwargs['num_frm'] + num_frm = kwargs['yc2bb_num_frm'] has_loss_weighting = kwargs['has_loss_weighting'] # encode the region feature @@ -62,6 +61,10 @@ def forward(self, x_o, obj, load_type): if is_evaluate: return self.output_attn(x_o, obj) + #only a single batch expected + x_o = x_o[0] + obj = obj[0] + x_o = self.feat_enc(x_o.permute(0,2,3,1).contiguous()).permute(0,3,1,2).contiguous() x_o = torch.stack([x_o[0], x_o[1], x_o[0]]) diff --git a/train.py b/train.py index 7f66ef4..b68440e 100644 --- a/train.py +++ b/train.py @@ -126,7 +126,7 @@ def train(**args): # END IF model_loss = Losses(device=device, **args) - acc_metric = Metrics(**args) + acc_metric = Metrics(**args, result_dir=result_dir, ndata=len(valid_loader.dataset)) best_val_acc = 0.0 ############################################################################################################################################################################ @@ -148,17 +148,27 @@ def train(**args): # END IF - x_input = data['data'].to(device) - annotations = data['annots'] + x_input = data['data'] + annotations = data['annots'] + + if isinstance(x_input, torch.Tensor): + mini_batch_size = x_input.shape[0] + outputs = model(x_input.to(device)) + + assert args['final_shape']==list(x_input.size()[-2:]), "Input to model does not match final_shape argument" + else: #Model takes several inputs in forward function + mini_batch_size = x_input[0].shape[0] #Assuming the first element contains the true data input + for i, item in enumerate(x_input): + if isinstance(item, torch.Tensor): + x_input[i] = item.to(device) + outputs = model(*x_input) - assert args['final_shape']==list(x_input.size()[-2:]), "Input to model does not match final_shape argument" - outputs = model(x_input) loss = model_loss.loss(outputs, annotations) - loss = loss * outputs.shape[0] + loss = loss * mini_batch_size loss.backward() running_loss += loss.item() - running_batch += outputs.shape[0] + running_batch += mini_batch_size if np.isnan(running_loss): import pdb; pdb.set_trace() @@ -173,12 +183,12 @@ def train(**args): # END FOR # Add Loss Element - writer.add_scalar(args['dataset']+'/'+args['model']+'/minibatch_loss', loss.item()/outputs.shape[0], epoch*len(train_loader) 
+ step) + writer.add_scalar(args['dataset']+'/'+args['model']+'/minibatch_loss', loss.item()/mini_batch_size, epoch*len(train_loader) + step) # END IF - if ((epoch*len(train_loader) + step+1) % 100 == 0): - print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.format(epoch, args['epoch'], step+1, len(train_loader), running_loss/float(step+1)/outputs.shape[0])) + if ((epoch*len(train_loader) + step+1) % 1 == 0): + print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.format(epoch, args['epoch'], step+1, len(train_loader), running_loss/float(step+1)/mini_batch_size)) # END IF From 2dc132f2f4f5eade2086a0128cf21b67e8be5fca Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 9 Oct 2019 16:04:13 -0400 Subject: [PATCH 36/55] Clean up parts of code, added comments and code sources --- datasets/YC2BB.py | 52 +++++++++++++++++++++++---------------------- losses.py | 11 ++++++---- models/dvsa/dvsa.py | 15 +++++++++---- train.py | 18 ++++++++++++---- 4 files changed, 59 insertions(+), 37 deletions(-) diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py index d15d5d4..0840a06 100644 --- a/datasets/YC2BB.py +++ b/datasets/YC2BB.py @@ -11,8 +11,12 @@ class YC2BB(DetectionDataset): ''' - YouCook2-Bounding Boxes dataset. Used in weakly-supervised video object grounding task + YouCook2-Bounding Boxes dataset. Introduced in weakly-supervised video object grounding task Paper: https://arxiv.org/pdf/1805.02834.pdf + + training: no bounding box annotations, only sentence describing sentence + validation: bounding box annotations and grounded words available + testing: bounding box annotations not publicly available, only grounded words ''' def __init__(self, *args, **kwargs): super(YC2BB, self).__init__(*args, **kwargs) @@ -97,6 +101,23 @@ def __init__(self, *args, **kwargs): #Reverse-mapping between class index to canonical label name def _get_class_labels_reverse(self): return {v:k for k,v in self.class_dict.items()} + + #For the training set, extract positive and negative samples + def sample_rpn_regions(self, x_rpn, idx): + # randomly sample 5 frames from 5 uniform intervals + T = x_rpn.size(1) + itv = T*1./self.num_frm + ind = [min(T-1, int((i+np.random.rand())*itv)) for i in range(self.num_frm)] + x_rpn = x_rpn[:, ind, :] + + obj_label = self.sample_obj_labels[idx] + + #Generate example + obj_tensor = torch.tensor(obj_label, dtype=torch.long) + obj_tensor = torch.cat((obj_tensor, torch.LongTensor(self.max_objects - len(obj_label)).fill_(self.num_class))) #padding + sample = [x_rpn, obj_tensor] + + return sample def __getitem__(self, idx): vid_info = self.samples[idx] @@ -194,23 +215,13 @@ def __getitem__(self, idx): annot_dict = dict() if self.load_type == 'train': #Training input data is generated differently - # randomly sample 5 frames from 5 uniform intervals - T = x_rpn.size(1) - itv = T*1./self.num_frm - ind = [min(T-1, int((i+np.random.rand())*itv)) for i in range(self.num_frm)] - x_rpn = x_rpn[:, ind, :] + #Generate postive example + pos_sample = self.sample_rpn_regions(x_rpn, idx) - obj_label = self.sample_obj_labels[idx] - - #Generate positive example - obj_tensor = torch.tensor(obj_label, dtype=torch.long) - obj_tensor = torch.cat((obj_tensor, torch.LongTensor(self.max_objects - len(obj_label)).fill_(self.num_class))) #padding - pos_sample = [x_rpn, obj_tensor] - - #Sample negative example + #Sample negative index total_s = len(self.samples) neg_index = np.random.randint(total_s) - #Shouldn't include any overlapping object + #Shouldn't include any overlapping object in description while 
len(set(obj_label).intersection(set(self.sample_obj_labels[neg_index]))) != 0: neg_index = np.random.randint(total_s) @@ -243,17 +254,8 @@ def __getitem__(self, idx): x_rpn = x_rpn.permute(2,0,1).contiguous() # encoding size x number of frames x number of proposals x_rpn = x_rpn[:, :, :self.num_proposals] - # randomly sample 5 frames from 5 uniform intervals - T = x_rpn.size(1) - itv = T*1./self.num_frm - ind = [min(T-1, int((i+np.random.rand())*itv)) for i in range(self.num_frm)] - x_rpn = x_rpn[:, ind, :] - #Generate negative example - neg_obj_label = self.sample_obj_labels[neg_index] - obj_tensor = torch.tensor(neg_obj_label, dtype=torch.long) - obj_tensor = torch.cat((obj_tensor, torch.LongTensor(self.max_objects - len(neg_obj_label)).fill_(self.num_class))) #padding - neg_sample = [x_rpn, obj_tensor] + neg_sample = self.sample_rpn_regions(x_rpn, neg_index) output = [torch.stack(i) for i in zip(pos_sample, neg_sample)] output.append(self.load_type) diff --git a/losses.py b/losses.py index 2b2a3cb..ad3710e 100644 --- a/losses.py +++ b/losses.py @@ -113,10 +113,12 @@ def loss(self, predictions, data): return torch.mean(torch.sum(-one_hot * self.logsoftmax(predictions), dim=1)) +#Code source: https://github.com/MichiganCOG/Video-Grounding-from-Text/blob/master/train.py class YC2BB_Attention_Loss(object): def __init__(self, *args, **kwargs): """ - Frame-wise attention loss used in: + Frame-wise attention loss used in Weakly-Supervised Object Video Grounding... + https://arxiv.org/pdf/1805.02834.pdf Weakly-supervised, no groundtruth labels are used. """ @@ -130,10 +132,11 @@ def loss(self, predictions, data): """ Args: predictions (List): - - output (): - - loss weighting(): - data (NoneType) + - output (Tensor, shape [2*T, 2]): Positive and negative attention weights for each sample + - loss_weigh (Tensor, shape [2*T, 1]): Loss weighting applied to each sampled frame + data (None) + T: number of sampled frames from video (default: 5) Return: Frame-wise weighting loss """ diff --git a/models/dvsa/dvsa.py b/models/dvsa/dvsa.py index 2092345..702e020 100644 --- a/models/dvsa/dvsa.py +++ b/models/dvsa/dvsa.py @@ -1,3 +1,5 @@ +#Code heavily adapted from: https://github.com/MichiganCOG/Video-Grounding-from-Text/blob/master/model/dvsa.py + import torch import torch.nn as nn import torch.nn.functional as F @@ -9,8 +11,14 @@ from models.dvsa.dvsa_utils.transformer import Transformer class DVSA(nn.Module): - -# def __init__(self, num_class, input_size=2048, enc_size=128, dropout=0.2, hidden_size=256, n_layers=1, n_heads=4, attn_drop=0.2, num_frm=5, has_loss_weighting=False): + """ + Deep Visual-Semantic Alignments (DVSA). + Implementation used as baseline in Weakly-Supervised Video Object Grounding... 
+ Source: https://arxiv.org/pdf/1805.02834.pdf + + Original paper: Deep visual-semantic alignments for generating image descriptions + https://cs.stanford.edu/people/karpathy/cvpr2015.pdf + """ def __init__(self, **kwargs): super().__init__() num_class = kwargs['labels'] @@ -125,7 +133,7 @@ def output_attn(self, x_o, obj): x_o = self.feat_enc(x_o.permute(0,2,3,1).contiguous()).permute(0,3,1,2).contiguous() N, C_out, T, num_proposals = x_o.size() - assert(N == 1) # two pos samples and one neg sample + assert(N == 1) # attention O = obj.size(1) @@ -147,4 +155,3 @@ def _load_pretrained_weights(self): state_dict = torch.load('weights/yc2bb_full-model.pth', map_location=lambda storage, location: storage) self.load_state_dict(state_dict) - diff --git a/train.py b/train.py index b68440e..8f44f57 100644 --- a/train.py +++ b/train.py @@ -187,7 +187,7 @@ def train(**args): # END IF - if ((epoch*len(train_loader) + step+1) % 1 == 0): + if ((epoch*len(train_loader) + step+1) % 100 == 0): print('Epoch: {}/{}, step: {}/{} | train loss: {:.4f}'.format(epoch, args['epoch'], step+1, len(train_loader), running_loss/float(step+1)/mini_batch_size)) # END IF @@ -211,7 +211,7 @@ def train(**args): # END FOR: Epoch - + scheduler.step(epoch=epoch) print('Schedulers lr: %f', scheduler.get_lr()[0]) @@ -257,11 +257,21 @@ def valid(valid_loader, running_acc, model, device, acc_metric): with torch.no_grad(): for step, data in enumerate(valid_loader): - x_input = data['data'].to(device) + x_input = data['data'] annotations = data['annots'] - outputs = model(x_input) + + if isinstance(x_input, torch.Tensor): + outputs = model(x_input.to(device)) + else: + for i, item in enumerate(x_input): + if isinstance(item, torch.Tensor): + x_input[i] = item.to(device) + outputs = model(*x_input) running_acc.append(acc_metric.get_accuracy(outputs, annotations)) + + if step % 100 == 0: + print('Step: {}/{} | validation acc: {:.4f}'.format(step, len(valid_loader), running_acc[-1])) # END FOR: Validation Accuracy From b34b0eec63489428907bf2681212cc5187145434 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 9 Oct 2019 16:23:22 -0400 Subject: [PATCH 37/55] Added download links for weight and data files --- models/dvsa/config_test.yaml | 68 ++++++++++++++++++++++++++++++++++++ weights/download_weights.sh | 6 ++-- 2 files changed, 71 insertions(+), 3 deletions(-) create mode 100644 models/dvsa/config_test.yaml diff --git a/models/dvsa/config_test.yaml b/models/dvsa/config_test.yaml new file mode 100644 index 0000000..49ad0f6 --- /dev/null +++ b/models/dvsa/config_test.yaml @@ -0,0 +1,68 @@ +# Preprocessing +clip_length: -1 # Number of frames within a clip +clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) +clip_stride: 1 # Frame offset between successive frames +crop_shape: [112,112] # (Height, Width) of frame +crop_type: Random # Type of cropping operation (Random, Central and None) +final_shape: [112,112] # (Height, Width) of input to be given to CNN +num_clips: 1 # Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips) +random_offset: 0 # Boolean switch to generate a clip length sized clip from a video +resize_shape: [128,171] # (Height, Width) to resize original data +sample_duration: 16 # Temporal size of video to be provided as input to the model +sample_size: 112 # Height of frame to be provided as input to the model +subtract_mean: '' # Subtract mean (R,G,B) from all frames during preprocessing + +# Experiment Setup 
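+# Note (added for clarity): clip_length is set to -1 above, so the loader returns each full video;
+# batch_size below must therefore stay at 1, as enforced by the assertion added to
+# datasets/abstract_datasets.py in PATCH 32/55.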
+acc_metric: Box_Accuracy # Accuracy metric +batch_size: 1 # Numbers of videos in a mini-batch +dataset: YC2BB # Name of dataset +debug: 1 # If True, do not plot, save, or create data files +epoch: 30 # Total number of epochs +exp: exp # Experiment name +gamma: 0.5 # Multiplier with which to change learning rate +grad_max_norm: 1 # Norm for gradient clipping +json_path: /path/to/yc2bb # Path to the json file for the given dataset +labels: 67 # Number of total classes in the dataset +load_type: test # Environment selection, to include only training/training and validation/testing dataset +loss_type: YC2BB_Attention_Loss # Loss function +lr: 0.05 # Learning rate +milestones: [10, 20] # Epoch values to change learning rate +model: DVSA # Name of model to be loaded +momentum: 0.9 # Momentum value in optimizer +num_workers: 2 # Number of CPU worker used to load data +opt: sgd # Name of optimizer +preprocess: default # String argument to select preprocessing type +pretrained: 1 # Load pretrained network +pseudo_batch_loop: 1 # Pseudo-batch size multiplier to mimic large minibatches +rerun: 1 # Number of trials to repeat an experiment +save_dir: './results' # Path to results directory +seed: 999 # Seed for reproducibility +weight_decay: 0.0005 # Weight decay + +# Dataset specific config +yc2bb_class_file: '/path/to/yc2bb/data/class_file.csv' #https://github.com/MichiganCOG/Video-Grounding-from-Text/blob/master/data/class_file.csv +yc2bb_num_frm: 5 +yc2bb_num_proposals: 20 +yc2bb_roi_pooled_feat_root: '/path/to/yc2bb/data/yc2/roi_pooled_feat' #roi_pooled feat download links below +#train: http://youcook2.eecs.umich.edu/static/dat/yc2_bb/roi_pooled_feat_train.tar.gz (113 GB) +#val: http://youcook2.eecs.umich.edu/static/dat/yc2_bb/roi_pooled_feat_val.tar.gz (38 GB) +#test: http://youcook2.eecs.umich.edu/static/dat/yc2_bb/roi_pooled_feat_test.tar.gz (17 GB) +yc2bb_rpn_proposal_root: '/path/to/yc2bb/data/yc2/roi_box' #http://youcook2.eecs.umich.edu/static/dat/yc2_bb/all-box-100.tar.gz + +# Model specific config +attn_drop: 0.2 +dropout: 0.2 +enc_size: 128 +has_loss_weighting: 1 +hidden_size: 256 +input_size: 2048 +loss_factor: 0.9 +n_heads: 4 +n_layers: 1 +#num_class: 67 #NOTE:redundant with labels +obj_interact: 1 +ranking_margin: 0.1 + +# Box accuracy config +accu_thresh: 0.5 +fps: 1 diff --git a/weights/download_weights.sh b/weights/download_weights.sh index 3cbfec6..f5cc2d9 100755 --- a/weights/download_weights.sh +++ b/weights/download_weights.sh @@ -2,9 +2,6 @@ #wget -O [saved_file_name] [direct_download_link] -#GoTurn -wget -O ./weights/goturn.pth.tar https://umich.box.com/shared/static/src6rfm4lpn0v3t4l26d6u0v4ixdwem5.tar - #SSD wget -O ./weights/ssd300_mAP_77.43_v2.pkl https://umich.box.com/shared/static/jszcnnwcvscfyqe3o81xy8qzfbsc20vo.pkl @@ -13,3 +10,6 @@ wget -O ./weights/c3d-pretrained.pth https://umich.box.com/shared/static/znmyt8u #C3D Mean wget -O ./weights/sport1m_train16_128_mean.npy https://umich.box.com/shared/static/ppbnldsa5rty615osdjh2yi8fqcx0a3b.npy + +#YC2BB-Full model +wget -O ./weights/yc2bb_full-model.pth https://umich.box.com/shared/static/5ukbdcawryzkkq4r789z0src6u6uvg3u.pth From caa5db8be8311576bf6ab2b9be3f5310092ad0b8 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 9 Oct 2019 16:29:13 -0400 Subject: [PATCH 38/55] Update README --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 3231468..d46f8a8 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,13 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) 
| Model Architecture | Dataset | ViP Accuracy (%) | |:--------------------:|:------------------:|:---------------------:| | SSD300 | VOC2007 | 76.58 | + +### Video Object Grounding +| Model Architecture | Dataset | ViP Accuracy (%) | +|:--------------------:|:------------------:|:---------------------:| +| DVSA (+fw, obj) | YC2-BB (Validation) | 30.09 | +fw: framewise weighting +obj: object interaction ## Table of Contents * [Datasets](#configured-datasets) @@ -38,12 +45,14 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) |[ImageNetVID](http://bvisionweb1.cs.unc.edu/ilsvrc2015/download-videos-3j16.php) | Video Object Detection | |[MSCOCO 2014](http://cocodataset.org/#download) | Object Detection, Keypoints| |[VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/) | Object Detection, Classification| +|[YC2-BB](http://youcook2.eecs.umich.edu/download)| Video Object Grounding| ## Models | Model | Task(s) | |:------------------------------------------------:|:--------------------:| |[C3D](https://github.com/jfzhang95/pytorch-video-recognition/blob/master/network/C3D_model.py) | Activity Recognition | |[SSD300](https://github.com/amdegroot/ssd.pytorch) | Object Detection | +|[DVSA (+fw, obj)](https://github.com/MichiganCOG/Video-Grounding-from-Text)| Video Object Grounding| ## Requirements From 3103540019adcdd93963ae8b18a8f95f58a67b51 Mon Sep 17 00:00:00 2001 From: natlouis <38472719+natlouis@users.noreply.github.com> Date: Wed, 9 Oct 2019 20:06:50 -0400 Subject: [PATCH 39/55] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d46f8a8..359757b 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,8 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) | Model Architecture | Dataset | ViP Accuracy (%) | |:--------------------:|:------------------:|:---------------------:| | DVSA (+fw, obj) | YC2-BB (Validation) | 30.09 | -fw: framewise weighting -obj: object interaction + +**fw**: framewise weighting, **obj**: object interaction ## Table of Contents * [Datasets](#configured-datasets) From bcdc19f49f80c830b74c4ec311d0f0ed6bb20816 Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Wed, 9 Oct 2019 20:55:01 -0400 Subject: [PATCH 40/55] small fix --- datasets/YC2BB.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/YC2BB.py b/datasets/YC2BB.py index 0840a06..10266ca 100644 --- a/datasets/YC2BB.py +++ b/datasets/YC2BB.py @@ -147,7 +147,7 @@ def __getitem__(self, idx): bbox_data[trackid, frame_ind] = [label, -1, -1, -1, -1] else: if obj['occ'] or obj['outside']: - bbox_data[trackid, frame_ind] = [label, -1, -1, -1, -1] + bbox_data[trackid, frame_ind] = [-1, -1, -1, -1, -1] else: obj_bbox = obj['bbox'] # [xmin, ymin, xmax, ymax] From 19ba9db578648dbf98c50dde8c6963d826dca98c Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Thu, 10 Oct 2019 14:33:01 -0400 Subject: [PATCH 41/55] Add scaling parse args and description --- datasets/preprocessing_transforms.py | 1 + parse_args.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/datasets/preprocessing_transforms.py b/datasets/preprocessing_transforms.py index 78d0f4a..3bf74d0 100644 --- a/datasets/preprocessing_transforms.py +++ b/datasets/preprocessing_transforms.py @@ -677,6 +677,7 @@ class RandomZoomClip(PreprocTransform): """ Random zoom on all frames in a clip. All frames receive same scaling Scale will be bounded by object bounding box (if given). 
Meaning, object will always be in view + If zooming out, the borders will be filled with black. >1: Zoom in <1: Zoom out diff --git a/parse_args.py b/parse_args.py index 8c90b31..d2d240d 100644 --- a/parse_args.py +++ b/parse_args.py @@ -47,6 +47,8 @@ def __init__(self): parser.add_argument('--crop_shape', type=int, nargs=2, help='(Height, Width) of frame') parser.add_argument('--crop_type', type=str, help='Type of cropping operation (Random, Center and None)') parser.add_argument('--num_clips', type=int, help='Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips)') + parser.add_argument('--scale', type=float, nargs=2, help='[min scale, max scale] amounts to randomly scale videos for augmentation purposes. scale >1 zooms in and scale <1 zooms out. ') + parser.add_argument('--debug', type=int, help='Run an experiment but do not save any data or create any folders') parser.add_argument('--seed', type=int, help='Seed for reproducibility') @@ -76,7 +78,8 @@ def __init__(self): crop_type = None, num_clips = 1, debug = 0, - seed = 0) + seed = 0, + scale = [1,1]) From 009c1446de9e8cc94676acc3c5c79feb8cdce4eb Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 10 Oct 2019 14:55:06 -0400 Subject: [PATCH 42/55] update requirements --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index 5fa42ed..48e72fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,6 @@ tensorboardX==1.8 scipy==1.3.0 torch==1.1.0 torchvision==0.3.0 + +torchtext==0.2.1 +spacy==2.1.8 #Install 'en' package with: python -m spacy download en From cc1aaa2ab607e955b581c014faa935f24c4f72cd Mon Sep 17 00:00:00 2001 From: Nathan Louis Date: Thu, 10 Oct 2019 15:03:21 -0400 Subject: [PATCH 43/55] Auto-install spacy package --- install.sh | 1 + requirements.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/install.sh b/install.sh index eda0fe7..c443541 100755 --- a/install.sh +++ b/install.sh @@ -1,4 +1,5 @@ #!/bin/bash pip3 install -r requirements.txt +python -m spacy download en ./weights/download_weights.sh diff --git a/requirements.txt b/requirements.txt index 48e72fe..de36ea1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,4 @@ torch==1.1.0 torchvision==0.3.0 torchtext==0.2.1 -spacy==2.1.8 #Install 'en' package with: python -m spacy download en +spacy==2.1.8 From 0ef84c31644bed792ca28686f53c3c51131a8711 Mon Sep 17 00:00:00 2001 From: Madan Date: Tue, 22 Oct 2019 15:19:08 -0400 Subject: [PATCH 44/55] resolving issue of resetting validation accuracy by moving metric init into validation function --- train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/train.py b/train.py index 7f66ef4..e2e72e8 100644 --- a/train.py +++ b/train.py @@ -126,7 +126,6 @@ def train(**args): # END IF model_loss = Losses(device=device, **args) - acc_metric = Metrics(**args) best_val_acc = 0.0 ############################################################################################################################################################################ @@ -214,7 +213,7 @@ def train(**args): ## START FOR: Validation Accuracy running_acc = [] - running_acc = valid(valid_loader, running_acc, model, device, acc_metric) + running_acc = valid(valid_loader, running_acc, model, device) if not args['debug']: writer.add_scalar(args['dataset']+'/'+args['model']+'/validation_accuracy', 100.*running_acc[-1], epoch*len(train_loader) + step) @@ -242,9 
+241,10 @@ def train(**args): # Close Tensorboard Element writer.close() -def valid(valid_loader, running_acc, model, device, acc_metric): +def valid(valid_loader, running_acc, model, device): + acc_metric = Metrics(**args) model.eval() - + with torch.no_grad(): for step, data in enumerate(valid_loader): x_input = data['data'].to(device) From 901a7e0d8757ea493819b67252697f9ba0fd6f23 Mon Sep 17 00:00:00 2001 From: Madan Date: Tue, 22 Oct 2019 15:33:44 -0400 Subject: [PATCH 45/55] Removing extra code pertaining to feature extraction --- eval.py | 23 +---------------------- models/c3d/c3d.py | 5 ----- models/c3d/config_test.yaml | 2 -- models/c3d/config_train.yaml | 2 -- 4 files changed, 1 insertion(+), 31 deletions(-) diff --git a/eval.py b/eval.py index 8b1a57f..0df5a15 100644 --- a/eval.py +++ b/eval.py @@ -91,28 +91,13 @@ def eval(**args): # Setup Model To Evaluate model.eval() - ret_data = None - ret_labels = None - with torch.no_grad(): for step, data in enumerate(eval_loader): x_input = data['data'].to(device) annotations = data['annots'] outputs = model(x_input) - - if ret_data is None: - ret_data = outputs.cpu().numpy() - ret_labels = annotations['labels'].cpu().numpy()[:, 0] - - else: - ret_data = np.vstack((ret_data, outputs.cpu().numpy())) - ret_labels = np.hstack((ret_labels, annotations['labels'].cpu().numpy()[:, 0])) - - # END IF - - - acc = acc_metric.get_accuracy(outputs, annotations) + acc = acc_metric.get_accuracy(outputs, annotations) if step % 100 == 0: print('Step: {}/{} | {} acc: {:.4f}'.format(step, len(eval_loader), args['load_type'], acc)) @@ -120,12 +105,6 @@ def eval(**args): print('Accuracy of the network on the {} set: {:.3f} %\n'.format(args['load_type'], 100.*acc)) if not args['debug']: - ret_dict = {} - ret_dict['data'] = ret_data - ret_dict['labels'] = ret_labels - import scipy.io as sio - sio.savemat(os.path.join(result_dir,args['load_type']+'_'+args['dataset']+'.mat'), ret_dict) - writer.add_scalar(args['dataset']+'/'+args['model']+'/'+args['load_type']+'_accuracy', 100.*acc) # Close Tensorboard Element writer.close() diff --git a/models/c3d/c3d.py b/models/c3d/c3d.py index dfd78fc..69e4a74 100644 --- a/models/c3d/c3d.py +++ b/models/c3d/c3d.py @@ -55,8 +55,6 @@ def __init__(self, **kwargs): if isinstance(kwargs['pretrained'], int) and kwargs['pretrained']: self.__load_pretrained_weights() - self.features = kwargs['model_features'] - def forward(self, x, labels=False): x = self.relu(self.conv1(x)) x = self.pool1(x) @@ -80,9 +78,6 @@ def forward(self, x, labels=False): x = self.relu(self.fc6(x)) - if self.features: - return x - x = self.dropout(x) x = self.relu(self.fc7(x)) x = self.dropout(x) diff --git a/models/c3d/config_test.yaml b/models/c3d/config_test.yaml index 08f4e1c..d95e622 100644 --- a/models/c3d/config_test.yaml +++ b/models/c3d/config_test.yaml @@ -36,5 +36,3 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay - -model_features: 0 # 1 - return model features (before prediction), 0 - return model prediction output diff --git a/models/c3d/config_train.yaml b/models/c3d/config_train.yaml index 65a0cb2..4456e0d 100644 --- a/models/c3d/config_train.yaml +++ b/models/c3d/config_train.yaml @@ -36,5 +36,3 @@ rerun: 1 # Number of trials to repeat an experim save_dir: './results' # Path to results directory seed: 999 # Seed for reproducibility weight_decay: 0.0005 # Weight decay - -model_features: 0 # 1 - return model features 
(before prediction), 0 - return model prediction output From 2f26a0e1a2caa430b90cb53679efcc552628079c Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Thu, 24 Oct 2019 19:49:42 -0400 Subject: [PATCH 46/55] Add dataset loader and a test to load a video --- datasets/DHF1K.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 datasets/DHF1K.py diff --git a/datasets/DHF1K.py b/datasets/DHF1K.py new file mode 100644 index 0000000..d186810 --- /dev/null +++ b/datasets/DHF1K.py @@ -0,0 +1,104 @@ +import torch +try: + from .abstract_datasets import DetectionDataset +except: + from abstract_datasets import DetectionDataset +import cv2 +import os +import numpy as np +import json +try: + import datasets.preprocessing_transforms as pt +except: + import preprocessing_transforms as pt + +class DHF1K(DetectionDataset): + def __init__(self, *args, **kwargs): + super(DHF1K, self).__init__(*args, **kwargs) + + # Get model object in case preprocessing other than default is used + self.model_object = kwargs['model_obj'] + self.load_type = kwargs['load_type'] + + print(self.load_type) + if self.load_type=='train': + self.transforms = kwargs['model_obj'].train_transforms + + else: + self.transforms = kwargs['model_obj'].test_transforms + + + + + def __getitem__(self, idx): + vid_info = self.samples[idx] + + + base_path = vid_info['base_path'] + vid_size = vid_info['frame_size'] + + input_data = [] + map_data = [] + bin_data = [] + + for frame_ind in range(len(vid_info['frames'])): + frame = vid_info['frames'][frame_ind] + frame_path = frame['img_path'] + map_path = frame['map_path'] + bin_path = frame['bin_path'] + + # Load frame, convert to RGB from BGR and normalize from 0 to 1 + input_data.append(cv2.imread(os.path.join(base_path, frame_path))[...,::-1]/255.) + + # Load frame, Normalize from 0 to 1 + # All frame channels have repeated values + map_data.append(cv2.imread(map_path)/255.) + bin_data.append(cv2.imread(bin_path)/255.) 
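+            # 'map_path' points to DHF1K's continuous saliency map and 'bin_path' to its binary fixation map
+            # (the 'maps' and 'fixation' annotation folders); both stay at the original frame resolution here,
+            # since the annotations are only resized later in the loss/metric.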
+ + + + vid_data = self.transforms(input_data) + + # Annotations must be resized in the loss/metric + map_data = torch.Tensor(map_data) + bin_data = torch.Tensor(bin_data) + + # Permute the PIL dimensions (Frame, Height, Width, Chan) to pytorch (Chan, frame, height, width) + vid_data = vid_data.permute(3, 0, 1, 2) + map_data = map_data.permute(3, 0, 1, 2) + bin_data = bin_data.permute(3, 0, 1, 2) + # All channels are repeated so remove the unnecessary channels + map_data = map_data[0].unsqueeze(0) + bin_data = bin_data[0].unsqueeze(0) + + + ret_dict = dict() + ret_dict['data'] = vid_data + + annot_dict = dict() + annot_dict['map'] = map_data + annot_dict['bin'] = bin_data + annot_dict['input_shape'] = vid_data.size() + annot_dict['name'] = base_path + ret_dict['annots'] = annot_dict + + return ret_dict + + +if __name__=='__main__': + + class tts(): + def __call__(self, x): + return pt.ToTensorClip()(x) + class debug_model(): + def __init__(self): + self.train_transforms = tts() + dataset = DHF1K(model_obj=debug_model(), json_path='/z/home/erichof/datasets/DHF1K', load_type='train', clip_length=16, clip_offset=0, clip_stride=1, num_clips=0, random_offset=0, resize_shape=0, crop_shape=0, crop_type='Center', final_shape=0, batch_size=1) + train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=1, shuffle=False) + + + import matplotlib.pyplot as plt + for x in enumerate(train_loader): + plt.imshow(x[1]['data'][0,:,0].permute(1,2,0).numpy()) + #plt.show() + import pdb; pdb.set_trace() From 46cb1823dec95c3ba65d7497b0eed2b330141a34 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Thu, 24 Oct 2019 19:52:08 -0400 Subject: [PATCH 47/55] Add json generation script --- datasets/DHF1K.py | 3 +- datasets/scripts/gen_json_DHF1K.py | 74 ++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 datasets/scripts/gen_json_DHF1K.py diff --git a/datasets/DHF1K.py b/datasets/DHF1K.py index d186810..467bb15 100644 --- a/datasets/DHF1K.py +++ b/datasets/DHF1K.py @@ -93,7 +93,8 @@ def __call__(self, x): class debug_model(): def __init__(self): self.train_transforms = tts() - dataset = DHF1K(model_obj=debug_model(), json_path='/z/home/erichof/datasets/DHF1K', load_type='train', clip_length=16, clip_offset=0, clip_stride=1, num_clips=0, random_offset=0, resize_shape=0, crop_shape=0, crop_type='Center', final_shape=0, batch_size=1) + json_path = '/path/to/json' + dataset = DHF1K(model_obj=debug_model(), json_path=json_path, load_type='train', clip_length=16, clip_offset=0, clip_stride=1, num_clips=0, random_offset=0, resize_shape=0, crop_shape=0, crop_type='Center', final_shape=0, batch_size=1) train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=1, shuffle=False) diff --git a/datasets/scripts/gen_json_DHF1K.py b/datasets/scripts/gen_json_DHF1K.py new file mode 100644 index 0000000..02c05eb --- /dev/null +++ b/datasets/scripts/gen_json_DHF1K.py @@ -0,0 +1,74 @@ +import os +import cv2 +import json + + +def get_split(base_vid_path): + vids = os.listdir(base_vid_path) + vids = [int(vid) for vid in vids] + vids.sort() + + # Out of the 1000 videos, the first 600 are annotated for training, 601-700 annotated for val, 701-1000 not annotated must be sent in to test + train_cutoff = 600 + val_cutoff = 700 + train_vids = vids[:vids.index(600)+1] + val_vids = vids[vids.index(600)+1:vids.index(700)+1] + test_vids = vids[vids.index(700)+1:] + + train_vids = [str(vid).zfill(3) for vid in train_vids] + test_vids = [str(vid).zfill(3) for vid in test_vids] + 
val_vids = [str(vid).zfill(3) for vid in val_vids] + annot_train_vids = [vid.zfill(4) for vid in train_vids] + annot_val_vids = [vid.zfill(4) for vid in val_vids] + return train_vids, test_vids, val_vids, annot_train_vids, annot_val_vids + + +def save_json(load_type): + base_vid_path = '/path/to/DHF1K/video_png' + base_annot_path = '/path/to/DHF1K/annotation' + output_path = '/any/path/' + + train_vids, test_vids, val_vids, annot_train, annot_val = get_split(base_vid_path) + + if load_type == 'train': + tv_vids = train_vids + tv_ann = annot_train + elif load_type == 'val': + tv_vids = val_vids + tv_ann = annot_val + + else: + tv_vids = test_vids + tv_ann = [] + + json_dat = [] + for vid in sorted(tv_vids): + vid_dict = {} + frames = [] + frame_size = [] + for img in sorted(os.listdir(os.path.join(base_vid_path, vid))): + if frame_size == []: + frame_shape = cv2.imread(os.path.join(base_vid_path, vid, img)).shape + frame_size = [frame_shape[1], frame_shape[0]] # Width, Height + frame_dict = {} + frame_dict['img_path'] = img + if load_type != 'test': + frame_dict['map_path'] = os.path.join(base_annot_path, tv_ann[tv_vids.index(vid)], 'maps', img) + frame_dict['bin_path'] = os.path.join(base_annot_path, tv_ann[tv_vids.index(vid)], 'fixation', img) + else: + frame_dict['map_path'] = '' + frame_dict['bin_path'] = '' + + frames.append(frame_dict) + vid_dict['base_path'] = os.path.join(base_vid_path, vid) + vid_dict['frames'] = frames + vid_dict['frame_size'] = frame_size + json_dat.append(vid_dict) + + writef = open(os.path.join(output_path,load_type+'.json'), 'w') + json.dump(json_dat, writef) + writef.close() + +save_json('train') +save_json('val') +save_json('test') From d2d406994c08287a3e57c5f2764f8408a2300065 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Thu, 24 Oct 2019 20:27:13 -0400 Subject: [PATCH 48/55] Add i3d model and weights links and train/test scripts --- .gitignore | 1 + README.md | 2 + models/i3d/config_test.yaml | 27 +++ models/i3d/config_train.yaml | 37 +++ models/i3d/i3d.py | 447 +++++++++++++++++++++++++++++++++++ weights/download_weights.sh | 6 + 6 files changed, 520 insertions(+) create mode 100644 models/i3d/config_test.yaml create mode 100644 models/i3d/config_train.yaml create mode 100644 models/i3d/i3d.py diff --git a/.gitignore b/.gitignore index c70c4d2..0e4172d 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ runs/* models/HGC3D *.json pbs/* +weights/* diff --git a/README.md b/README.md index 3231468..74bc01e 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) ### Recognition | Model Architecture | Dataset | ViP Accuracy (%) | |:--------------------:|:------------------:|:---------------------:| +| I3D | HMDB51 (Split 1) | 72.75 | | C3D | HMDB51 (Split 1) | 50.14 ± 0.777 | | C3D | UCF101 (Split 1) | 80.40 ± 0.399 | @@ -43,6 +44,7 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) | Model | Task(s) | |:------------------------------------------------:|:--------------------:| |[C3D](https://github.com/jfzhang95/pytorch-video-recognition/blob/master/network/C3D_model.py) | Activity Recognition | +|[I3D](https://github.com/piergiaj/pytorch-i3d) | Activity Recognition | |[SSD300](https://github.com/amdegroot/ssd.pytorch) | Object Detection | ## Requirements diff --git a/models/i3d/config_test.yaml b/models/i3d/config_test.yaml new file mode 100644 index 0000000..72ae825 --- /dev/null +++ b/models/i3d/config_test.yaml @@ -0,0 +1,27 @@ +# Preprocessing +clip_length: 64 # 
Number of frames within a clip +clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) +clip_stride: 0 # Frame offset between successive frames +crop_shape: [224,224] # (Height, Width) of frame +crop_type: Center # Type of cropping operation (Random, Central and None) +final_shape: [224,224] # (Height, Width) of input to be given to CNN +num_clips: -1 # Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips) +random_offset: 0 # Boolean switch to generate a clip length sized clip from a video +resize_shape: [230,250] # (Height, Width) to resize original data +subtract_mean: [123,117,104] # Subtract mean (R,G,B) from all frames during preprocessing + +# Experiment Setup +acc_metric: Accuracy # Accuracy metric +batch_size: 1 # Numbers of videos in a mini-batch +dataset: HMDB51 # Name of dataset +exp: I3D # Experiment name +json_path: /z/dat/HMDB51/ # Path to the json file for the given dataset +labels: 51 # Number of total classes in the dataset +load_type: train_val # Environment selection, to include only training/training and validation/testing dataset +model: I3D # Name of model to be loaded +num_workers: 5 # Number of CPU worker used to load data +preprocess: default # String argument to select preprocessing type +pretrained: 'weights/i3d_rgb_imagenet_then_HMDB51_30epochs.pkl' # Load pretrained network +save_dir: './results' # Path to results directory +seed: 999 # Seed for reproducibility +loss_type: M_XENTROPY # Loss function diff --git a/models/i3d/config_train.yaml b/models/i3d/config_train.yaml new file mode 100644 index 0000000..c15abe9 --- /dev/null +++ b/models/i3d/config_train.yaml @@ -0,0 +1,37 @@ +# Preprocessing +clip_length: 64 # Number of frames within a clip +clip_offset: 0 # Frame offset between beginning of video and clip (1st clip only) +clip_stride: 1 # Frame offset between successive frames +crop_shape: [224,224] # (Height, Width) of frame +crop_type: Center # Type of cropping operation (Random, Central and None) +final_shape: [224,224] # (Height, Width) of input to be given to CNN +num_clips: -1 # Number clips to be generated from a video (<0: uniform sampling, 0: Divide entire video into clips, >0: Defines number of clips) +random_offset: 0 # Boolean switch to generate a clip length sized clip from a video +resize_shape: [230,250] # (Height, Width) to resize original data +subtract_mean: [123,117,104] # Subtract mean (R,G,B) from all frames during preprocessing + +# Experiment Setup +acc_metric: Accuracy # Accuracy metric +batch_size: 5 # Numbers of videos in a mini-batch +pseudo_batch_loop: 10 # Pseudo-batch size multiplier to mimic large minibatches +dataset: HMDB51 # Name of dataset +epoch: 30 # Total number of epochs +exp: I3D # Experiment name +gamma: 0.1 # Multiplier with which to change learning rate +json_path: /z/dat/HMDB51/ # Path to the json file for the given dataset +labels: 51 # Number of total classes in the dataset +load_type: train # Environment selection, to include only training/training and validation/testing dataset +loss_type: M_XENTROPY # Loss function +lr: 0.01 # Learning rate +milestones: [10, 20] # Epoch values to change learning rate +model: I3D # Name of model to be loaded +momentum: 0.9 # Momentum value in optimizer +num_workers: 5 # Number of CPU worker used to load data +opt: sgd # Name of optimizer +preprocess: default # String argument to select preprocessing type +pretrained: 1 # Load pretrained network +rerun: 1 # Number of trials to 
repeat an experiment +save_dir: './results' # Path to results directory +seed: 999 # Seed for reproducibility +weight_decay: 0.0005 # Weight decay +grad_max_norm: 100 diff --git a/models/i3d/i3d.py b/models/i3d/i3d.py new file mode 100644 index 0000000..4e901b3 --- /dev/null +++ b/models/i3d/i3d.py @@ -0,0 +1,447 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +import datasets.preprocessing_transforms as pt + +import numpy as np + +import os +import sys +from collections import OrderedDict + + +""" +Code from the implementation of i3d by AJ Piergiovanni: https://github.com/piergiaj/pytorch-i3d +""" + +class MaxPool3dSamePadding(nn.MaxPool3d): + + def compute_pad(self, dim, s): + if s % self.stride[dim] == 0: + return max(self.kernel_size[dim] - self.stride[dim], 0) + else: + return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) + + def forward(self, x): + # compute 'same' padding + (batch, channel, t, h, w) = x.size() + #print t,h,w + out_t = np.ceil(float(t) / float(self.stride[0])) + out_h = np.ceil(float(h) / float(self.stride[1])) + out_w = np.ceil(float(w) / float(self.stride[2])) + #print out_t, out_h, out_w + pad_t = self.compute_pad(0, t) + pad_h = self.compute_pad(1, h) + pad_w = self.compute_pad(2, w) + #print pad_t, pad_h, pad_w + + pad_t_f = pad_t // 2 + pad_t_b = pad_t - pad_t_f + pad_h_f = pad_h // 2 + pad_h_b = pad_h - pad_h_f + pad_w_f = pad_w // 2 + pad_w_b = pad_w - pad_w_f + + pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) + x = F.pad(x, pad) + return super(MaxPool3dSamePadding, self).forward(x) + + +class Unit3D(nn.Module): + + def __init__(self, in_channels, + output_channels, + kernel_shape=(1, 1, 1), + stride=(1, 1, 1), + padding=0, + activation_fn=F.relu, + use_batch_norm=True, + use_bias=False, + name='unit_3d', + dilation=1): + + """Initializes Unit3D module.""" + super(Unit3D, self).__init__() + + self._output_channels = output_channels + self._kernel_shape = kernel_shape + self._stride = stride + self._use_batch_norm = use_batch_norm + self._activation_fn = activation_fn + self._use_bias = use_bias + self.name = name + self.padding = padding + + self.conv3d = nn.Conv3d(in_channels=in_channels, + out_channels=self._output_channels, + kernel_size=self._kernel_shape, + stride=self._stride, + padding=0, # we always want padding to be 0 here. 
We will dynamically pad based on input size in forward function + bias=self._use_bias, + dilation=dilation) + + if self._use_batch_norm: + self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01) + + def compute_pad(self, dim, s): + if s % self._stride[dim] == 0: + return max(self._kernel_shape[dim] - self._stride[dim], 0) + else: + return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) + + + def forward(self, x): + # compute 'same' padding + (batch, channel, t, h, w) = x.size() + #print t,h,w + out_t = np.ceil(float(t) / float(self._stride[0])) + out_h = np.ceil(float(h) / float(self._stride[1])) + out_w = np.ceil(float(w) / float(self._stride[2])) + #print out_t, out_h, out_w + pad_t = self.compute_pad(0, t) + pad_h = self.compute_pad(1, h) + pad_w = self.compute_pad(2, w) + #print pad_t, pad_h, pad_w + + pad_t_f = pad_t // 2 + pad_t_b = pad_t - pad_t_f + pad_h_f = pad_h // 2 + pad_h_b = pad_h - pad_h_f + pad_w_f = pad_w // 2 + pad_w_b = pad_w - pad_w_f + + pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b) + x = F.pad(x, pad) + + x = self.conv3d(x) + if self._use_batch_norm: + x = self.bn(x) + if self._activation_fn is not None: + x = self._activation_fn(x) + return x + + + +class InceptionModule(nn.Module): + def __init__(self, in_channels, out_channels, name): + super(InceptionModule, self).__init__() + + self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, + name=name+'/Branch_0/Conv3d_0a_1x1') + self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, + name=name+'/Branch_1/Conv3d_0a_1x1') + self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], + name=name+'/Branch_1/Conv3d_0b_3x3') + self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, + name=name+'/Branch_2/Conv3d_0a_1x1') + self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], + name=name+'/Branch_2/Conv3d_0b_3x3') + self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], + stride=(1, 1, 1), padding=0) + self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0, + name=name+'/Branch_3/Conv3d_0b_1x1') + self.name = name + + def forward(self, x): + b0 = self.b0(x) + b1 = self.b1b(self.b1a(x)) + b2 = self.b2b(self.b2a(x)) + b3 = self.b3b(self.b3a(x)) + return torch.cat([b0,b1,b2,b3], dim=1) + + +class I3D(nn.Module): + """Inception-v1 I3D architecture. + The model is introduced in: + Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset + Joao Carreira, Andrew Zisserman + https://arxiv.org/pdf/1705.07750v1.pdf. + See also the Inception architecture, introduced in: + Going deeper with convolutions + Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, + Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. + http://arxiv.org/pdf/1409.4842v1.pdf. + """ + + # Endpoints of the model in order. During construction, all the endpoints up + # to a designated `final_endpoint` are returned in a dictionary as the + # second return value. 
+ VALID_ENDPOINTS = ( + 'Conv3d_1a_7x7', + 'MaxPool3d_2a_3x3', + 'Conv3d_2b_1x1', + 'Conv3d_2c_3x3', + 'MaxPool3d_3a_3x3', + 'Mixed_3b', + 'Mixed_3c', + 'MaxPool3d_4a_3x3', + 'Mixed_4b', + 'Mixed_4c', + 'Mixed_4d', + 'Mixed_4e', + 'Mixed_4f', + 'MaxPool3d_5a_2x2', + 'Mixed_5b', + 'Mixed_5c', + 'Logits', + 'Predictions', + ) + + def __init__(self, spatial_squeeze=True, + final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5, **kwargs): + """Initializes I3D model instance. + Args: + num_classes: The number of outputs in the logit layer (default 400, which + matches the Kinetics dataset). + spatial_squeeze: Whether to squeeze the spatial dimensions for the logits + before returning (default True). + final_endpoint: The model contains many possible endpoints. + `final_endpoint` specifies the last endpoint for the model to be built + up to. In addition to the output at `final_endpoint`, all the outputs + at endpoints up to `final_endpoint` will also be returned, in a + dictionary. `final_endpoint` must be one of + InceptionI3d.VALID_ENDPOINTS (default 'Logits'). + name: A string (optional). The name of this module. + Raises: + ValueError: if `final_endpoint` is not recognized. + """ + + if final_endpoint not in self.VALID_ENDPOINTS: + raise ValueError('Unknown final endpoint %s' % final_endpoint) + + super(I3D, self).__init__() + self._num_classes = kwargs['labels'] + self._spatial_squeeze = spatial_squeeze + self._final_endpoint = final_endpoint + self.logits = None + + self.train_transforms = PreprocessTrain(**kwargs) + self.test_transforms = PreprocessEval(**kwargs) + + + if self._final_endpoint not in self.VALID_ENDPOINTS: + raise ValueError('Unknown final endpoint %s' % self._final_endpoint) + + self.end_points = {} + end_point = 'Conv3d_1a_7x7' + self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], + stride=(2, 2, 2), padding=(3,3,3), name=name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'MaxPool3d_2a_3x3' + self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), + padding=0) + if self._final_endpoint == end_point: return + + end_point = 'Conv3d_2b_1x1' + self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, + name=name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Conv3d_2c_3x3' + self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, + name=name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'MaxPool3d_3a_3x3' + self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), + padding=0) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_3b' + self.end_points[end_point] = InceptionModule(192, [64,96,128,16,32,32], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_3c' + self.end_points[end_point] = InceptionModule(256, [128,128,192,32,96,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'MaxPool3d_4a_3x3' + self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), + padding=0) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4b' + self.end_points[end_point] = InceptionModule(128+192+96+64, [192,96,208,16,48,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4c' + 
self.end_points[end_point] = InceptionModule(192+208+48+64, [160,112,224,24,64,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4d' + self.end_points[end_point] = InceptionModule(160+224+64+64, [128,128,256,24,64,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4e' + self.end_points[end_point] = InceptionModule(128+256+64+64, [112,144,288,32,64,64], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_4f' + self.end_points[end_point] = InceptionModule(112+288+64+64, [256,160,320,32,128,128], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'MaxPool3d_5a_2x2' + self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2), + padding=0) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_5b' + self.end_points[end_point] = InceptionModule(256+320+128+128, [256,160,320,32,128,128], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Mixed_5c' + self.end_points[end_point] = InceptionModule(256+320+128+128, [384,192,384,48,128,128], name+end_point) + if self._final_endpoint == end_point: return + + end_point = 'Logits' + self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], + stride=(1, 1, 1)) + self.dropout = nn.Dropout(dropout_keep_prob) + self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes, + kernel_shape=[1, 1, 1], + padding=0, + activation_fn=None, + use_batch_norm=False, + use_bias=True, + name='logits') + + + + + + self.build() + + if 'pretrained' in kwargs.keys() and kwargs['pretrained']: + if 'i3d_pretrained' in kwargs.keys(): + self._load_checkpoint(kwargs['i3d_pretrained']) + else: + self._load_pretrained_weights() + + def _load_pretrained_weights(self): + p_dict = torch.load('weights/i3d_rgb_imagenet.pt') + s_dict = self.state_dict() + for name in p_dict: + if name in s_dict.keys(): + if p_dict[name].shape == s_dict[name].shape: + s_dict[name] = p_dict[name] + + self.load_state_dict(s_dict) + + def _load_checkpoint(self, saved_weights): + p_dict = torch.load(saved_weights)['state_dict'] + s_dict = self.state_dict() + for name in p_dict: + if name in s_dict.keys(): + if p_dict[name].shape == s_dict[name].shape: + s_dict[name] = p_dict[name] + + self.load_state_dict(s_dict) + + + + def replace_logits(self, num_classes): + self._num_classes = num_classes + self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes, + kernel_shape=[1, 1, 1], + padding=0, + activation_fn=None, + use_batch_norm=False, + use_bias=True, + name='logits') + + + def build(self): + for k in self.end_points.keys(): + self.add_module(k, self.end_points[k]) + + def forward(self, x): + for end_point in self.VALID_ENDPOINTS: + if end_point in self.end_points: + x = self._modules[end_point](x) # use _modules to work with dataparallel + + x = self.logits(self.dropout(self.avg_pool(x))) + + if self._spatial_squeeze: + logits = x.squeeze(3).squeeze(3) + # logits is batch X classes X time, which is what we want to work with + + logits = torch.mean(logits, dim=2) + return logits + + + def extract_features(self, x): + for end_point in self.VALID_ENDPOINTS: + if end_point in self.end_points: + x = self._modules[end_point](x) + return self.avg_pool(x) + +class PreprocessTrain(object): + """ + Container for all transforms used to preprocess clips for training in this dataset. 
+ """ + def __init__(self, **kwargs): + """ + Initialize preprocessing class for training set + Args: + preprocess (String): Keyword to select different preprocessing types + crop_type (String): Select random or central crop + + Return: + None + """ + + self.transforms = [] + self.transforms1 = [] + self.preprocess = kwargs['preprocess'] + crop_type = kwargs['crop_type'] + + + self.transforms.append(pt.ResizeClip(**kwargs)) + + if crop_type == 'Random': + self.transforms.append(pt.RandomCropClip(**kwargs)) + + else: + self.transforms.append(pt.CenterCropClip(**kwargs)) + + self.transforms.append(pt.SubtractRGBMean(**kwargs)) + self.transforms.append(pt.RandomFlipClip(direction='h', p=0.5, **kwargs)) + self.transforms.append(pt.ToTensorClip(**kwargs)) + + def __call__(self, input_data): + for transform in self.transforms: + input_data = transform(input_data) + + return input_data + + +class PreprocessEval(object): + """ + Container for all transforms used to preprocess clips for training in this dataset. + """ + def __init__(self, **kwargs): + """ + Initialize preprocessing class for training set + Args: + preprocess (String): Keyword to select different preprocessing types + crop_type (String): Select random or central crop + + Return: + None + """ + + self.transforms = [] + + self.transforms.append(pt.ResizeClip(**kwargs)) + self.transforms.append(pt.CenterCropClip(**kwargs)) + self.transforms.append(pt.SubtractRGBMean(**kwargs)) + self.transforms.append(pt.ToTensorClip(**kwargs)) + + + def __call__(self, input_data): + for transform in self.transforms: + input_data = transform(input_data) + + return input_data diff --git a/weights/download_weights.sh b/weights/download_weights.sh index 3cbfec6..aef001d 100755 --- a/weights/download_weights.sh +++ b/weights/download_weights.sh @@ -13,3 +13,9 @@ wget -O ./weights/c3d-pretrained.pth https://umich.box.com/shared/static/znmyt8u #C3D Mean wget -O ./weights/sport1m_train16_128_mean.npy https://umich.box.com/shared/static/ppbnldsa5rty615osdjh2yi8fqcx0a3b.npy + +#I3D pretrained on ImageNet and then Kinetics by original authors +wget -O ./weights/i3d_rgb_imagenet.pt https://umich.box.com/shared/static/5m6dwwepzdcw3kjhx7s0peb59lbcde0s.pt + +#I3D pretrained on ImageNet, Kinetics, then on HMDB51 in ViP +wget -O ./weights/i3d_rgb_imagenet_then_HMDB51_30epochs.pkl https://umich.box.com/shared/static/x8x83sw4htidxsxgtus9nt00f383mmm7.pkl From 58b32a7e66c15a5cbd8acf128aebcbae252c63a2 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Thu, 24 Oct 2019 20:30:11 -0400 Subject: [PATCH 49/55] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3231468..fb1661f 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) |[ImageNetVID](http://bvisionweb1.cs.unc.edu/ilsvrc2015/download-videos-3j16.php) | Video Object Detection | |[MSCOCO 2014](http://cocodataset.org/#download) | Object Detection, Keypoints| |[VOC2007](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/) | Object Detection, Classification| +|[DHF1K](https://github.com/wenguanwang/DHF1K) | Video Saliency Prediction| ## Models | Model | Task(s) | From f017c00da6d24798803a46aebc450b04c5196cee Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 25 Oct 2019 08:38:33 -0400 Subject: [PATCH 50/55] Complete DHF1K test --- .gitignore | 1 + datasets/DHF1K.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index c70c4d2..0e4172d 
100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ runs/* models/HGC3D *.json pbs/* +weights/* diff --git a/datasets/DHF1K.py b/datasets/DHF1K.py index 467bb15..01e05f7 100644 --- a/datasets/DHF1K.py +++ b/datasets/DHF1K.py @@ -93,13 +93,21 @@ def __call__(self, x): class debug_model(): def __init__(self): self.train_transforms = tts() - json_path = '/path/to/json' + + + json_path = '/path/to/DHF1K' #### Change this when testing #### + + dataset = DHF1K(model_obj=debug_model(), json_path=json_path, load_type='train', clip_length=16, clip_offset=0, clip_stride=1, num_clips=0, random_offset=0, resize_shape=0, crop_shape=0, crop_type='Center', final_shape=0, batch_size=1) train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=1, shuffle=False) import matplotlib.pyplot as plt for x in enumerate(train_loader): - plt.imshow(x[1]['data'][0,:,0].permute(1,2,0).numpy()) - #plt.show() + dat = x[1]['data'][0,:,0].permute(1,2,0).numpy() + bin = x[1]['annots']['bin'][0,:,0].permute(1,2,0).numpy().repeat(3,axis=2) + map = x[1]['annots']['map'][0,:,0].permute(1,2,0).numpy().repeat(3, axis=2) + img = np.concatenate([dat,bin,map], axis=0) + plt.imshow(img) + plt.show() import pdb; pdb.set_trace() From 92ad95b3c9cdc64b267e3ee736921dbdacd04dcf Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 25 Oct 2019 08:52:06 -0400 Subject: [PATCH 51/55] Update README.md Add citation and link to arXiv paper --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3231468..861885a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Video Platform for Recognition and Detection in Pytorch +# [Video Platform for Recognition and Detection in Pytorch](https://arxiv.org/abs/1910.02793) A platform for quick and easy development of deep learning networks for recognition and detection in videos. Includes popular models like C3D and SSD. 
@@ -16,6 +16,22 @@ Check out our [wiki!](https://github.com/MichiganCOG/ViP/wiki) | Model Architecture | Dataset | ViP Accuracy (%) | |:--------------------:|:------------------:|:---------------------:| | SSD300 | VOC2007 | 76.58 | + +## Citation + +Please cite ViP when releasing any work that used this platform: https://arxiv.org/abs/1910.02793 + +``` +@article{ganesh2019vip, + title={ViP: Video Platform for PyTorch}, + author={Ganesh, Madan Ravi and Hofesmann, Eric and Louis, Nathan and Corso, Jason}, + journal={arXiv preprint arXiv:1910.02793}, + year={2019} +} + +``` + + ## Table of Contents * [Datasets](#configured-datasets) From d9b926af18f24d521bf579b726eb989d4556f2ae Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 25 Oct 2019 16:13:22 -0400 Subject: [PATCH 52/55] Ignore .pt files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c70c4d2..5dbdb9f 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ runs/* models/HGC3D *.json pbs/* +*.pt From ba7c110fb399474505ce7160d54f707c8ceddced Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 25 Oct 2019 16:14:31 -0400 Subject: [PATCH 53/55] Ignore .pt files --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0e4172d..5dbdb9f 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,4 @@ runs/* models/HGC3D *.json pbs/* -weights/* +*.pt From 8e91e396307cdcb9bc6e9661aed0bf9dabb75442 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 25 Oct 2019 16:16:14 -0400 Subject: [PATCH 54/55] remove .pt --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0e4172d..5dbdb9f 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,4 @@ runs/* models/HGC3D *.json pbs/* -weights/* +*.pt From 0870ee5da8f84df8c6e32ba4279bf5f4db66b5c0 Mon Sep 17 00:00:00 2001 From: Eric Hofesmann Date: Fri, 25 Oct 2019 16:42:38 -0400 Subject: [PATCH 55/55] Update requirements.txt Newer version of pillow --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5fa42ed..519b704 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ numpy==1.17.0 opencv-python==4.1.0.25 -Pillow==6.1.0 +pillow>=6.2.0 protobuf==3.9.0 PyYAML==5.1.1 scipy==1.3.0
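A few reference notes on the code introduced above.

`gen_json_DHF1K.py` (PATCH 47/55) writes one `<split>.json` file per split into `output_path`, with one entry per video: a `base_path`, a `frame_size` stored as `[width, height]`, and a `frames` list whose items hold the frame file name in `img_path` plus `map_path` (under the annotation `maps` directory) and `bin_path` (under `fixation`), both left as empty strings for the unannotated test split. A minimal sketch that inspects one entry, reusing the placeholder `output_path` value from the script:

```
import json
import os

output_path = '/any/path/'  # placeholder value, as in gen_json_DHF1K.py

with open(os.path.join(output_path, 'train.json')) as f:
    videos = json.load(f)

vid = videos[0]
print(vid['base_path'])    # e.g. /path/to/DHF1K/video_png/001
print(vid['frame_size'])   # [width, height], read from the first frame
print(len(vid['frames']))  # number of frames in the video

frame = vid['frames'][0]
print(frame['img_path'])   # image file name, relative to base_path
print(frame['map_path'])   # saliency map path ('' in test.json)
print(frame['bin_path'])   # fixation map path ('' in test.json)
```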
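`MaxPool3dSamePadding` and `Unit3D` in `i3d.py` (PATCH 48/55) recompute TensorFlow-style 'same' padding from the input size on every forward pass: for each of the T, H, W dimensions the total pad is `max(kernel - stride, 0)` when the size is a multiple of the stride and `max(kernel - size % stride, 0)` otherwise, split so that any odd element goes on the trailing side. A small standalone sketch of that rule:

```
import math

def same_pad_1d(size, kernel, stride):
    # Mirrors compute_pad() in MaxPool3dSamePadding and Unit3D for one dimension.
    if size % stride == 0:
        total = max(kernel - stride, 0)
    else:
        total = max(kernel - size % stride, 0)
    front = total // 2
    return front, total - front

# A temporal dimension of 64 with a kernel of 3 and a stride of 2:
front, back = same_pad_1d(64, kernel=3, stride=2)
print(front, back)                       # 0 1
print((64 + front + back - 3) // 2 + 1)  # 32, i.e. ceil(64 / 2)
print(math.ceil(64 / 2))                 # 32
```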
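`I3D._load_pretrained_weights` and `I3D._load_checkpoint` copy a checkpoint tensor into the model only when both the parameter name and the tensor shape match, so entries such as a Kinetics logits layer with a different number of classes are skipped rather than raising a size-mismatch error. A standalone sketch of the same pattern; the model object and checkpoint path below are placeholders:

```
import torch

def load_matching_weights(model, checkpoint_path):
    # Copy only tensors whose name and shape match the current model,
    # following I3D._load_pretrained_weights / I3D._load_checkpoint.
    loaded = torch.load(checkpoint_path, map_location='cpu')
    p_dict = loaded['state_dict'] if 'state_dict' in loaded else loaded
    s_dict = model.state_dict()
    for name, tensor in p_dict.items():
        if name in s_dict and tensor.shape == s_dict[name].shape:
            s_dict[name] = tensor
    model.load_state_dict(s_dict)

# e.g. load_matching_weights(model, 'weights/i3d_rgb_imagenet.pt')
```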