From 3a1eaeb68f95d2cad73d09d057304f0da3e9b130 Mon Sep 17 00:00:00 2001 From: xusu Date: Wed, 28 Oct 2020 18:49:33 +0800 Subject: [PATCH 01/12] [Feature] Add motion decoder. --- mmaction/datasets/pipelines/loading.py | 54 ++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index cc27ed9315..e0197860ce 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -814,6 +814,60 @@ def __repr__(self): return repr_str +class FFmpegDecodeMotionVector(object): + """ + Ref: + https://github.com/chaoyuaw/pytorch-coviar. + """ + + def __init__(self, residual=False, gop_size=16): + self.residual = residual + self.gop_size = gop_size + + def _clip_and_scale(self, img, size): + return (img * (127.5 / size) + 128).astype(np.int32) + + def __call__(self, results): + try: + import coviar.load + except ImportError: + print('Please install coviar first.') + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + frame_inds = results['frame_inds'] + video_path = results['filename'] + motion_vectors = list() + for clip_idx in range(len(frame_inds)): + clip_mvs = list() + for frame_idx in frame_inds[clip_idx]: + gop_idx = frame_idx // self.gop_size + gop_pos = frame_idx % self.gop_size + if gop_pos == 0: + gop_idx = -1 + gop_pos = self.gop_size + mv = coviar.load(video_path, gop_idx, gop_pos, + 2 if self.residual else 1, False) + + if mv is None: + mv = np.zeros((256, 256, 2)) + else: + # FIXME: 20 is the scale of mv, should be tested, + # but it's ok as long as it is a linearly proportional + mv = self._clip_and_scale(mv, 20) + mv = (np.minimum(np.maximum(mv, 0), 255)).astype(np.uint8) + clip_mvs.append(mv) + motion_vectors.append(clip_mvs) + + results['motion_vectors'] = motion_vectors + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += 
f'(residual={self.residual})' + return repr_str + + @PIPELINES.register_module() class DecordInit: """Using decord to initialize the video_reader. From aabf9f889d6053a10bf95e8c56700338a3d92b89 Mon Sep 17 00:00:00 2001 From: xusu Date: Thu, 29 Oct 2020 19:01:59 +0800 Subject: [PATCH 02/12] Fix the bad double loop with numpy broadcast and slice. --- .../tsn_r50_1x1x3_75e_ucf101_motion_vector.py | 114 +++++++++++++++++ mmaction/datasets/pipelines/__init__.py | 4 +- mmaction/datasets/pipelines/loading.py | 120 ++++++++++++++++-- tests/test_data/test_loading.py | 16 ++- 4 files changed, 238 insertions(+), 16 deletions(-) create mode 100644 configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py diff --git a/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py b/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py new file mode 100644 index 0000000000..9edc45bb2b --- /dev/null +++ b/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py @@ -0,0 +1,114 @@ +# model settings +model = dict( + type='Recognizer2D', + backbone=dict( + type='ResNet', + pretrained='torchvision://resnet50', + in_channels=2, + depth=18, + norm_eval=False), + cls_head=dict( + type='TSNHead', + num_classes=101, + in_channels=512, + spatial_type='avg', + consensus=dict(type='AvgConsensus', dim=1), + dropout_ratio=0.4, + init_std=0.001)) +# model training and testing settings +train_cfg = None +test_cfg = dict(average_clips=None) +# dataset settings +dataset_type = 'RawframeDataset' +data_root = 'data/ucf101/rawframes/' +data_root_val = 'data/ucf101/rawframes/' +split = 1 # official train/test splits. 
valid numbers: 1, 2, 3 +ann_file_train = f'data/ucf101/ucf101_train_split_{split}_rawframes.txt' +ann_file_val = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt' +ann_file_test = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) +train_pipeline = [ + dict(type='PyAVInit'), + dict(type='PyAVDecodeMotionVector'), + dict(type='Resize', scale=(-1, 16)), + dict(type='CenterCrop', crop_size=16), + dict(type='Flip', flip_ratio=0.5), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs', 'label']) +] +val_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=3, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='CenterCrop', crop_size=256), + dict(type='Flip', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +test_pipeline = [ + dict( + type='SampleFrames', + clip_len=1, + frame_interval=1, + num_clips=25, + test_mode=True), + dict(type='RawFrameDecode'), + dict(type='Resize', scale=(-1, 256)), + dict(type='ThreeCrop', crop_size=256), + dict(type='Flip', flip_ratio=0), + dict(type='Normalize', **img_norm_cfg), + dict(type='FormatShape', input_format='NCHW'), + dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='ToTensor', keys=['imgs']) +] +data = dict( + videos_per_gpu=32, + workers_per_gpu=4, + train=dict( + type=dataset_type, + ann_file=ann_file_train, + data_prefix=data_root, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=ann_file_val, + data_prefix=data_root_val, + pipeline=val_pipeline), + test=dict( + type=dataset_type, + ann_file=ann_file_test, + data_prefix=data_root_val, + 
pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='SGD', lr=0.00128, momentum=0.9, + weight_decay=0.0005) # this lr is used for 8 gpus +optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) +# learning policy +lr_config = dict(policy='step', step=[]) +total_epochs = 75 +checkpoint_config = dict(interval=5) +evaluation = dict( + interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5)) +log_config = dict( + interval=20, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook'), + ]) +# runtime settings +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = f'./work_dirs/tsn_r50_1x1x3_75e_ucf101_split_{split}_rgb/' +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/mmaction/datasets/pipelines/__init__.py b/mmaction/datasets/pipelines/__init__.py index 9b58a404a1..bab313b05e 100644 --- a/mmaction/datasets/pipelines/__init__.py +++ b/mmaction/datasets/pipelines/__init__.py @@ -15,7 +15,7 @@ LoadAudioFeature, LoadHVULabel, LoadLocalizationFeature, LoadProposals, OpenCVDecode, OpenCVInit, PyAVDecode, PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames, - SampleProposalFrames, UntrimmedSampleFrames) + SampleProposalFrames, UntrimmedSampleFrames, PyAVDecodeMotionVector) __all__ = [ 'SampleFrames', 'PyAVDecode', 'DecordDecode', 'DenseSampleFrames', @@ -31,5 +31,5 @@ 'FormatAudioShape', 'LoadAudioFeature', 'AudioFeatureSelector', 'AudioDecodeInit', 'EntityBoxPad', 'EntityBoxFlip', 'EntityBoxCrop', 'EntityBoxRescale', 'EntityBoxClip', 'RandomScale', 'ImageDecode', - 'BuildPseudoClip', 'RandomRescale' + 'BuildPseudoClip', 'RandomRescale', 'PyAVDecodeMotionVector' ] diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index e0197860ce..8fe7d55be9 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -756,7 +756,7 @@ def __repr__(self): @PIPELINES.register_module() -class PyAVDecode: 
+class PyAVDecodeMotionVector(object): """Using pyav to decode the video. PyAV: https://github.com/mikeboers/PyAV @@ -772,6 +772,26 @@ class PyAVDecode: def __init__(self, multi_thread=False): self.multi_thread = multi_thread + def parse_vectors(self, mv, vectors, height, width): + """Parse the returned vector.""" + (w, h, src_x, src_y, dst_x, + dst_y) = (vectors['w'], vectors['h'], vectors['src_x'], + vectors['src_y'], vectors['dst_x'], vectors['dst_y']) + val_x = dst_x - src_x + val_y = dst_y - src_y + start_x = (-1 * w / 2).astype(np.int8) + dst_x + start_y = (-1 * h / 2).astype(np.int8) + dst_y + end_x = start_x + w.astype(np.int8) + end_y = start_y + h.astype(np.int8) + for row in range(len(vectors)): + if (start_x[row] >= 0 and end_x[row] < width and start_y[row] >= 0 + and end_y[row] < height): + mv[start_y[row]:end_y[row], start_x[row]:end_x[row], + 0] = val_x[row] + mv[start_y[row]:end_y[row], start_x[row]:end_x[row], + 1] = val_y[row] + return mv + def __call__(self, results): """Perform the PyAV decoding. 
@@ -790,22 +810,38 @@ def __call__(self, results): # set max indice to make early stop max_inds = max(results['frame_inds']) i = 0 - for frame in container.decode(video=0): - if i > max_inds + 1: - break - imgs.append(frame.to_rgb().to_ndarray()) - i += 1 + stream = container.streams.video[0] + codec_context = stream.codec_context + codec_context.options = {'flags2': '+export_mvs'} + # import imageio + for packet in container.demux(stream): + for frame in packet.decode(): + if i > max_inds + 1: + break + i += 1 + height = frame.height + width = frame.width + mv = np.zeros((height, width, 2), dtype=np.int8) + vectors = frame.side_data.get('MOTION_VECTORS') + if frame.key_frame: + # Key frame don't have motion vectors + assert vectors is None + else: + assert len(vectors) > 0 + mv = self.parse_vectors(mv, vectors.to_ndarray(), height, + width) + # imageio.imwrite(f'tmp/gray/{i}.jpg', + # np.abs(mv[:, :, 0]) + np.abs(mv[:, :, 1])) + imgs.append(mv) results['video_reader'] = None del container # the available frame in pyav may be less than its length, # which may raise error - results['imgs'] = [imgs[i % len(imgs)] for i in results['frame_inds']] - - results['original_shape'] = imgs[0].shape[:2] - results['img_shape'] = imgs[0].shape[:2] - + results['mvs'] = np.array( + [imgs[i % len(imgs)] for i in results['frame_inds']]) + print(np.max(results['mvs'])) return results def __repr__(self): @@ -814,6 +850,7 @@ def __repr__(self): return repr_str +@PIPELINES.register_module() class FFmpegDecodeMotionVector(object): """ Ref: @@ -858,13 +895,70 @@ def __call__(self, results): clip_mvs.append(mv) motion_vectors.append(clip_mvs) - results['motion_vectors'] = motion_vectors + results['motion_vectors'] = np.array(motion_vectors) + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(residual={self.residual}' + repr_str += f'gop_size={self.gop_size})' + return repr_str + + +class PyAVDecode(object): + """Using pyav to decode the video. 
+ + PyAV: https://github.com/mikeboers/PyAV + Required keys are "video_reader" and "frame_inds", + added or modified keys are "imgs", "img_shape" and "original_shape". + Args: + multi_thread (bool): If set to True, it will apply multi + thread processing. Default: False. + """ + + def __init__(self, multi_thread=False): + self.multi_thread = multi_thread + + def __call__(self, results): + """Perform the PyAV decoding. + + Args: + results (dict): The resulting dict to be modified and passed + to the next transform in pipeline. + """ + container = results['video_reader'] + imgs = list() + + if self.multi_thread: + container.streams.video[0].thread_type = 'AUTO' + if results['frame_inds'].ndim != 1: + results['frame_inds'] = np.squeeze(results['frame_inds']) + + # set max indice to make early stop + max_inds = max(results['frame_inds']) + i = 0 + for frame in container.decode(video=0): + if i > max_inds + 1: + break + imgs.append(frame.to_rgb().to_ndarray()) + i += 1 + + results['video_reader'] = None + del container + + # the available frame in pyav may be less than its length, + # which may raise error + results['imgs'] = [imgs[i % len(imgs)] for i in results['frame_inds']] + + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] return results def __repr__(self): repr_str = self.__class__.__name__ - repr_str += f'(residual={self.residual})' + repr_str += f'(multi_thread={self.multi_thread})' return repr_str diff --git a/tests/test_data/test_loading.py b/tests/test_data/test_loading.py index 03b43d6249..8284bd15f1 100644 --- a/tests/test_data/test_loading.py +++ b/tests/test_data/test_loading.py @@ -19,7 +19,7 @@ OpenCVInit, PyAVDecode, PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames, SampleProposalFrames, - UntrimmedSampleFrames) + UntrimmedSampleFrames, PyAVDecodeMotionVector) # yapf: enable @@ -1466,3 +1466,17 @@ def test_audio_feature_selector(self): assert repr(audio_feature_selector) == ( 
f'{audio_feature_selector.__class__.__name__}(' f'fix_length={128})') + + def test_pyav_decode_motion_vector(self): + pyav_init = PyAVInit() + pyav = PyAVDecodeMotionVector() + + # test pyav + results = { + 'filename': + osp.join(osp.dirname(osp.dirname(__file__)), 'data/test.mp4'), + 'frame_inds': + np.arange(0, 32, 1)[:, np.newaxis] + } + results = pyav_init(results) + results = pyav(results) From 247406d5fc6641b100e907619b159585406d82bd Mon Sep 17 00:00:00 2001 From: xusu Date: Wed, 18 Nov 2020 15:59:49 +0800 Subject: [PATCH 03/12] Remove FFMpegDecodeMotionVector and fix review issues. Minor. --- ...50_1x1x3_75e_kinetics400_motion_vector.py} | 63 +++--- mmaction/datasets/pipelines/loading.py | 179 ++++++------------ tests/test_data/test_loading.py | 11 +- 3 files changed, 89 insertions(+), 164 deletions(-) rename configs/recognition_motion_vector/{tsn_r50_1x1x3_75e_ucf101_motion_vector.py => tsn_r50_1x1x3_75e_kinetics400_motion_vector.py} (65%) diff --git a/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py b/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py similarity index 65% rename from configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py rename to configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py index 9edc45bb2b..ab9ea9fa45 100644 --- a/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py +++ b/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py @@ -19,19 +19,20 @@ train_cfg = None test_cfg = dict(average_clips=None) # dataset settings -dataset_type = 'RawframeDataset' -data_root = 'data/ucf101/rawframes/' -data_root_val = 'data/ucf101/rawframes/' -split = 1 # official train/test splits. 
valid numbers: 1, 2, 3 -ann_file_train = f'data/ucf101/ucf101_train_split_{split}_rawframes.txt' -ann_file_val = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt' -ann_file_test = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt' +dataset_type = 'VideoDataset' +data_root = 'data/kinetics400/videos_fast/' +data_root_val = 'data/kinetics400/videos_fast/' +ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' +ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' +ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ dict(type='PyAVInit'), - dict(type='PyAVDecodeMotionVector'), - dict(type='Resize', scale=(-1, 16)), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3), + dict(type='PyAVDecodeMotionVector', multi_thread=True), + # dict(type='PyAVDecode'), + # dict(type='Resize', scale=(-1, 16)), dict(type='CenterCrop', crop_size=16), dict(type='Flip', flip_ratio=0.5), dict(type='FormatShape', input_format='NCHW'), @@ -39,40 +40,32 @@ dict(type='ToTensor', keys=['imgs', 'label']) ] val_pipeline = [ - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=3, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='CenterCrop', crop_size=256), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), + dict(type='PyAVInit'), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3), + dict(type='PyAVDecodeMotionVector'), + # dict(type='PyAVDecode'), + # dict(type='Resize', scale=(-1, 16)), + dict(type='CenterCrop', crop_size=16), + dict(type='Flip', flip_ratio=0.5), dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), + dict(type='Collect', keys=['imgs'], meta_keys=[]), dict(type='ToTensor', keys=['imgs']) ] test_pipeline = [ - dict( - 
type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=25, - test_mode=True), - dict(type='RawFrameDecode'), - dict(type='Resize', scale=(-1, 256)), - dict(type='ThreeCrop', crop_size=256), - dict(type='Flip', flip_ratio=0), - dict(type='Normalize', **img_norm_cfg), + dict(type='PyAVInit'), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3), + dict(type='PyAVDecodeMotionVector'), + # dict(type='PyAVDecode'), + # dict(type='Resize', scale=(-1, 16)), + dict(type='CenterCrop', crop_size=16), + dict(type='Flip', flip_ratio=0.5), dict(type='FormatShape', input_format='NCHW'), dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), dict(type='ToTensor', keys=['imgs']) ] data = dict( - videos_per_gpu=32, - workers_per_gpu=4, + videos_per_gpu=1, + workers_per_gpu=0, train=dict( type=dataset_type, ann_file=ann_file_train, @@ -108,7 +101,7 @@ # runtime settings dist_params = dict(backend='nccl') log_level = 'INFO' -work_dir = f'./work_dirs/tsn_r50_1x1x3_75e_ucf101_split_{split}_rgb/' +work_dir = 'word_dirs/tsn_r50_1x1x3_75e_kinetics400_motion_vector' load_from = None resume_from = None workflow = [('train', 1)] diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index 8fe7d55be9..6e7912efca 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -756,14 +756,12 @@ def __repr__(self): @PIPELINES.register_module() -class PyAVDecodeMotionVector(object): +class PyAVDecode: """Using pyav to decode the video. PyAV: https://github.com/mikeboers/PyAV - Required keys are "video_reader" and "frame_inds", added or modified keys are "imgs", "img_shape" and "original_shape". - Args: multi_thread (bool): If set to True, it will apply multi thread processing. Default: False. 
@@ -772,26 +770,6 @@ class PyAVDecodeMotionVector(object): def __init__(self, multi_thread=False): self.multi_thread = multi_thread - def parse_vectors(self, mv, vectors, height, width): - """Parse the returned vector.""" - (w, h, src_x, src_y, dst_x, - dst_y) = (vectors['w'], vectors['h'], vectors['src_x'], - vectors['src_y'], vectors['dst_x'], vectors['dst_y']) - val_x = dst_x - src_x - val_y = dst_y - src_y - start_x = (-1 * w / 2).astype(np.int8) + dst_x - start_y = (-1 * h / 2).astype(np.int8) + dst_y - end_x = start_x + w.astype(np.int8) - end_y = start_y + h.astype(np.int8) - for row in range(len(vectors)): - if (start_x[row] >= 0 and end_x[row] < width and start_y[row] >= 0 - and end_y[row] < height): - mv[start_y[row]:end_y[row], start_x[row]:end_x[row], - 0] = val_x[row] - mv[start_y[row]:end_y[row], start_x[row]:end_x[row], - 1] = val_y[row] - return mv - def __call__(self, results): """Perform the PyAV decoding. @@ -810,115 +788,63 @@ def __call__(self, results): # set max indice to make early stop max_inds = max(results['frame_inds']) i = 0 - stream = container.streams.video[0] - codec_context = stream.codec_context - codec_context.options = {'flags2': '+export_mvs'} - # import imageio - for packet in container.demux(stream): - for frame in packet.decode(): - if i > max_inds + 1: - break - i += 1 - height = frame.height - width = frame.width - mv = np.zeros((height, width, 2), dtype=np.int8) - vectors = frame.side_data.get('MOTION_VECTORS') - if frame.key_frame: - # Key frame don't have motion vectors - assert vectors is None - else: - assert len(vectors) > 0 - mv = self.parse_vectors(mv, vectors.to_ndarray(), height, - width) - # imageio.imwrite(f'tmp/gray/{i}.jpg', - # np.abs(mv[:, :, 0]) + np.abs(mv[:, :, 1])) - imgs.append(mv) + for frame in container.decode(video=0): + if i > max_inds + 1: + break + imgs.append(frame.to_rgb().to_ndarray()) + i += 1 results['video_reader'] = None del container # the available frame in pyav may be less than its 
length, # which may raise error - results['mvs'] = np.array( - [imgs[i % len(imgs)] for i in results['frame_inds']]) - print(np.max(results['mvs'])) - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(multi_thread={self.multi_thread})' - return repr_str - - -@PIPELINES.register_module() -class FFmpegDecodeMotionVector(object): - """ - Ref: - https://github.com/chaoyuaw/pytorch-coviar. - """ - - def __init__(self, residual=False, gop_size=16): - self.residual = residual - self.gop_size = gop_size - - def _clip_and_scale(self, img, size): - return (img * (127.5 / size) + 128).astype(np.int32) + results['imgs'] = [imgs[i % len(imgs)] for i in results['frame_inds']] - def __call__(self, results): - try: - import coviar.load - except ImportError: - print('Please install coviar first.') - if results['frame_inds'].ndim != 1: - results['frame_inds'] = np.squeeze(results['frame_inds']) - frame_inds = results['frame_inds'] - video_path = results['filename'] - motion_vectors = list() - for clip_idx in range(len(frame_inds)): - clip_mvs = list() - for frame_idx in frame_inds[clip_idx]: - gop_idx = frame_idx // self.gop_size - gop_pos = frame_idx % self.gop_size - if gop_pos == 0: - gop_idx = -1 - gop_pos = self.gop_size - mv = coviar.load(video_path, gop_idx, gop_pos, - 2 if self.residual else 1, False) - - if mv is None: - mv = np.zeros((256, 256, 2)) - else: - # FIXME: 20 is the scale of mv, should be tested, - # but it's ok as long as it is a linearly proportional - mv = self._clip_and_scale(mv, 20) - mv = (np.minimum(np.maximum(mv, 0), 255)).astype(np.uint8) - clip_mvs.append(mv) - motion_vectors.append(clip_mvs) - - results['motion_vectors'] = np.array(motion_vectors) + results['original_shape'] = imgs[0].shape[:2] + results['img_shape'] = imgs[0].shape[:2] return results def __repr__(self): repr_str = self.__class__.__name__ - repr_str += f'(residual={self.residual}' - repr_str += f'gop_size={self.gop_size})' + repr_str += 
f'(multi_thread={self.multi_thread})' return repr_str -class PyAVDecode(object): - """Using pyav to decode the video. +@PIPELINES.register_module() +class PyAVDecodeMotionVector(PyAVDecode): + """Using pyav to decode the motion vectors from video. PyAV: https://github.com/mikeboers/PyAV + Required keys are "video_reader" and "frame_inds", added or modified keys are "imgs", "img_shape" and "original_shape". + Args: multi_thread (bool): If set to True, it will apply multi thread processing. Default: False. """ - def __init__(self, multi_thread=False): - self.multi_thread = multi_thread + def _parse_vectors(self, mv, vectors, height, width): + """Parse the returned vector.""" + (w, h, src_x, src_y, dst_x, + dst_y) = (vectors['w'], vectors['h'], vectors['src_x'], + vectors['src_y'], vectors['dst_x'], vectors['dst_y']) + val_x = dst_x - src_x + val_y = dst_y - src_y + start_x = (-1 * w / 2).astype(np.int8) + dst_x + start_y = (-1 * h / 2).astype(np.int8) + dst_y + end_x = start_x + w.astype(np.int8) + end_y = start_y + h.astype(np.int8) + for row in range(len(vectors)): + if (start_x[row] >= 0 and end_x[row] < width and start_y[row] >= 0 + and end_y[row] < height): + mv[start_y[row]:end_y[row], start_x[row]:end_x[row], + 0] = val_x[row] + mv[start_y[row]:end_y[row], start_x[row]:end_x[row], + 1] = val_y[row] + return mv def __call__(self, results): """Perform the PyAV decoding. 
@@ -938,29 +864,36 @@ def __call__(self, results): # set max indice to make early stop max_inds = max(results['frame_inds']) i = 0 - for frame in container.decode(video=0): - if i > max_inds + 1: - break - imgs.append(frame.to_rgb().to_ndarray()) - i += 1 + stream = container.streams.video[0] + codec_context = stream.codec_context + codec_context.options = {'flags2': '+export_mvs'} + # import imageio + for packet in container.demux(stream): + for frame in packet.decode(): + if i > max_inds + 1: + break + i += 1 + height = frame.height + width = frame.width + mv = np.zeros((height, width, 2), dtype=np.int8) + vectors = frame.side_data.get('MOTION_VECTORS') + if frame.key_frame: + # Key frame don't have motion vectors + assert vectors is None + if vectors is not None and len(vectors) > 0: + mv = self._parse_vectors(mv, vectors.to_ndarray(), height, + width) + imgs.append(mv) results['video_reader'] = None del container # the available frame in pyav may be less than its length, # which may raise error - results['imgs'] = [imgs[i % len(imgs)] for i in results['frame_inds']] - - results['original_shape'] = imgs[0].shape[:2] - results['img_shape'] = imgs[0].shape[:2] - + results['imgs'] = np.array( + [imgs[i % len(imgs)] for i in results['frame_inds']]) return results - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(multi_thread={self.multi_thread})' - return repr_str - @PIPELINES.register_module() class DecordInit: diff --git a/tests/test_data/test_loading.py b/tests/test_data/test_loading.py index 8284bd15f1..d1979997c8 100644 --- a/tests/test_data/test_loading.py +++ b/tests/test_data/test_loading.py @@ -16,10 +16,11 @@ LoadAudioFeature, LoadHVULabel, LoadLocalizationFeature, LoadProposals, OpenCVDecode, - OpenCVInit, PyAVDecode, PyAVInit, + OpenCVInit, PyAVDecode, + PyAVDecodeMotionVector, PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames, SampleProposalFrames, - UntrimmedSampleFrames, PyAVDecodeMotionVector) + UntrimmedSampleFrames) 
# yapf: enable @@ -1473,10 +1474,8 @@ def test_pyav_decode_motion_vector(self): # test pyav results = { - 'filename': - osp.join(osp.dirname(osp.dirname(__file__)), 'data/test.mp4'), - 'frame_inds': - np.arange(0, 32, 1)[:, np.newaxis] + 'filename': self.video_path, + 'frame_inds': np.arange(0, 32, 1)[:, None] } results = pyav_init(results) results = pyav(results) From ee0a76a467ee73d866c350780d2ab2c11c541e0b Mon Sep 17 00:00:00 2001 From: xusu Date: Wed, 18 Nov 2020 16:04:45 +0800 Subject: [PATCH 04/12] Minor. --- ...r50_1x1x3_75e_kinetics400_motion_vector.py | 107 ------------------ mmaction/datasets/pipelines/loading.py | 4 +- tests/test_data/test_loading.py | 2 + 3 files changed, 4 insertions(+), 109 deletions(-) delete mode 100644 configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py diff --git a/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py b/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py deleted file mode 100644 index ab9ea9fa45..0000000000 --- a/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py +++ /dev/null @@ -1,107 +0,0 @@ -# model settings -model = dict( - type='Recognizer2D', - backbone=dict( - type='ResNet', - pretrained='torchvision://resnet50', - in_channels=2, - depth=18, - norm_eval=False), - cls_head=dict( - type='TSNHead', - num_classes=101, - in_channels=512, - spatial_type='avg', - consensus=dict(type='AvgConsensus', dim=1), - dropout_ratio=0.4, - init_std=0.001)) -# model training and testing settings -train_cfg = None -test_cfg = dict(average_clips=None) -# dataset settings -dataset_type = 'VideoDataset' -data_root = 'data/kinetics400/videos_fast/' -data_root_val = 'data/kinetics400/videos_fast/' -ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt' -ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt' -ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt' -img_norm_cfg = 
dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) -train_pipeline = [ - dict(type='PyAVInit'), - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3), - dict(type='PyAVDecodeMotionVector', multi_thread=True), - # dict(type='PyAVDecode'), - # dict(type='Resize', scale=(-1, 16)), - dict(type='CenterCrop', crop_size=16), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs', 'label']) -] -val_pipeline = [ - dict(type='PyAVInit'), - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3), - dict(type='PyAVDecodeMotionVector'), - # dict(type='PyAVDecode'), - # dict(type='Resize', scale=(-1, 16)), - dict(type='CenterCrop', crop_size=16), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -test_pipeline = [ - dict(type='PyAVInit'), - dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3), - dict(type='PyAVDecodeMotionVector'), - # dict(type='PyAVDecode'), - # dict(type='Resize', scale=(-1, 16)), - dict(type='CenterCrop', crop_size=16), - dict(type='Flip', flip_ratio=0.5), - dict(type='FormatShape', input_format='NCHW'), - dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]), - dict(type='ToTensor', keys=['imgs']) -] -data = dict( - videos_per_gpu=1, - workers_per_gpu=0, - train=dict( - type=dataset_type, - ann_file=ann_file_train, - data_prefix=data_root, - pipeline=train_pipeline), - val=dict( - type=dataset_type, - ann_file=ann_file_val, - data_prefix=data_root_val, - pipeline=val_pipeline), - test=dict( - type=dataset_type, - ann_file=ann_file_test, - data_prefix=data_root_val, - pipeline=test_pipeline)) -# optimizer -optimizer = dict( - type='SGD', lr=0.00128, momentum=0.9, - weight_decay=0.0005) # this lr is used for 8 gpus 
-optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2)) -# learning policy -lr_config = dict(policy='step', step=[]) -total_epochs = 75 -checkpoint_config = dict(interval=5) -evaluation = dict( - interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5)) -log_config = dict( - interval=20, - hooks=[ - dict(type='TextLoggerHook'), - # dict(type='TensorboardLoggerHook'), - ]) -# runtime settings -dist_params = dict(backend='nccl') -log_level = 'INFO' -work_dir = 'word_dirs/tsn_r50_1x1x3_75e_kinetics400_motion_vector' -load_from = None -resume_from = None -workflow = [('train', 1)] diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index 6e7912efca..8133db41e4 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -819,7 +819,7 @@ class PyAVDecodeMotionVector(PyAVDecode): PyAV: https://github.com/mikeboers/PyAV Required keys are "video_reader" and "frame_inds", - added or modified keys are "imgs", "img_shape" and "original_shape". + added or modified keys are "mvs". Args: multi_thread (bool): If set to True, it will apply multi @@ -890,7 +890,7 @@ def __call__(self, results): # the available frame in pyav may be less than its length, # which may raise error - results['imgs'] = np.array( + results['mvs'] = np.array( [imgs[i % len(imgs)] for i in results['frame_inds']]) return results diff --git a/tests/test_data/test_loading.py b/tests/test_data/test_loading.py index d1979997c8..c69c20235f 100644 --- a/tests/test_data/test_loading.py +++ b/tests/test_data/test_loading.py @@ -1479,3 +1479,5 @@ def test_pyav_decode_motion_vector(self): } results = pyav_init(results) results = pyav(results) + target_keys = ['mvs'] + assert self.check_keys_contain(results.keys(), target_keys) From f8ed428e13aac7e4430a667f6e679229626219b2 Mon Sep 17 00:00:00 2001 From: xusu Date: Wed, 18 Nov 2020 16:07:56 +0800 Subject: [PATCH 05/12] Minor fix. 
--- mmaction/datasets/pipelines/__init__.py | 5 +++-- mmaction/datasets/pipelines/loading.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mmaction/datasets/pipelines/__init__.py b/mmaction/datasets/pipelines/__init__.py index bab313b05e..ae3cd1a257 100644 --- a/mmaction/datasets/pipelines/__init__.py +++ b/mmaction/datasets/pipelines/__init__.py @@ -14,8 +14,9 @@ GenerateLocalizationLabels, ImageDecode, LoadAudioFeature, LoadHVULabel, LoadLocalizationFeature, LoadProposals, OpenCVDecode, OpenCVInit, PyAVDecode, - PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames, - SampleProposalFrames, UntrimmedSampleFrames, PyAVDecodeMotionVector) + PyAVDecodeMotionVector, PyAVInit, RawFrameDecode, + SampleAVAFrames, SampleFrames, SampleProposalFrames, + UntrimmedSampleFrames) __all__ = [ 'SampleFrames', 'PyAVDecode', 'DecordDecode', 'DenseSampleFrames', diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index 8133db41e4..ed43c7f5fc 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -760,8 +760,10 @@ class PyAVDecode: """Using pyav to decode the video. PyAV: https://github.com/mikeboers/PyAV + Required keys are "video_reader" and "frame_inds", added or modified keys are "imgs", "img_shape" and "original_shape". + Args: multi_thread (bool): If set to True, it will apply multi thread processing. Default: False. From bb468321c80d3c47da98ebdffade5dbd99da8991 Mon Sep 17 00:00:00 2001 From: xusu Date: Tue, 24 Nov 2020 17:05:04 +0800 Subject: [PATCH 06/12] Minor fix and change changelog. 
--- docs/changelog.md | 2 +- mmaction/datasets/pipelines/loading.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 1db2de4b26..1ab50763e4 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -9,7 +9,7 @@ - Support GYM99 data preparation ([#331](https://github.com/open-mmlab/mmaction2/pull/331)) - Add GradCAM utils for recognizer ([#324](https://github.com/open-mmlab/mmaction2/pull/324)) - Add print config script ([#345](https://github.com/open-mmlab/mmaction2/pull/345)) - +- Add online motion vector decoder ([#291](https://github.com/open-mmlab/mmaction2/pull/291)) **Improvements** - Support PyTorch 1.7 in CI ([#312](https://github.com/open-mmlab/mmaction2/pull/312)) - Support to predict different labels in a long video ([#274](https://github.com/open-mmlab/mmaction2/pull/274)) diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index ed43c7f5fc..416ab66b31 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -842,10 +842,9 @@ def _parse_vectors(self, mv, vectors, height, width): for row in range(len(vectors)): if (start_x[row] >= 0 and end_x[row] < width and start_y[row] >= 0 and end_y[row] < height): - mv[start_y[row]:end_y[row], start_x[row]:end_x[row], - 0] = val_x[row] - mv[start_y[row]:end_y[row], start_x[row]:end_x[row], - 1] = val_y[row] + mv[start_y[row]:end_y[row], + start_x[row]:end_x[row]] = (val_x[row], val_y[row]) + return mv def __call__(self, results): From d8fb864f356100d03706b8f7e2ef166eec927d73 Mon Sep 17 00:00:00 2001 From: xusu Date: Tue, 24 Nov 2020 17:21:34 +0800 Subject: [PATCH 07/12] Minor fix for pylint.
--- mmaction/datasets/pipelines/loading.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index 416ab66b31..23b781f4d0 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -829,7 +829,7 @@ class PyAVDecodeMotionVector(PyAVDecode): """ def _parse_vectors(self, mv, vectors, height, width): - """Parse the returned vector.""" + """Parse the returned vectors.""" (w, h, src_x, src_y, dst_x, dst_y) = (vectors['w'], vectors['h'], vectors['src_x'], vectors['src_y'], vectors['dst_x'], vectors['dst_y']) @@ -839,7 +839,7 @@ def _parse_vectors(self, mv, vectors, height, width): start_y = (-1 * h / 2).astype(np.int8) + dst_y end_x = start_x + w.astype(np.int8) end_y = start_y + h.astype(np.int8) - for row in range(len(vectors)): + for row, _ in enumerate(vectors): if (start_x[row] >= 0 and end_x[row] < width and start_y[row] >= 0 and end_y[row] < height): mv[start_y[row]:end_y[row], @@ -848,7 +848,7 @@ def _parse_vectors(self, mv, vectors, height, width): return mv def __call__(self, results): - """Perform the PyAV decoding. + """Perform the PyAV motion vector decoding. Args: results (dict): The resulting dict to be modified and passed From 1ea8e76c9a43118dc625f11951b625a5f55811f6 Mon Sep 17 00:00:00 2001 From: xusu Date: Sat, 28 Nov 2020 21:14:25 +0800 Subject: [PATCH 08/12] Rename the field mvs. Minor fix changelog. 
--- docs/changelog.md | 1 + mmaction/datasets/pipelines/loading.py | 4 ++-- tests/test_data/test_loading.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 1ab50763e4..565c24e0a8 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -10,6 +10,7 @@ - Add GradCAM utils for recognizer ([#324](https://github.com/open-mmlab/mmaction2/pull/324)) - Add print config script ([#345](https://github.com/open-mmlab/mmaction2/pull/345)) - Add online motion vector decoder ([#291](https://github.com/open-mmlab/mmaction2/pull/291)) + **Improvements** - Support PyTorch 1.7 in CI ([#312](https://github.com/open-mmlab/mmaction2/pull/312)) - Support to predict different labels in a long video ([#274](https://github.com/open-mmlab/mmaction2/pull/274)) diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index 23b781f4d0..0deac1f60b 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -821,7 +821,7 @@ class PyAVDecodeMotionVector(PyAVDecode): PyAV: https://github.com/mikeboers/PyAV Required keys are "video_reader" and "frame_inds", - added or modified keys are "mvs". + added or modified keys are "motion_vectors". 
Args: multi_thread (bool): If set to True, it will apply multi @@ -891,7 +891,7 @@ def __call__(self, results): # the available frame in pyav may be less than its length, # which may raise error - results['mvs'] = np.array( + results['motion_vectors'] = np.array( [imgs[i % len(imgs)] for i in results['frame_inds']]) return results diff --git a/tests/test_data/test_loading.py b/tests/test_data/test_loading.py index c69c20235f..f6d98bcc23 100644 --- a/tests/test_data/test_loading.py +++ b/tests/test_data/test_loading.py @@ -1479,5 +1479,5 @@ def test_pyav_decode_motion_vector(self): } results = pyav_init(results) results = pyav(results) - target_keys = ['mvs'] + target_keys = ['motion_vectors'] assert self.check_keys_contain(results.keys(), target_keys) From 1c4a7bf9066a2aa0c3d60bbd4c37e204cd9b3c55 Mon Sep 17 00:00:00 2001 From: xusu Date: Sat, 28 Nov 2020 21:25:30 +0800 Subject: [PATCH 09/12] More unittest. --- mmaction/datasets/pipelines/loading.py | 1 - tests/test_data/test_loading.py | 16 ++++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index 0deac1f60b..639fd6d6e8 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -868,7 +868,6 @@ def __call__(self, results): stream = container.streams.video[0] codec_context = stream.codec_context codec_context.options = {'flags2': '+export_mvs'} - # import imageio for packet in container.demux(stream): for frame in packet.decode(): if i > max_inds + 1: diff --git a/tests/test_data/test_loading.py b/tests/test_data/test_loading.py index f6d98bcc23..2b58bb4c57 100644 --- a/tests/test_data/test_loading.py +++ b/tests/test_data/test_loading.py @@ -1472,12 +1472,24 @@ def test_pyav_decode_motion_vector(self): pyav_init = PyAVInit() pyav = PyAVDecodeMotionVector() - # test pyav + # test pyav with 2-dim input results = { 'filename': self.video_path, - 'frame_inds': np.arange(0, 
32, 1)[:, None] + 'frame_inds': np.arange(0, 32, 1)[:, np.newaxis] } results = pyav_init(results) results = pyav(results) target_keys = ['motion_vectors'] assert self.check_keys_contain(results.keys(), target_keys) + + # test pyav with 1 dim input + results = { + 'filename': self.video_path, + 'frame_inds': np.arange(0, 32, 1) + } + pyav_init = PyAVInit() + results = pyav_init(results) + pyav = PyAVDecodeMotionVector() + results = pyav(results) + + assert self.check_keys_contain(results.keys(), target_keys) From 00fd89c65d9bf025c395df7d62872341c39e2123 Mon Sep 17 00:00:00 2001 From: xusu Date: Mon, 30 Nov 2020 20:24:23 +0800 Subject: [PATCH 10/12] Revised according to review. Fix typo. --- mmaction/datasets/pipelines/loading.py | 28 +++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index 639fd6d6e8..5681b81c77 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -818,10 +818,11 @@ def __repr__(self): class PyAVDecodeMotionVector(PyAVDecode): """Using pyav to decode the motion vectors from video. - PyAV: https://github.com/mikeboers/PyAV + Reference: https://github.com/PyAV-Org/PyAV/ + blob/main/tests/test_decode.py Required keys are "video_reader" and "frame_inds", - added or modified keys are "motion_vectors". + added or modified keys are "motion_vectors", "frame_inds". 
Args: multi_thread (bool): If set to True, it will apply multi @@ -835,15 +836,14 @@ def _parse_vectors(self, mv, vectors, height, width): vectors['src_y'], vectors['dst_x'], vectors['dst_y']) val_x = dst_x - src_x val_y = dst_y - src_y - start_x = (-1 * w / 2).astype(np.int8) + dst_x - start_y = (-1 * h / 2).astype(np.int8) + dst_y - end_x = start_x + w.astype(np.int8) - end_y = start_y + h.astype(np.int8) - for row, _ in enumerate(vectors): - if (start_x[row] >= 0 and end_x[row] < width and start_y[row] >= 0 - and end_y[row] < height): - mv[start_y[row]:end_y[row], - start_x[row]:end_x[row]] = (val_x[row], val_y[row]) + start_x = -1 * w // 2 + dst_x + start_y = -1 * h // 2 + dst_y + end_x = start_x + w + end_y = start_y + h + for sx, ex, sy, ey, vx, vy in zip(start_x, end_x, start_y, end_y, + val_x, val_y): + if (sx >= 0 and ex < width and sy >= 0 and ey < height): + mv[sy:ey, sx:ex] = (vx, vy) return mv @@ -862,15 +862,15 @@ def __call__(self, results): if results['frame_inds'].ndim != 1: results['frame_inds'] = np.squeeze(results['frame_inds']) - # set max indice to make early stop - max_inds = max(results['frame_inds']) + # set max index to make early stop + max_idx = max(results['frame_inds']) i = 0 stream = container.streams.video[0] codec_context = stream.codec_context codec_context.options = {'flags2': '+export_mvs'} for packet in container.demux(stream): for frame in packet.decode(): - if i > max_inds + 1: + if i > max_idx + 1: break i += 1 height = frame.height From 2eead76c8d46cad1370f12ccfad369c0c1a2e5f7 Mon Sep 17 00:00:00 2001 From: xusu Date: Mon, 30 Nov 2020 21:03:34 +0800 Subject: [PATCH 11/12] Minor fix. 
--- mmaction/datasets/pipelines/loading.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index 5681b81c77..15e88e1bd6 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -836,8 +836,8 @@ def _parse_vectors(self, mv, vectors, height, width): vectors['src_y'], vectors['dst_x'], vectors['dst_y']) val_x = dst_x - src_x val_y = dst_y - src_y - start_x = -1 * w // 2 + dst_x - start_y = -1 * h // 2 + dst_y + start_x = dst_x - w // 2 + start_y = dst_y - h // 2 end_x = start_x + w end_y = start_y + h for sx, ex, sy, ey, vx, vy in zip(start_x, end_x, start_y, end_y, From f92ca9245b7074d47d744d996a4682bf4b3cda51 Mon Sep 17 00:00:00 2001 From: xusu Date: Mon, 30 Nov 2020 21:20:59 +0800 Subject: [PATCH 12/12] Make _parse_vectors a staticmethod. --- mmaction/datasets/pipelines/loading.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index 15e88e1bd6..631aec3d24 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -829,7 +829,8 @@ class PyAVDecodeMotionVector(PyAVDecode): thread processing. Default: False. """ - def _parse_vectors(self, mv, vectors, height, width): + @staticmethod + def _parse_vectors(mv, vectors, height, width): """Parse the returned vectors.""" (w, h, src_x, src_y, dst_x, dst_y) = (vectors['w'], vectors['h'], vectors['src_x'],