From 3a1eaeb68f95d2cad73d09d057304f0da3e9b130 Mon Sep 17 00:00:00 2001
From: xusu <xusu@sensetime.com>
Date: Wed, 28 Oct 2020 18:49:33 +0800
Subject: [PATCH 01/12] [Feature] Add motion decoder.

---
 mmaction/datasets/pipelines/loading.py | 54 ++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index cc27ed9315..e0197860ce 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -814,6 +814,60 @@ def __repr__(self):
         return repr_str
 
 
+class FFmpegDecodeMotionVector(object):
+    """
+        Ref:
+            https://github.com/chaoyuaw/pytorch-coviar.
+    """
+
+    def __init__(self, residual=False, gop_size=16):
+        self.residual = residual
+        self.gop_size = gop_size
+
+    def _clip_and_scale(self, img, size):
+        return (img * (127.5 / size) + 128).astype(np.int32)
+
+    def __call__(self, results):
+        try:
+            import coviar.load
+        except ImportError:
+            print('Please install coviar first.')
+        if results['frame_inds'].ndim != 1:
+            results['frame_inds'] = np.squeeze(results['frame_inds'])
+        frame_inds = results['frame_inds']
+        video_path = results['filename']
+        motion_vectors = list()
+        for clip_idx in range(len(frame_inds)):
+            clip_mvs = list()
+            for frame_idx in frame_inds[clip_idx]:
+                gop_idx = frame_idx // self.gop_size
+                gop_pos = frame_idx % self.gop_size
+                if gop_pos == 0:
+                    gop_idx = -1
+                    gop_pos = self.gop_size
+                mv = coviar.load(video_path, gop_idx, gop_pos,
+                                 2 if self.residual else 1, False)
+
+                if mv is None:
+                    mv = np.zeros((256, 256, 2))
+                else:
+                    # FIXME: 20 is the scale of mv, should be tested,
+                    # but it's ok as long as it is a linearly proportional
+                    mv = self._clip_and_scale(mv, 20)
+                    mv = (np.minimum(np.maximum(mv, 0), 255)).astype(np.uint8)
+                clip_mvs.append(mv)
+            motion_vectors.append(clip_mvs)
+
+        results['motion_vectors'] = motion_vectors
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(residual={self.residual})'
+        return repr_str
+
+
 @PIPELINES.register_module()
 class DecordInit:
     """Using decord to initialize the video_reader.

From aabf9f889d6053a10bf95e8c56700338a3d92b89 Mon Sep 17 00:00:00 2001
From: xusu <xusu@sensetime.com>
Date: Thu, 29 Oct 2020 19:01:59 +0800
Subject: [PATCH 02/12] Fix the bad double loop with numpy broadcast and slice.

---
 .../tsn_r50_1x1x3_75e_ucf101_motion_vector.py | 114 +++++++++++++++++
 mmaction/datasets/pipelines/__init__.py       |   4 +-
 mmaction/datasets/pipelines/loading.py        | 120 ++++++++++++++++--
 tests/test_data/test_loading.py               |  16 ++-
 4 files changed, 238 insertions(+), 16 deletions(-)
 create mode 100644 configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py

diff --git a/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py b/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py
new file mode 100644
index 0000000000..9edc45bb2b
--- /dev/null
+++ b/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py
@@ -0,0 +1,114 @@
+# model settings
+model = dict(
+    type='Recognizer2D',
+    backbone=dict(
+        type='ResNet',
+        pretrained='torchvision://resnet50',
+        in_channels=2,
+        depth=18,
+        norm_eval=False),
+    cls_head=dict(
+        type='TSNHead',
+        num_classes=101,
+        in_channels=512,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.4,
+        init_std=0.001))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips=None)
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/ucf101/rawframes/'
+data_root_val = 'data/ucf101/rawframes/'
+split = 1  # official train/test splits. valid numbers: 1, 2, 3
+ann_file_train = f'data/ucf101/ucf101_train_split_{split}_rawframes.txt'
+ann_file_val = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt'
+ann_file_test = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='PyAVInit'),
+    dict(type='PyAVDecodeMotionVector'),
+    dict(type='Resize', scale=(-1, 16)),
+    dict(type='CenterCrop', crop_size=16),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=3,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=25,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=32,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.00128, momentum=0.9,
+    weight_decay=0.0005)  # this lr is used for 8 gpus
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(policy='step', step=[])
+total_epochs = 75
+checkpoint_config = dict(interval=5)
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),
+    ])
+# runtime settings
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = f'./work_dirs/tsn_r50_1x1x3_75e_ucf101_split_{split}_rgb/'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/mmaction/datasets/pipelines/__init__.py b/mmaction/datasets/pipelines/__init__.py
index 9b58a404a1..bab313b05e 100644
--- a/mmaction/datasets/pipelines/__init__.py
+++ b/mmaction/datasets/pipelines/__init__.py
@@ -15,7 +15,7 @@
                       LoadAudioFeature, LoadHVULabel, LoadLocalizationFeature,
                       LoadProposals, OpenCVDecode, OpenCVInit, PyAVDecode,
                       PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames,
-                      SampleProposalFrames, UntrimmedSampleFrames)
+                      SampleProposalFrames, UntrimmedSampleFrames, PyAVDecodeMotionVector)
 
 __all__ = [
     'SampleFrames', 'PyAVDecode', 'DecordDecode', 'DenseSampleFrames',
@@ -31,5 +31,5 @@
     'FormatAudioShape', 'LoadAudioFeature', 'AudioFeatureSelector',
     'AudioDecodeInit', 'EntityBoxPad', 'EntityBoxFlip', 'EntityBoxCrop',
     'EntityBoxRescale', 'EntityBoxClip', 'RandomScale', 'ImageDecode',
-    'BuildPseudoClip', 'RandomRescale'
+    'BuildPseudoClip', 'RandomRescale', 'PyAVDecodeMotionVector'
 ]
diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index e0197860ce..8fe7d55be9 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -756,7 +756,7 @@ def __repr__(self):
 
 
 @PIPELINES.register_module()
-class PyAVDecode:
+class PyAVDecodeMotionVector(object):
     """Using pyav to decode the video.
 
     PyAV: https://github.com/mikeboers/PyAV
@@ -772,6 +772,26 @@ class PyAVDecode:
     def __init__(self, multi_thread=False):
         self.multi_thread = multi_thread
 
+    def parse_vectors(self, mv, vectors, height, width):
+        """Parse the returned vector."""
+        (w, h, src_x, src_y, dst_x,
+         dst_y) = (vectors['w'], vectors['h'], vectors['src_x'],
+                   vectors['src_y'], vectors['dst_x'], vectors['dst_y'])
+        val_x = dst_x - src_x
+        val_y = dst_y - src_y
+        start_x = (-1 * w / 2).astype(np.int8) + dst_x
+        start_y = (-1 * h / 2).astype(np.int8) + dst_y
+        end_x = start_x + w.astype(np.int8)
+        end_y = start_y + h.astype(np.int8)
+        for row in range(len(vectors)):
+            if (start_x[row] >= 0 and end_x[row] < width and start_y[row] >= 0
+                    and end_y[row] < height):
+                mv[start_y[row]:end_y[row], start_x[row]:end_x[row],
+                   0] = val_x[row]
+                mv[start_y[row]:end_y[row], start_x[row]:end_x[row],
+                   1] = val_y[row]
+        return mv
+
     def __call__(self, results):
         """Perform the PyAV decoding.
 
@@ -790,22 +810,38 @@ def __call__(self, results):
         # set max indice to make early stop
         max_inds = max(results['frame_inds'])
         i = 0
-        for frame in container.decode(video=0):
-            if i > max_inds + 1:
-                break
-            imgs.append(frame.to_rgb().to_ndarray())
-            i += 1
+        stream = container.streams.video[0]
+        codec_context = stream.codec_context
+        codec_context.options = {'flags2': '+export_mvs'}
+        # import imageio
+        for packet in container.demux(stream):
+            for frame in packet.decode():
+                if i > max_inds + 1:
+                    break
+                i += 1
+                height = frame.height
+                width = frame.width
+                mv = np.zeros((height, width, 2), dtype=np.int8)
+                vectors = frame.side_data.get('MOTION_VECTORS')
+                if frame.key_frame:
+                    # Key frame don't have motion vectors
+                    assert vectors is None
+                else:
+                    assert len(vectors) > 0
+                    mv = self.parse_vectors(mv, vectors.to_ndarray(), height,
+                                            width)
+                # imageio.imwrite(f'tmp/gray/{i}.jpg',
+                # np.abs(mv[:, :, 0]) + np.abs(mv[:, :, 1]))
+                imgs.append(mv)
 
         results['video_reader'] = None
         del container
 
         # the available frame in pyav may be less than its length,
         # which may raise error
-        results['imgs'] = [imgs[i % len(imgs)] for i in results['frame_inds']]
-
-        results['original_shape'] = imgs[0].shape[:2]
-        results['img_shape'] = imgs[0].shape[:2]
-
+        results['mvs'] = np.array(
+            [imgs[i % len(imgs)] for i in results['frame_inds']])
+        print(np.max(results['mvs']))
         return results
 
     def __repr__(self):
@@ -814,6 +850,7 @@ def __repr__(self):
         return repr_str
 
 
+@PIPELINES.register_module()
 class FFmpegDecodeMotionVector(object):
     """
         Ref:
@@ -858,13 +895,70 @@ def __call__(self, results):
                 clip_mvs.append(mv)
             motion_vectors.append(clip_mvs)
 
-        results['motion_vectors'] = motion_vectors
+        results['motion_vectors'] = np.array(motion_vectors)
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(residual={self.residual}'
+        repr_str += f'gop_size={self.gop_size})'
+        return repr_str
+
+
+class PyAVDecode(object):
+    """Using pyav to decode the video.
+
+    PyAV: https://github.com/mikeboers/PyAV
+    Required keys are "video_reader" and "frame_inds",
+    added or modified keys are "imgs", "img_shape" and "original_shape".
+    Args:
+        multi_thread (bool): If set to True, it will apply multi
+            thread processing. Default: False.
+    """
+
+    def __init__(self, multi_thread=False):
+        self.multi_thread = multi_thread
+
+    def __call__(self, results):
+        """Perform the PyAV decoding.
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in pipeline.
+        """
+        container = results['video_reader']
+        imgs = list()
+
+        if self.multi_thread:
+            container.streams.video[0].thread_type = 'AUTO'
+        if results['frame_inds'].ndim != 1:
+            results['frame_inds'] = np.squeeze(results['frame_inds'])
+
+        # set max indice to make early stop
+        max_inds = max(results['frame_inds'])
+        i = 0
+        for frame in container.decode(video=0):
+            if i > max_inds + 1:
+                break
+            imgs.append(frame.to_rgb().to_ndarray())
+            i += 1
+
+        results['video_reader'] = None
+        del container
+
+        # the available frame in pyav may be less than its length,
+        # which may raise error
+        results['imgs'] = [imgs[i % len(imgs)] for i in results['frame_inds']]
+
+        results['original_shape'] = imgs[0].shape[:2]
+        results['img_shape'] = imgs[0].shape[:2]
 
         return results
 
     def __repr__(self):
         repr_str = self.__class__.__name__
-        repr_str += f'(residual={self.residual})'
+        repr_str += f'(multi_thread={self.multi_thread})'
         return repr_str
 
 
diff --git a/tests/test_data/test_loading.py b/tests/test_data/test_loading.py
index 03b43d6249..8284bd15f1 100644
--- a/tests/test_data/test_loading.py
+++ b/tests/test_data/test_loading.py
@@ -19,7 +19,7 @@
                                          OpenCVInit, PyAVDecode, PyAVInit,
                                          RawFrameDecode, SampleAVAFrames,
                                          SampleFrames, SampleProposalFrames,
-                                         UntrimmedSampleFrames)
+                                         UntrimmedSampleFrames, PyAVDecodeMotionVector)
 
 # yapf: enable
 
@@ -1466,3 +1466,17 @@ def test_audio_feature_selector(self):
         assert repr(audio_feature_selector) == (
             f'{audio_feature_selector.__class__.__name__}('
             f'fix_length={128})')
+
+    def test_pyav_decode_motion_vector(self):
+        pyav_init = PyAVInit()
+        pyav = PyAVDecodeMotionVector()
+
+        # test pyav
+        results = {
+            'filename':
+            osp.join(osp.dirname(osp.dirname(__file__)), 'data/test.mp4'),
+            'frame_inds':
+            np.arange(0, 32, 1)[:, np.newaxis]
+        }
+        results = pyav_init(results)
+        results = pyav(results)

From 247406d5fc6641b100e907619b159585406d82bd Mon Sep 17 00:00:00 2001
From: xusu <xusu@sensetime.com>
Date: Wed, 18 Nov 2020 15:59:49 +0800
Subject: [PATCH 03/12] Remove FFmpegDecodeMotionVector and fix review issues.

Minor.
---
 ...50_1x1x3_75e_kinetics400_motion_vector.py} |  63 +++---
 mmaction/datasets/pipelines/loading.py        | 179 ++++++------------
 tests/test_data/test_loading.py               |  11 +-
 3 files changed, 89 insertions(+), 164 deletions(-)
 rename configs/recognition_motion_vector/{tsn_r50_1x1x3_75e_ucf101_motion_vector.py => tsn_r50_1x1x3_75e_kinetics400_motion_vector.py} (65%)

diff --git a/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py b/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py
similarity index 65%
rename from configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py
rename to configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py
index 9edc45bb2b..ab9ea9fa45 100644
--- a/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_ucf101_motion_vector.py
+++ b/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py
@@ -19,19 +19,20 @@
 train_cfg = None
 test_cfg = dict(average_clips=None)
 # dataset settings
-dataset_type = 'RawframeDataset'
-data_root = 'data/ucf101/rawframes/'
-data_root_val = 'data/ucf101/rawframes/'
-split = 1  # official train/test splits. valid numbers: 1, 2, 3
-ann_file_train = f'data/ucf101/ucf101_train_split_{split}_rawframes.txt'
-ann_file_val = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt'
-ann_file_test = f'data/ucf101/ucf101_val_split_{split}_rawframes.txt'
+dataset_type = 'VideoDataset'
+data_root = 'data/kinetics400/videos_fast/'
+data_root_val = 'data/kinetics400/videos_fast/'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
 img_norm_cfg = dict(
     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
 train_pipeline = [
     dict(type='PyAVInit'),
-    dict(type='PyAVDecodeMotionVector'),
-    dict(type='Resize', scale=(-1, 16)),
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
+    dict(type='PyAVDecodeMotionVector', multi_thread=True),
+    # dict(type='PyAVDecode'),
+    # dict(type='Resize', scale=(-1, 16)),
     dict(type='CenterCrop', crop_size=16),
     dict(type='Flip', flip_ratio=0.5),
     dict(type='FormatShape', input_format='NCHW'),
@@ -39,40 +40,32 @@
     dict(type='ToTensor', keys=['imgs', 'label'])
 ]
 val_pipeline = [
-    dict(
-        type='SampleFrames',
-        clip_len=1,
-        frame_interval=1,
-        num_clips=3,
-        test_mode=True),
-    dict(type='RawFrameDecode'),
-    dict(type='Resize', scale=(-1, 256)),
-    dict(type='CenterCrop', crop_size=256),
-    dict(type='Flip', flip_ratio=0),
-    dict(type='Normalize', **img_norm_cfg),
+    dict(type='PyAVInit'),
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
+    dict(type='PyAVDecodeMotionVector'),
+    # dict(type='PyAVDecode'),
+    # dict(type='Resize', scale=(-1, 16)),
+    dict(type='CenterCrop', crop_size=16),
+    dict(type='Flip', flip_ratio=0.5),
     dict(type='FormatShape', input_format='NCHW'),
-    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='Collect', keys=['imgs'], meta_keys=[]),
     dict(type='ToTensor', keys=['imgs'])
 ]
 test_pipeline = [
-    dict(
-        type='SampleFrames',
-        clip_len=1,
-        frame_interval=1,
-        num_clips=25,
-        test_mode=True),
-    dict(type='RawFrameDecode'),
-    dict(type='Resize', scale=(-1, 256)),
-    dict(type='ThreeCrop', crop_size=256),
-    dict(type='Flip', flip_ratio=0),
-    dict(type='Normalize', **img_norm_cfg),
+    dict(type='PyAVInit'),
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
+    dict(type='PyAVDecodeMotionVector'),
+    # dict(type='PyAVDecode'),
+    # dict(type='Resize', scale=(-1, 16)),
+    dict(type='CenterCrop', crop_size=16),
+    dict(type='Flip', flip_ratio=0.5),
     dict(type='FormatShape', input_format='NCHW'),
     dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
     dict(type='ToTensor', keys=['imgs'])
 ]
 data = dict(
-    videos_per_gpu=32,
-    workers_per_gpu=4,
+    videos_per_gpu=1,
+    workers_per_gpu=0,
     train=dict(
         type=dataset_type,
         ann_file=ann_file_train,
@@ -108,7 +101,7 @@
 # runtime settings
 dist_params = dict(backend='nccl')
 log_level = 'INFO'
-work_dir = f'./work_dirs/tsn_r50_1x1x3_75e_ucf101_split_{split}_rgb/'
+work_dir = 'word_dirs/tsn_r50_1x1x3_75e_kinetics400_motion_vector'
 load_from = None
 resume_from = None
 workflow = [('train', 1)]
diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index 8fe7d55be9..6e7912efca 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -756,14 +756,12 @@ def __repr__(self):
 
 
 @PIPELINES.register_module()
-class PyAVDecodeMotionVector(object):
+class PyAVDecode:
     """Using pyav to decode the video.
 
     PyAV: https://github.com/mikeboers/PyAV
-
     Required keys are "video_reader" and "frame_inds",
     added or modified keys are "imgs", "img_shape" and "original_shape".
-
     Args:
         multi_thread (bool): If set to True, it will apply multi
             thread processing. Default: False.
@@ -772,26 +770,6 @@ class PyAVDecodeMotionVector(object):
     def __init__(self, multi_thread=False):
         self.multi_thread = multi_thread
 
-    def parse_vectors(self, mv, vectors, height, width):
-        """Parse the returned vector."""
-        (w, h, src_x, src_y, dst_x,
-         dst_y) = (vectors['w'], vectors['h'], vectors['src_x'],
-                   vectors['src_y'], vectors['dst_x'], vectors['dst_y'])
-        val_x = dst_x - src_x
-        val_y = dst_y - src_y
-        start_x = (-1 * w / 2).astype(np.int8) + dst_x
-        start_y = (-1 * h / 2).astype(np.int8) + dst_y
-        end_x = start_x + w.astype(np.int8)
-        end_y = start_y + h.astype(np.int8)
-        for row in range(len(vectors)):
-            if (start_x[row] >= 0 and end_x[row] < width and start_y[row] >= 0
-                    and end_y[row] < height):
-                mv[start_y[row]:end_y[row], start_x[row]:end_x[row],
-                   0] = val_x[row]
-                mv[start_y[row]:end_y[row], start_x[row]:end_x[row],
-                   1] = val_y[row]
-        return mv
-
     def __call__(self, results):
         """Perform the PyAV decoding.
 
@@ -810,115 +788,63 @@ def __call__(self, results):
         # set max indice to make early stop
         max_inds = max(results['frame_inds'])
         i = 0
-        stream = container.streams.video[0]
-        codec_context = stream.codec_context
-        codec_context.options = {'flags2': '+export_mvs'}
-        # import imageio
-        for packet in container.demux(stream):
-            for frame in packet.decode():
-                if i > max_inds + 1:
-                    break
-                i += 1
-                height = frame.height
-                width = frame.width
-                mv = np.zeros((height, width, 2), dtype=np.int8)
-                vectors = frame.side_data.get('MOTION_VECTORS')
-                if frame.key_frame:
-                    # Key frame don't have motion vectors
-                    assert vectors is None
-                else:
-                    assert len(vectors) > 0
-                    mv = self.parse_vectors(mv, vectors.to_ndarray(), height,
-                                            width)
-                # imageio.imwrite(f'tmp/gray/{i}.jpg',
-                # np.abs(mv[:, :, 0]) + np.abs(mv[:, :, 1]))
-                imgs.append(mv)
+        for frame in container.decode(video=0):
+            if i > max_inds + 1:
+                break
+            imgs.append(frame.to_rgb().to_ndarray())
+            i += 1
 
         results['video_reader'] = None
         del container
 
         # the available frame in pyav may be less than its length,
         # which may raise error
-        results['mvs'] = np.array(
-            [imgs[i % len(imgs)] for i in results['frame_inds']])
-        print(np.max(results['mvs']))
-        return results
-
-    def __repr__(self):
-        repr_str = self.__class__.__name__
-        repr_str += f'(multi_thread={self.multi_thread})'
-        return repr_str
-
-
-@PIPELINES.register_module()
-class FFmpegDecodeMotionVector(object):
-    """
-        Ref:
-            https://github.com/chaoyuaw/pytorch-coviar.
-    """
-
-    def __init__(self, residual=False, gop_size=16):
-        self.residual = residual
-        self.gop_size = gop_size
-
-    def _clip_and_scale(self, img, size):
-        return (img * (127.5 / size) + 128).astype(np.int32)
+        results['imgs'] = [imgs[i % len(imgs)] for i in results['frame_inds']]
 
-    def __call__(self, results):
-        try:
-            import coviar.load
-        except ImportError:
-            print('Please install coviar first.')
-        if results['frame_inds'].ndim != 1:
-            results['frame_inds'] = np.squeeze(results['frame_inds'])
-        frame_inds = results['frame_inds']
-        video_path = results['filename']
-        motion_vectors = list()
-        for clip_idx in range(len(frame_inds)):
-            clip_mvs = list()
-            for frame_idx in frame_inds[clip_idx]:
-                gop_idx = frame_idx // self.gop_size
-                gop_pos = frame_idx % self.gop_size
-                if gop_pos == 0:
-                    gop_idx = -1
-                    gop_pos = self.gop_size
-                mv = coviar.load(video_path, gop_idx, gop_pos,
-                                 2 if self.residual else 1, False)
-
-                if mv is None:
-                    mv = np.zeros((256, 256, 2))
-                else:
-                    # FIXME: 20 is the scale of mv, should be tested,
-                    # but it's ok as long as it is a linearly proportional
-                    mv = self._clip_and_scale(mv, 20)
-                    mv = (np.minimum(np.maximum(mv, 0), 255)).astype(np.uint8)
-                clip_mvs.append(mv)
-            motion_vectors.append(clip_mvs)
-
-        results['motion_vectors'] = np.array(motion_vectors)
+        results['original_shape'] = imgs[0].shape[:2]
+        results['img_shape'] = imgs[0].shape[:2]
 
         return results
 
     def __repr__(self):
         repr_str = self.__class__.__name__
-        repr_str += f'(residual={self.residual}'
-        repr_str += f'gop_size={self.gop_size})'
+        repr_str += f'(multi_thread={self.multi_thread})'
         return repr_str
 
 
-class PyAVDecode(object):
-    """Using pyav to decode the video.
+@PIPELINES.register_module()
+class PyAVDecodeMotionVector(PyAVDecode):
+    """Using pyav to decode the motion vectors from video.
 
     PyAV: https://github.com/mikeboers/PyAV
+
     Required keys are "video_reader" and "frame_inds",
     added or modified keys are "imgs", "img_shape" and "original_shape".
+
     Args:
         multi_thread (bool): If set to True, it will apply multi
             thread processing. Default: False.
     """
 
-    def __init__(self, multi_thread=False):
-        self.multi_thread = multi_thread
+    def _parse_vectors(self, mv, vectors, height, width):
+        """Parse the returned vector."""
+        (w, h, src_x, src_y, dst_x,
+         dst_y) = (vectors['w'], vectors['h'], vectors['src_x'],
+                   vectors['src_y'], vectors['dst_x'], vectors['dst_y'])
+        val_x = dst_x - src_x
+        val_y = dst_y - src_y
+        start_x = (-1 * w / 2).astype(np.int8) + dst_x
+        start_y = (-1 * h / 2).astype(np.int8) + dst_y
+        end_x = start_x + w.astype(np.int8)
+        end_y = start_y + h.astype(np.int8)
+        for row in range(len(vectors)):
+            if (start_x[row] >= 0 and end_x[row] < width and start_y[row] >= 0
+                    and end_y[row] < height):
+                mv[start_y[row]:end_y[row], start_x[row]:end_x[row],
+                   0] = val_x[row]
+                mv[start_y[row]:end_y[row], start_x[row]:end_x[row],
+                   1] = val_y[row]
+        return mv
 
     def __call__(self, results):
         """Perform the PyAV decoding.
@@ -938,29 +864,36 @@ def __call__(self, results):
         # set max indice to make early stop
         max_inds = max(results['frame_inds'])
         i = 0
-        for frame in container.decode(video=0):
-            if i > max_inds + 1:
-                break
-            imgs.append(frame.to_rgb().to_ndarray())
-            i += 1
+        stream = container.streams.video[0]
+        codec_context = stream.codec_context
+        codec_context.options = {'flags2': '+export_mvs'}
+        # import imageio
+        for packet in container.demux(stream):
+            for frame in packet.decode():
+                if i > max_inds + 1:
+                    break
+                i += 1
+                height = frame.height
+                width = frame.width
+                mv = np.zeros((height, width, 2), dtype=np.int8)
+                vectors = frame.side_data.get('MOTION_VECTORS')
+                if frame.key_frame:
+                    # Key frame don't have motion vectors
+                    assert vectors is None
+                if vectors is not None and len(vectors) > 0:
+                    mv = self._parse_vectors(mv, vectors.to_ndarray(), height,
+                                             width)
+                imgs.append(mv)
 
         results['video_reader'] = None
         del container
 
         # the available frame in pyav may be less than its length,
         # which may raise error
-        results['imgs'] = [imgs[i % len(imgs)] for i in results['frame_inds']]
-
-        results['original_shape'] = imgs[0].shape[:2]
-        results['img_shape'] = imgs[0].shape[:2]
-
+        results['imgs'] = np.array(
+            [imgs[i % len(imgs)] for i in results['frame_inds']])
         return results
 
-    def __repr__(self):
-        repr_str = self.__class__.__name__
-        repr_str += f'(multi_thread={self.multi_thread})'
-        return repr_str
-
 
 @PIPELINES.register_module()
 class DecordInit:
diff --git a/tests/test_data/test_loading.py b/tests/test_data/test_loading.py
index 8284bd15f1..d1979997c8 100644
--- a/tests/test_data/test_loading.py
+++ b/tests/test_data/test_loading.py
@@ -16,10 +16,11 @@
                                          LoadAudioFeature, LoadHVULabel,
                                          LoadLocalizationFeature,
                                          LoadProposals, OpenCVDecode,
-                                         OpenCVInit, PyAVDecode, PyAVInit,
+                                         OpenCVInit, PyAVDecode,
+                                         PyAVDecodeMotionVector, PyAVInit,
                                          RawFrameDecode, SampleAVAFrames,
                                          SampleFrames, SampleProposalFrames,
-                                         UntrimmedSampleFrames, PyAVDecodeMotionVector)
+                                         UntrimmedSampleFrames)
 
 # yapf: enable
 
@@ -1473,10 +1474,8 @@ def test_pyav_decode_motion_vector(self):
 
         # test pyav
         results = {
-            'filename':
-            osp.join(osp.dirname(osp.dirname(__file__)), 'data/test.mp4'),
-            'frame_inds':
-            np.arange(0, 32, 1)[:, np.newaxis]
+            'filename': self.video_path,
+            'frame_inds': np.arange(0, 32, 1)[:, None]
         }
         results = pyav_init(results)
         results = pyav(results)

From ee0a76a467ee73d866c350780d2ab2c11c541e0b Mon Sep 17 00:00:00 2001
From: xusu <xusu@sensetime.com>
Date: Wed, 18 Nov 2020 16:04:45 +0800
Subject: [PATCH 04/12] Minor: remove the motion vector config.

---
 ...r50_1x1x3_75e_kinetics400_motion_vector.py | 107 ------------------
 mmaction/datasets/pipelines/loading.py        |   4 +-
 tests/test_data/test_loading.py               |   2 +
 3 files changed, 4 insertions(+), 109 deletions(-)
 delete mode 100644 configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py

diff --git a/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py b/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py
deleted file mode 100644
index ab9ea9fa45..0000000000
--- a/configs/recognition_motion_vector/tsn_r50_1x1x3_75e_kinetics400_motion_vector.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# model settings
-model = dict(
-    type='Recognizer2D',
-    backbone=dict(
-        type='ResNet',
-        pretrained='torchvision://resnet50',
-        in_channels=2,
-        depth=18,
-        norm_eval=False),
-    cls_head=dict(
-        type='TSNHead',
-        num_classes=101,
-        in_channels=512,
-        spatial_type='avg',
-        consensus=dict(type='AvgConsensus', dim=1),
-        dropout_ratio=0.4,
-        init_std=0.001))
-# model training and testing settings
-train_cfg = None
-test_cfg = dict(average_clips=None)
-# dataset settings
-dataset_type = 'VideoDataset'
-data_root = 'data/kinetics400/videos_fast/'
-data_root_val = 'data/kinetics400/videos_fast/'
-ann_file_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
-ann_file_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
-ann_file_test = 'data/kinetics400/kinetics400_val_list_videos.txt'
-img_norm_cfg = dict(
-    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
-train_pipeline = [
-    dict(type='PyAVInit'),
-    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
-    dict(type='PyAVDecodeMotionVector', multi_thread=True),
-    # dict(type='PyAVDecode'),
-    # dict(type='Resize', scale=(-1, 16)),
-    dict(type='CenterCrop', crop_size=16),
-    dict(type='Flip', flip_ratio=0.5),
-    dict(type='FormatShape', input_format='NCHW'),
-    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
-    dict(type='ToTensor', keys=['imgs', 'label'])
-]
-val_pipeline = [
-    dict(type='PyAVInit'),
-    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
-    dict(type='PyAVDecodeMotionVector'),
-    # dict(type='PyAVDecode'),
-    # dict(type='Resize', scale=(-1, 16)),
-    dict(type='CenterCrop', crop_size=16),
-    dict(type='Flip', flip_ratio=0.5),
-    dict(type='FormatShape', input_format='NCHW'),
-    dict(type='Collect', keys=['imgs'], meta_keys=[]),
-    dict(type='ToTensor', keys=['imgs'])
-]
-test_pipeline = [
-    dict(type='PyAVInit'),
-    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
-    dict(type='PyAVDecodeMotionVector'),
-    # dict(type='PyAVDecode'),
-    # dict(type='Resize', scale=(-1, 16)),
-    dict(type='CenterCrop', crop_size=16),
-    dict(type='Flip', flip_ratio=0.5),
-    dict(type='FormatShape', input_format='NCHW'),
-    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
-    dict(type='ToTensor', keys=['imgs'])
-]
-data = dict(
-    videos_per_gpu=1,
-    workers_per_gpu=0,
-    train=dict(
-        type=dataset_type,
-        ann_file=ann_file_train,
-        data_prefix=data_root,
-        pipeline=train_pipeline),
-    val=dict(
-        type=dataset_type,
-        ann_file=ann_file_val,
-        data_prefix=data_root_val,
-        pipeline=val_pipeline),
-    test=dict(
-        type=dataset_type,
-        ann_file=ann_file_test,
-        data_prefix=data_root_val,
-        pipeline=test_pipeline))
-# optimizer
-optimizer = dict(
-    type='SGD', lr=0.00128, momentum=0.9,
-    weight_decay=0.0005)  # this lr is used for 8 gpus
-optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
-# learning policy
-lr_config = dict(policy='step', step=[])
-total_epochs = 75
-checkpoint_config = dict(interval=5)
-evaluation = dict(
-    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
-log_config = dict(
-    interval=20,
-    hooks=[
-        dict(type='TextLoggerHook'),
-        # dict(type='TensorboardLoggerHook'),
-    ])
-# runtime settings
-dist_params = dict(backend='nccl')
-log_level = 'INFO'
-work_dir = 'word_dirs/tsn_r50_1x1x3_75e_kinetics400_motion_vector'
-load_from = None
-resume_from = None
-workflow = [('train', 1)]
diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index 6e7912efca..8133db41e4 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -819,7 +819,7 @@ class PyAVDecodeMotionVector(PyAVDecode):
     PyAV: https://github.com/mikeboers/PyAV
 
     Required keys are "video_reader" and "frame_inds",
-    added or modified keys are "imgs", "img_shape" and "original_shape".
+    added or modified keys are "mvs".
 
     Args:
         multi_thread (bool): If set to True, it will apply multi
@@ -890,7 +890,7 @@ def __call__(self, results):
 
         # the available frame in pyav may be less than its length,
         # which may raise error
-        results['imgs'] = np.array(
+        results['mvs'] = np.array(
             [imgs[i % len(imgs)] for i in results['frame_inds']])
         return results
 
diff --git a/tests/test_data/test_loading.py b/tests/test_data/test_loading.py
index d1979997c8..c69c20235f 100644
--- a/tests/test_data/test_loading.py
+++ b/tests/test_data/test_loading.py
@@ -1479,3 +1479,5 @@ def test_pyav_decode_motion_vector(self):
         }
         results = pyav_init(results)
         results = pyav(results)
+        target_keys = ['mvs']
+        assert self.check_keys_contain(results.keys(), target_keys)

From f8ed428e13aac7e4430a667f6e679229626219b2 Mon Sep 17 00:00:00 2001
From: xusu <xusu@sensetime.com>
Date: Wed, 18 Nov 2020 16:07:56 +0800
Subject: [PATCH 05/12] Minor fix.

---
 mmaction/datasets/pipelines/__init__.py | 5 +++--
 mmaction/datasets/pipelines/loading.py  | 2 ++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/mmaction/datasets/pipelines/__init__.py b/mmaction/datasets/pipelines/__init__.py
index bab313b05e..ae3cd1a257 100644
--- a/mmaction/datasets/pipelines/__init__.py
+++ b/mmaction/datasets/pipelines/__init__.py
@@ -14,8 +14,9 @@
                       GenerateLocalizationLabels, ImageDecode,
                       LoadAudioFeature, LoadHVULabel, LoadLocalizationFeature,
                       LoadProposals, OpenCVDecode, OpenCVInit, PyAVDecode,
-                      PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames,
-                      SampleProposalFrames, UntrimmedSampleFrames, PyAVDecodeMotionVector)
+                      PyAVDecodeMotionVector, PyAVInit, RawFrameDecode,
+                      SampleAVAFrames, SampleFrames, SampleProposalFrames,
+                      UntrimmedSampleFrames)
 
 __all__ = [
     'SampleFrames', 'PyAVDecode', 'DecordDecode', 'DenseSampleFrames',
diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index 8133db41e4..ed43c7f5fc 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -760,8 +760,10 @@ class PyAVDecode:
     """Using pyav to decode the video.
 
     PyAV: https://github.com/mikeboers/PyAV
+
     Required keys are "video_reader" and "frame_inds",
     added or modified keys are "imgs", "img_shape" and "original_shape".
+
     Args:
         multi_thread (bool): If set to True, it will apply multi
             thread processing. Default: False.

From bb468321c80d3c47da98ebdffade5dbd99da8991 Mon Sep 17 00:00:00 2001
From: xusu <xusu@sensetime.com>
Date: Tue, 24 Nov 2020 17:05:04 +0800
Subject: [PATCH 06/12] Minor fix and change changelog.

---
 docs/changelog.md                      | 2 +-
 mmaction/datasets/pipelines/loading.py | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/docs/changelog.md b/docs/changelog.md
index 1db2de4b26..1ab50763e4 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -9,7 +9,7 @@
 - Support GYM99 data preparation ([#331](https://github.com/open-mmlab/mmaction2/pull/331))
 - Add GradCAM utils for recognizer ([#324](https://github.com/open-mmlab/mmaction2/pull/324))
 - Add print config script ([#345](https://github.com/open-mmlab/mmaction2/pull/345))
-
+- Add online motion vector decoder ([#291](https://github.com/open-mmlab/mmaction2/pull/291))
 **Improvements**
 - Support PyTorch 1.7 in CI ([#312](https://github.com/open-mmlab/mmaction2/pull/312))
 - Support to predict different labels in a long video ([#274](https://github.com/open-mmlab/mmaction2/pull/274))
diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index ed43c7f5fc..416ab66b31 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -842,10 +842,9 @@ def _parse_vectors(self, mv, vectors, height, width):
         for row in range(len(vectors)):
             if (start_x[row] >= 0 and end_x[row] < width and start_y[row] >= 0
                     and end_y[row] < height):
-                mv[start_y[row]:end_y[row], start_x[row]:end_x[row],
-                   0] = val_x[row]
-                mv[start_y[row]:end_y[row], start_x[row]:end_x[row],
-                   1] = val_y[row]
+                mv[start_y[row]:end_y[row],
+                   start_x[row]:end_x[row]] = (val_x[row], val_y[row])
+
         return mv
 
     def __call__(self, results):

From d8fb864f356100d03706b8f7e2ef166eec927d73 Mon Sep 17 00:00:00 2001
From: xusu <xusu@sensetime.com>
Date: Tue, 24 Nov 2020 17:21:34 +0800
Subject: [PATCH 07/12] Minor fix for pylint.

---
 mmaction/datasets/pipelines/loading.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index 416ab66b31..23b781f4d0 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -829,7 +829,7 @@ class PyAVDecodeMotionVector(PyAVDecode):
     """
 
     def _parse_vectors(self, mv, vectors, height, width):
-        """Parse the returned vector."""
+        """Parse the returned vectors."""
         (w, h, src_x, src_y, dst_x,
          dst_y) = (vectors['w'], vectors['h'], vectors['src_x'],
                    vectors['src_y'], vectors['dst_x'], vectors['dst_y'])
@@ -839,7 +839,7 @@ def _parse_vectors(self, mv, vectors, height, width):
         start_y = (-1 * h / 2).astype(np.int8) + dst_y
         end_x = start_x + w.astype(np.int8)
         end_y = start_y + h.astype(np.int8)
-        for row in range(len(vectors)):
+        for row, _ in enumerate(vectors):
             if (start_x[row] >= 0 and end_x[row] < width and start_y[row] >= 0
                     and end_y[row] < height):
                 mv[start_y[row]:end_y[row],
@@ -848,7 +848,7 @@ def _parse_vectors(self, mv, vectors, height, width):
         return mv
 
     def __call__(self, results):
-        """Perform the PyAV decoding.
+        """Perform the PyAV motion vector decoding.
 
         Args:
             results (dict): The resulting dict to be modified and passed

From 1ea8e76c9a43118dc625f11951b625a5f55811f6 Mon Sep 17 00:00:00 2001
From: xusu <xusu@sensetime.com>
Date: Sat, 28 Nov 2020 21:14:25 +0800
Subject: [PATCH 08/12] Rename the field mvs.

Minor fix changelog.
---
 docs/changelog.md                      | 1 +
 mmaction/datasets/pipelines/loading.py | 4 ++--
 tests/test_data/test_loading.py        | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/changelog.md b/docs/changelog.md
index 1ab50763e4..565c24e0a8 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -10,6 +10,7 @@
 - Add GradCAM utils for recognizer ([#324](https://github.com/open-mmlab/mmaction2/pull/324))
 - Add print config script ([#345](https://github.com/open-mmlab/mmaction2/pull/345))
 - Add online motion vector decoder ([#291](https://github.com/open-mmlab/mmaction2/pull/291))
+
 **Improvements**
 - Support PyTorch 1.7 in CI ([#312](https://github.com/open-mmlab/mmaction2/pull/312))
 - Support to predict different labels in a long video ([#274](https://github.com/open-mmlab/mmaction2/pull/274))
diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index 23b781f4d0..0deac1f60b 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -821,7 +821,7 @@ class PyAVDecodeMotionVector(PyAVDecode):
     PyAV: https://github.com/mikeboers/PyAV
 
     Required keys are "video_reader" and "frame_inds",
-    added or modified keys are "mvs".
+    added or modified keys are "motion_vectors".
 
     Args:
         multi_thread (bool): If set to True, it will apply multi
@@ -891,7 +891,7 @@ def __call__(self, results):
 
         # the available frame in pyav may be less than its length,
         # which may raise error
-        results['mvs'] = np.array(
+        results['motion_vectors'] = np.array(
             [imgs[i % len(imgs)] for i in results['frame_inds']])
         return results
 
diff --git a/tests/test_data/test_loading.py b/tests/test_data/test_loading.py
index c69c20235f..f6d98bcc23 100644
--- a/tests/test_data/test_loading.py
+++ b/tests/test_data/test_loading.py
@@ -1479,5 +1479,5 @@ def test_pyav_decode_motion_vector(self):
         }
         results = pyav_init(results)
         results = pyav(results)
-        target_keys = ['mvs']
+        target_keys = ['motion_vectors']
         assert self.check_keys_contain(results.keys(), target_keys)

From 1c4a7bf9066a2aa0c3d60bbd4c37e204cd9b3c55 Mon Sep 17 00:00:00 2001
From: xusu <xusu@sensetime.com>
Date: Sat, 28 Nov 2020 21:25:30 +0800
Subject: [PATCH 09/12] More unittest.

---
 mmaction/datasets/pipelines/loading.py |  1 -
 tests/test_data/test_loading.py        | 16 ++++++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index 0deac1f60b..639fd6d6e8 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -868,7 +868,6 @@ def __call__(self, results):
         stream = container.streams.video[0]
         codec_context = stream.codec_context
         codec_context.options = {'flags2': '+export_mvs'}
-        # import imageio
         for packet in container.demux(stream):
             for frame in packet.decode():
                 if i > max_inds + 1:
diff --git a/tests/test_data/test_loading.py b/tests/test_data/test_loading.py
index f6d98bcc23..2b58bb4c57 100644
--- a/tests/test_data/test_loading.py
+++ b/tests/test_data/test_loading.py
@@ -1472,12 +1472,24 @@ def test_pyav_decode_motion_vector(self):
         pyav_init = PyAVInit()
         pyav = PyAVDecodeMotionVector()
 
-        # test pyav
+        # test pyav with 2-dim input
         results = {
             'filename': self.video_path,
-            'frame_inds': np.arange(0, 32, 1)[:, None]
+            'frame_inds': np.arange(0, 32, 1)[:, np.newaxis]
         }
         results = pyav_init(results)
         results = pyav(results)
         target_keys = ['motion_vectors']
         assert self.check_keys_contain(results.keys(), target_keys)
+
+        # test pyav with 1-dim input
+        results = {
+            'filename': self.video_path,
+            'frame_inds': np.arange(0, 32, 1)
+        }
+        pyav_init = PyAVInit()
+        results = pyav_init(results)
+        pyav = PyAVDecodeMotionVector()
+        results = pyav(results)
+
+        assert self.check_keys_contain(results.keys(), target_keys)

From 00fd89c65d9bf025c395df7d62872341c39e2123 Mon Sep 17 00:00:00 2001
From: xusu <xusu@sensetime.com>
Date: Mon, 30 Nov 2020 20:24:23 +0800
Subject: [PATCH 10/12] Revised according to review.

Fix typo.
---
 mmaction/datasets/pipelines/loading.py | 28 +++++++++++++-------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index 639fd6d6e8..5681b81c77 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -818,10 +818,11 @@ def __repr__(self):
 class PyAVDecodeMotionVector(PyAVDecode):
     """Using pyav to decode the motion vectors from video.
 
-    PyAV: https://github.com/mikeboers/PyAV
+    Reference: https://github.com/PyAV-Org/PyAV/
+        blob/main/tests/test_decode.py
 
     Required keys are "video_reader" and "frame_inds",
-    added or modified keys are "motion_vectors".
+    added or modified keys are "motion_vectors" and "frame_inds".
 
     Args:
         multi_thread (bool): If set to True, it will apply multi
@@ -835,15 +836,14 @@ def _parse_vectors(self, mv, vectors, height, width):
                    vectors['src_y'], vectors['dst_x'], vectors['dst_y'])
         val_x = dst_x - src_x
         val_y = dst_y - src_y
-        start_x = (-1 * w / 2).astype(np.int8) + dst_x
-        start_y = (-1 * h / 2).astype(np.int8) + dst_y
-        end_x = start_x + w.astype(np.int8)
-        end_y = start_y + h.astype(np.int8)
-        for row, _ in enumerate(vectors):
-            if (start_x[row] >= 0 and end_x[row] < width and start_y[row] >= 0
-                    and end_y[row] < height):
-                mv[start_y[row]:end_y[row],
-                   start_x[row]:end_x[row]] = (val_x[row], val_y[row])
+        start_x = -1 * w // 2 + dst_x
+        start_y = -1 * h // 2 + dst_y
+        end_x = start_x + w
+        end_y = start_y + h
+        for sx, ex, sy, ey, vx, vy in zip(start_x, end_x, start_y, end_y,
+                                          val_x, val_y):
+            if (sx >= 0 and ex < width and sy >= 0 and ey < height):
+                mv[sy:ey, sx:ex] = (vx, vy)
 
         return mv
 
@@ -862,15 +862,15 @@ def __call__(self, results):
         if results['frame_inds'].ndim != 1:
             results['frame_inds'] = np.squeeze(results['frame_inds'])
 
-        # set max indice to make early stop
-        max_inds = max(results['frame_inds'])
+        # set max index to allow early stopping
+        max_idx = max(results['frame_inds'])
         i = 0
         stream = container.streams.video[0]
         codec_context = stream.codec_context
         codec_context.options = {'flags2': '+export_mvs'}
         for packet in container.demux(stream):
             for frame in packet.decode():
-                if i > max_inds + 1:
+                if i > max_idx + 1:
                     break
                 i += 1
                 height = frame.height

From 2eead76c8d46cad1370f12ccfad369c0c1a2e5f7 Mon Sep 17 00:00:00 2001
From: xusu <xusu@sensetime.com>
Date: Mon, 30 Nov 2020 21:03:34 +0800
Subject: [PATCH 11/12] Minor fix.

---
 mmaction/datasets/pipelines/loading.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index 5681b81c77..15e88e1bd6 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -836,8 +836,8 @@ def _parse_vectors(self, mv, vectors, height, width):
                    vectors['src_y'], vectors['dst_x'], vectors['dst_y'])
         val_x = dst_x - src_x
         val_y = dst_y - src_y
-        start_x = -1 * w // 2 + dst_x
-        start_y = -1 * h // 2 + dst_y
+        start_x = dst_x - w // 2
+        start_y = dst_y - h // 2
         end_x = start_x + w
         end_y = start_y + h
         for sx, ex, sy, ey, vx, vy in zip(start_x, end_x, start_y, end_y,

From f92ca9245b7074d47d744d996a4682bf4b3cda51 Mon Sep 17 00:00:00 2001
From: xusu <xusu@sensetime.com>
Date: Mon, 30 Nov 2020 21:20:59 +0800
Subject: [PATCH 12/12] Convert _parse_vectors to a staticmethod.

fix.
---
 mmaction/datasets/pipelines/loading.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
index 15e88e1bd6..631aec3d24 100644
--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -829,7 +829,8 @@ class PyAVDecodeMotionVector(PyAVDecode):
             thread processing. Default: False.
     """
 
-    def _parse_vectors(self, mv, vectors, height, width):
+    @staticmethod
+    def _parse_vectors(mv, vectors, height, width):
         """Parse the returned vectors."""
         (w, h, src_x, src_y, dst_x,
          dst_y) = (vectors['w'], vectors['h'], vectors['src_x'],