open-mmlab · innerlee · Nov 30, 2020 · Oct 28, 2020 · Oct 29, 2020 · Nov 18, 2020
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -9,6 +9,7 @@
 - Support GYM99 data preparation ([#331](https://github.com/open-mmlab/mmaction2/pull/331))
 - Add GradCAM utils for recognizer ([#324](https://github.com/open-mmlab/mmaction2/pull/324))
 - Add print config script ([#345](https://github.com/open-mmlab/mmaction2/pull/345))
+- Add online motion vector decoder ([#291](https://github.com/open-mmlab/mmaction2/pull/291))
 
 **Improvements**
 - Support PyTorch 1.7 in CI ([#312](https://github.com/open-mmlab/mmaction2/pull/312))

diff --git a/mmaction/datasets/pipelines/__init__.py b/mmaction/datasets/pipelines/__init__.py
@@ -14,8 +14,9 @@
                       GenerateLocalizationLabels, ImageDecode,
                       LoadAudioFeature, LoadHVULabel, LoadLocalizationFeature,
                       LoadProposals, OpenCVDecode, OpenCVInit, PyAVDecode,
-                      PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames,
-                      SampleProposalFrames, UntrimmedSampleFrames)
+                      PyAVDecodeMotionVector, PyAVInit, RawFrameDecode,
+                      SampleAVAFrames, SampleFrames, SampleProposalFrames,
+                      UntrimmedSampleFrames)
 
 __all__ = [
     'SampleFrames', 'PyAVDecode', 'DecordDecode', 'DenseSampleFrames',
@@ -31,5 +32,5 @@
     'FormatAudioShape', 'LoadAudioFeature', 'AudioFeatureSelector',
     'AudioDecodeInit', 'EntityBoxPad', 'EntityBoxFlip', 'EntityBoxCrop',
     'EntityBoxRescale', 'EntityBoxClip', 'RandomScale', 'ImageDecode',
-    'BuildPseudoClip', 'RandomRescale'
+    'BuildPseudoClip', 'RandomRescale', 'PyAVDecodeMotionVector'
 ]
diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py
@@ -814,6 +814,122 @@ def __repr__(self):
         return repr_str
 
 
+@PIPELINES.register_module()
+class PyAVDecodeMotionVector(PyAVDecode):
+    """Using pyav to decode the motion vectors from video.
+
+    Reference: https://github.com/PyAV-Org/PyAV/
+        blob/main/tests/test_decode.py
+
+    Required keys are "video_reader" and "frame_inds",
+    added or modified keys are "motion_vectors", "frame_inds" and
+    "video_reader" (reset to None once decoding is done).
+
+    Args:
+        multi_thread (bool): If set to True, it will apply multi
+            thread processing. Default: False.
+    """
+
+    def _parse_vectors(self, mv, vectors, height, width):
+        """Write the motion vectors of one frame into a dense map.
+
+        Args:
+            mv (np.ndarray): Map indexed as ``[y, x]`` whose last axis
+                stores ``(dx, dy)``. It is modified in place.
+            vectors (np.ndarray): Structured array with the fields
+                "w", "h", "src_x", "src_y", "dst_x" and "dst_y", one
+                entry per block.
+            height (int): Height of the frame.
+            width (int): Width of the frame.
+
+        Returns:
+            np.ndarray: ``mv``, with every block that lies completely
+                inside the frame filled with its displacement.
+        """
+        # Widen every field to a signed 64-bit integer first, so that
+        # negating or subtracting narrow (possibly unsigned) fields can
+        # neither wrap around nor be rejected by numpy.
+        w, h, src_x, src_y, dst_x, dst_y = (
+            vectors[key].astype(np.int64)
+            for key in ('w', 'h', 'src_x', 'src_y', 'dst_x', 'dst_y'))
+        val_x = dst_x - src_x
+        val_y = dst_y - src_y
+        # a block spans w x h pixels around its destination point
+        start_x = (-w) // 2 + dst_x
+        start_y = (-h) // 2 + dst_y
+        end_x = start_x + w
+        end_y = start_y + h
+        for sx, ex, sy, ey, vx, vy in zip(start_x, end_x, start_y, end_y,
+                                          val_x, val_y):
+            # ``ex`` / ``ey`` are exclusive slice ends, so a block that
+            # ends exactly on the right / bottom border is still inside.
+            if sx >= 0 and ex <= width and sy >= 0 and ey <= height:
+                mv[sy:ey, sx:ex] = (vx, vy)
+
+        return mv
+
+    def __call__(self, results):
+        """Perform the PyAV motion vector decoding.
+
+        Args:
+            results (dict): The resulting dict to be modified and passed
+                to the next transform in pipeline.
+
+        Returns:
+            dict: ``results`` with "motion_vectors" added, "frame_inds"
+                squeezed and "video_reader" set to None.
+
+        Raises:
+            RuntimeError: If no frame can be decoded from the video.
+        """
+        from itertools import islice
+
+        container = results['video_reader']
+        imgs = list()
+
+        if self.multi_thread:
+            container.streams.video[0].thread_type = 'AUTO'
+        if results['frame_inds'].ndim != 1:
+            results['frame_inds'] = np.squeeze(results['frame_inds'])
+
+        # set max index to make early stop
+        max_idx = max(results['frame_inds'])
+        stream = container.streams.video[0]
+        codec_context = stream.codec_context
+        codec_context.options = {'flags2': '+export_mvs'}
+        # Frames are pulled lazily, so demuxing really stops once the
+        # (max_idx + 2)-th frame is reached instead of only leaving the
+        # inner loop and decoding the rest of the video for nothing.
+        frames = (frame for packet in container.demux(stream)
+                  for frame in packet.decode())
+        for frame in islice(frames, int(max_idx) + 2):
+            height = frame.height
+            width = frame.width
+            # NOTE(review): int8 wraps for displacements outside
+            # [-128, 127] -- confirm that this range is sufficient.
+            mv = np.zeros((height, width, 2), dtype=np.int8)
+            vectors = frame.side_data.get('MOTION_VECTORS')
+            if frame.key_frame:
+                # Key frames don't have motion vectors
+                assert vectors is None
+            if vectors is not None and len(vectors) > 0:
+                mv = self._parse_vectors(mv, vectors.to_ndarray(), height,
+                                         width)
+            imgs.append(mv)
+
+        results['video_reader'] = None
+        del container
+
+        if not imgs:
+            raise RuntimeError('No frame could be decoded from the video')
+
+        # the available frame in pyav may be less than its length,
+        # which may raise error
+        results['motion_vectors'] = np.array(
+            [imgs[i % len(imgs)] for i in results['frame_inds']])
+        return results
+
+
 @PIPELINES.register_module()
 class DecordInit:
     """Using decord to initialize the video_reader.

diff --git a/tests/test_data/test_loading.py b/tests/test_data/test_loading.py
@@ -16,7 +16,8 @@
                                          LoadAudioFeature, LoadHVULabel,
                                          LoadLocalizationFeature,
                                          LoadProposals, OpenCVDecode,
-                                         OpenCVInit, PyAVDecode, PyAVInit,
+                                         OpenCVInit, PyAVDecode,
+                                         PyAVDecodeMotionVector, PyAVInit,
                                          RawFrameDecode, SampleAVAFrames,
                                          SampleFrames, SampleProposalFrames,
                                          UntrimmedSampleFrames)
@@ -1466,3 +1467,21 @@ def test_audio_feature_selector(self):
         assert repr(audio_feature_selector) == (
             f'{audio_feature_selector.__class__.__name__}('
             f'fix_length={128})')
+
+    def test_pyav_decode_motion_vector(self):
+        """Check PyAVDecodeMotionVector on 2-dim and 1-dim ``frame_inds``."""
+        target_keys = ['motion_vectors']
+        frame_inds = np.arange(0, 32, 1)
+
+        # 2-dim input is squeezed by the transform, 1-dim input is used as is
+        for inds in (frame_inds[:, np.newaxis], frame_inds):
+            results = {'filename': self.video_path, 'frame_inds': inds}
+            results = PyAVInit()(results)
+            results = PyAVDecodeMotionVector()(results)
+
+            assert self.check_keys_contain(results.keys(), target_keys)
+            # one (H, W, 2) displacement map per requested frame index
+            motion_vectors = results['motion_vectors']
+            assert motion_vectors.ndim == 4
+            assert motion_vectors.shape[0] == len(frame_inds)
+            assert motion_vectors.shape[-1] == 2