
Color glitch/shift when reading video #506

Closed

r1d1shka opened this issue Jul 22, 2023 · 8 comments

Comments

@r1d1shka

Hi!

I'm trying to implement a Python tool for reading/writing video, but I ran into a problem: when reading, there is a color shift at the edges of moving objects. For example:

[image: glitch]

Should be:

[image: orig]

Full input/output videos link:
https://drive.google.com/drive/folders/14Ja-pkASKReuD_OVh2ARR0FhGB0lTT8x?usp=sharing

Full test code:

import time
from typing import Optional, Tuple

import cv2 as cv
import ffmpeg
import numpy as np
import torch

import PyNvCodec as nvc
import cuda.cudart

def yuv_to_rgb(image: torch.Tensor) -> torch.Tensor:
    if not isinstance(image, torch.Tensor):
        raise TypeError(f"Input type is not a torch.Tensor. Got {type(image)}")

    if len(image.shape) < 3 or image.shape[-3] != 3:
        raise ValueError(f"Input size must have a shape of (*, 3, H, W). Got {image.shape}")

    if image.dtype != torch.uint8:
        raise ValueError(f"Expected to have uint8 Tensor but got {image.dtype}")

    image = image.float()
    y: torch.Tensor = image[..., 0, :, :] / 255.0
    u: torch.Tensor = image[..., 1, :, :] / 255.0 - 0.5
    v: torch.Tensor = image[..., 2, :, :] / 255.0 - 0.5

    r: torch.Tensor = y + 1.14 * v  # coefficient for u is 0
    g: torch.Tensor = y - 0.396 * u - 0.581 * v
    b: torch.Tensor = y + 2.029 * u  # coefficient for v is 0

    out: torch.Tensor = torch.stack([r, g, b], -3)

    return torch.clamp(out * 255, 0, 255).to(torch.uint8)


class VpfReader:
    def __init__(self,
                 source: str,
                 dst_w: int = 0,
                 dst_h: int = 0,
                 gpu_id: int = 0,
                 as_mask: bool = False):
        self.nvDec = nvc.PyNvDecoder(source, gpu_id, dst_w, dst_h)
        
        self.src_w, self.src_h = self.nvDec.Width(), self.nvDec.Height()
        self.dst_w = dst_w if dst_w != 0 else self.src_w
        self.dst_h = dst_h if dst_h != 0 else self.src_h

        # Converter from NV12, which is the NVDEC native pixel format.
        self.to_yuv = nvc.PySurfaceConverter(
            self.src_w, self.src_h, nvc.PixelFormat.NV12, nvc.PixelFormat.YUV420, gpu_id
        )
        if self.dst_w != self.src_w or self.dst_h != self.src_h:
            print("Using PySurfaceResizer")
            self.resize = nvc.PySurfaceResizer(
                self.dst_w, self.dst_h, nvc.PixelFormat.YUV420, gpu_id
            )
        self.to_bgr = nvc.PySurfaceConverter(
            self.dst_w, self.dst_h, nvc.PixelFormat.YUV420, nvc.PixelFormat.RGB, gpu_id
        )

        # Converter from RGB to planar RGB because that's the way
        # PyTorch stores data in its tensors.
        self.to_pln = nvc.PySurfaceConverter(
            self.dst_w, self.dst_h, nvc.PixelFormat.RGB, nvc.PixelFormat.RGB_PLANAR, gpu_id
        )

        self.cc_ctx = nvc.ColorspaceConversionContext(nvc.ColorSpace.BT_601, nvc.ColorRange.MPEG)

        self.as_mask = as_mask
        self.gpu_id = gpu_id

    @staticmethod
    def get_video_sizes(source: str, gpu_id: int = 0) -> Tuple[int, int, int]:
        nvDec = nvc.PyNvDecoder(source, gpu_id)
        return nvDec.Numframes(), nvDec.Width(), nvDec.Height()

    def get_video_fps(self) -> float:
        return self.nvDec.Framerate()

    def next_frame(self, signal_error: bool = True) -> Optional[torch.Tensor]:
        return self._next_frame(signal_error)

    def _next_frame(self, signal_error: bool = True) -> Optional[torch.Tensor]:        
        cuda.cudart.cudaSetDeviceFlags(cuda.cudart.cudaDeviceScheduleBlockingSync)
        cuda.cudart.cudaDeviceSynchronize()
        nv12_surface = self.nvDec.DecodeSingleSurface()
        if nv12_surface.Empty():
            if signal_error:
                raise RuntimeError("Cannot decode frame")
            else:
                return None
        
        yuv = self.to_yuv.Execute(nv12_surface, self.cc_ctx)
        if yuv.Empty():
            raise RuntimeError("Cannot convert nv12 to yuv")
        
        if self.dst_w != self.src_w or self.dst_h != self.src_h:
            yuv_small = self.resize.Execute(yuv)
            if yuv_small.Empty():
                raise RuntimeError("Cannot resize yuv")
        else:
            yuv_small = yuv
        
        rgb24_small = self.to_bgr.Execute(yuv_small, self.cc_ctx)
        if rgb24_small.Empty():
            raise RuntimeError("Cannot convert yuv to rgb")
        
        rgb24_planar = self.to_pln.Execute(rgb24_small, self.cc_ctx)
        if rgb24_planar.Empty():
            raise RuntimeError("Cannot convert rgb to plain")
        
        surf_plane = rgb24_planar.PlanePtr()
        img_tensor = self._create_tensor(
            surf_plane.GpuMem(),
            surf_plane.Width(),
            surf_plane.Height(),
            surf_plane.Pitch(),
            surf_plane.ElemSize(),
        )        
        img_tensor.resize_(3, self.dst_h, self.dst_w)
        if self.as_mask:
            # Keep a single channel; clone() drops the reference to the full 3-channel tensor
            img_tensor = img_tensor[0:1, :, :].clone()
        
        return img_tensor
        
    def _create_tensor(self, gpu_mem, width, height, pitch, elem_size):
        assert elem_size == 1, "Only torch::kUInt8 data type is supported"

        tensor = torch.empty((height, width), dtype=torch.uint8,
                             device=torch.device('cuda', self.gpu_id))

        tensor_mem = tensor.data_ptr()
        # 2D copy because the Surface plane is pitched: each row may be padded
        # to `pitch` bytes, while the tensor rows are tightly packed (`width` bytes).
        ret = cuda.cudart.cudaMemcpy2D(tensor_mem, width, gpu_mem, pitch, 
            width, height, cuda.cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice)
        assert isinstance(ret, tuple)
        if ret[0] != cuda.cudart.cudaError_t.cudaSuccess:
            raise RuntimeError(f"Cuda error: {ret}")

        return tensor
		
class Tester:
    assert torch.cuda.is_available()
    
    @staticmethod
    def _calc_processing_size(original_size):
        main_processing_size = 512
        
        scale_x, scale_y = original_size[0] / main_processing_size,\
            original_size[1] / main_processing_size
        scale_max = max(scale_x, scale_y)
        scale_max = max(scale_max, 1.0)
        proc_size = int(original_size[0] / scale_max), int(original_size[1] / scale_max)
        return proc_size

    def test_read(self):
        path = "zebra.mp4"
        frames = []
        frames_count, width, height = VpfReader.get_video_sizes(path)
        original_size = width, height
        processing_size = self._calc_processing_size(original_size)
        
        reader = VpfReader(path)
        
        while True:
            frame = reader.next_frame(signal_error=False)
            if frame is None:
                break

            frames.append(frame)
            #time.sleep(0.25)
        self._save(frames, "zebra_out.mp4")

    def _save(self, frames, path):
        process = ffmpeg.input('pipe:',
                               format='rawvideo',
                               pix_fmt='rgb24',
                               s='{}x{}'.format(frames[0].shape[2], frames[0].shape[1]))\
            .output(path, pix_fmt='yuv420p', vcodec='libx264', crf=1)\
            .overwrite_output()\
            .run_async(pipe_stdin=True, quiet=False)
            
        for frame in frames:
            frame = frame.detach().cpu().numpy().astype(np.uint8)
            frame = cv.merge([frame[0], frame[1], frame[2]])
            process.stdin.write(frame)
        process.stdin.close()
        if process.stdout:
            process.stdout.close()
        if process.stderr:
            process.stderr.close()
        process.wait()

Interestingly, if you add a pause between reading frames, the problem disappears (like this):

while True:
    frame = reader.next_frame(signal_error=False)
    if frame is None:
        break
    frames.append(frame)
    time.sleep(0.25)

I'm working on Ubuntu 20.04 with a Conda environment.
Some GPU information:

nvcc --version

    nvcc: NVIDIA (R) Cuda compiler driver
    Copyright (c) 2005-2023 NVIDIA Corporation
    Built on Tue_Jun_13_19:16:58_PDT_2023
    Cuda compilation tools, release 12.2, V12.2.91
    Build cuda_12.2.r12.2/compiler.32965470_0
nvidia-smi

Thu Jul 20 19:33:43 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce GTX 1080 Ti     Off | 00000000:01:00.0  On |                  N/A |
|  0%   44C    P2              54W / 280W |    574MiB / 11264MiB |      4%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A       908      G   /usr/lib/xorg/Xorg                           53MiB |
|    0   N/A  N/A      1483      G   /usr/lib/xorg/Xorg                          289MiB |
|    0   N/A  N/A      1617      G   /usr/bin/gnome-shell                         51MiB |
|    0   N/A  N/A      4168      G   ...,WinRetrieveSuggestionsOnlyOnDemand       44MiB |
|    0   N/A  N/A    127421      G   ...00628519,2099990679082712135,262144      119MiB |
|    0   N/A  N/A    239861      G   ...ures=SpareRendererForSitePerProcess        2MiB |
+---------------------------------------------------------------------------------------+
@RomanArzumanyan
Contributor

RomanArzumanyan commented Jul 23, 2023

Hi @r1d1shka

It looks like a race condition or a memory object lifetime issue.
May I ask what's the reason behind this code? I'm just curious.

cuda.cudart.cudaSetDeviceFlags(cuda.cudart.cudaDeviceScheduleBlockingSync)
cuda.cudart.cudaDeviceSynchronize()

Also, there's a Surface > Tensor conversion helper which you can use:

surf_plane = rgb24_planar.PlanePtr()
img_tensor = pnvc.makefromDevicePtrUint8(  # pnvc is the PytorchNvCodec module that ships with VPF
    surf_plane.GpuMem(),
    surf_plane.Width(),
    surf_plane.Height(),
    surf_plane.Pitch(),
    surf_plane.ElemSize(),
)

Besides that, you can convert your tensor back to an NV12 Surface and feed it to PyNvEncoder, instead of tossing frames between RAM and vRAM and doing CPU-side color conversion:

process = ffmpeg.input('pipe:',
            format='rawvideo',
            pix_fmt='rgb24',
            s='{}x{}'.format(frames[0].shape[2], frames[0].shape[1]))\
            .output(path, pix_fmt='yuv420p', vcodec='libx264', crf=1)\
            .overwrite_output()\
            .run_async(pipe_stdin=True, quiet=False)
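
For illustration only, a GPU-side encode path could look roughly like this. It's a sketch, not tested code: the converter chain mirrors SamplePyTorch.py in reverse, the encoder options dict follows the VPF encode samples, and w, h, gpu_id and rgb_planar_surfaces (the Surfaces your reader produces) are assumed names:

import numpy as np
import PyNvCodec as nvc

# RGB_PLANAR -> RGB -> YUV420 -> NV12: the decode chain in reverse.
to_rgb = nvc.PySurfaceConverter(w, h, nvc.PixelFormat.RGB_PLANAR, nvc.PixelFormat.RGB, gpu_id)
to_yuv = nvc.PySurfaceConverter(w, h, nvc.PixelFormat.RGB, nvc.PixelFormat.YUV420, gpu_id)
to_nv12 = nvc.PySurfaceConverter(w, h, nvc.PixelFormat.YUV420, nvc.PixelFormat.NV12, gpu_id)
cc_ctx = nvc.ColorspaceConversionContext(nvc.ColorSpace.BT_601, nvc.ColorRange.MPEG)

nv_enc = nvc.PyNvEncoder({"preset": "P4", "codec": "h264", "s": f"{w}x{h}"}, gpu_id)
packet = np.ndarray(shape=(0,), dtype=np.uint8)

with open("out.h264", "wb") as dst:  # raw Annex B stream; mux into mp4 afterwards
    for surf in rgb_planar_surfaces:
        yuv = to_yuv.Execute(to_rgb.Execute(surf, cc_ctx), cc_ctx)
        nv12 = to_nv12.Execute(yuv, cc_ctx)
        if nv_enc.EncodeSingleSurface(nv12, packet):  # False while the encoder is still buffering
            dst.write(bytearray(packet))
    while nv_enc.FlushSinglePacket(packet):  # drain frames buffered inside the encoder
        dst.write(bytearray(packet))

This way every pixel stays on the GPU until the bitstream comes out; only compressed packets cross to the CPU.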

P. S.
May I ask what's the use case?
You rarely see a zebra on a web camera )))

@r1d1shka
Author

Thanks for the reply, Roman.
About

cuda.cudart.cudaSetDeviceFlags(cuda.cudart.cudaDeviceScheduleBlockingSync)
cuda.cudart.cudaDeviceSynchronize()

-- this is just an attempt to solve the synchronization problem. With or without these lines, the result is broken.

About

surf_plane = rgb24_planar.PlanePtr()
img_tensor = pnvc.makefromDevicePtrUint8(
    surf_plane.GpuMem(),
    surf_plane.Width(),
    surf_plane.Height(),
    surf_plane.Pitch(),
    surf_plane.ElemSize(),
)

Thank you, this allows me to make the code more compact and clearer, but the result is still broken.

About

process = ffmpeg.input('pipe:',
            format='rawvideo',
            pix_fmt='rgb24',
            s='{}x{}'.format(frames[0].shape[2], frames[0].shape[1]))\
            .output(path, pix_fmt='yuv420p', vcodec='libx264', crf=1)\
            .overwrite_output()\
            .run_async(pipe_stdin=True, quiet=False)

Thank you again, but the writing is just for debug purposes.
I can use OpenCV's imwrite for every frame like this:

t = tensor.permute(1, 2, 0)
cpu_data = t.detach().cpu().numpy()
cv.imwrite(path, cpu_data)  # note: imwrite expects BGR order, so R and B come out swapped here

but it doesn't help to solve the synchronization problem...

About your question about the zebra -- I just took these pictures for fun :)

@RomanArzumanyan
Contributor

Hi @r1d1shka

Coming back to the original topic: based on your code snippet, it looks like your project does the same thing as https://github.com/NVIDIA/VideoProcessingFramework/blob/master/samples/SamplePyTorch.py
Did you try it?

@r1d1shka
Author

Yep, the sample works fine.
I finally found the mistake.
This code needs fixing:

rgb24_planar = self.to_pln.Execute(rgb24_small, self.cc_ctx)
if rgb24_planar.Empty():
    raise RuntimeError("Cannot convert rgb to planar")

like this (add the magic Clone()):

rgb24_planar = self.to_pln.Execute(rgb24_small, self.cc_ctx)
if rgb24_planar.Empty():
    raise RuntimeError("Cannot convert rgb to planar")
rgb24_planar = rgb24_planar.Clone()

I can't understand why this fix works, though.
Do you have any ideas? )

@RomanArzumanyan
Contributor

RomanArzumanyan commented Jul 24, 2023

Hi @r1d1shka

There's nothing magical to it ))

The color converter instance self.to_pln allocates memory for just one output frame to reduce the vRAM footprint.
If you want a deep copy, you need to clone it. Otherwise every call simply returns a reference to one and the same Surface, which actually belongs to the color converter, so by the time you read frame N its pixels have already been overwritten by frame N+1. Hence the shifts in the areas with movement.

To the best of my knowledge, there's no way to work around this, because pybind11 relies on shared_ptr or unique_ptr to the actual C++ class instances for memory management. That's the reason behind the Clone method, which gives you a deep copy managed by the Python interpreter, not by the underlying C++ libraries.

If you have any ideas on how to improve this memory management behavior, please LMK - I'd be happy to improve the codebase.
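
In code terms, a minimal illustration (rgb24_small_n and rgb24_small_n1 stand for the converted Surfaces of two consecutive frames):

# Both calls return a reference to the converter's single internal buffer:
surf_a = self.to_pln.Execute(rgb24_small_n, self.cc_ctx)    # frame N
surf_b = self.to_pln.Execute(rgb24_small_n1, self.cc_ctx)   # frame N+1
# surf_a and surf_b now wrap the same device memory, so frame N's pixels
# were silently overwritten by frame N+1.

# Deep copy owned by the Python side, safe to keep across iterations:
surf_a = self.to_pln.Execute(rgb24_small_n, self.cc_ctx).Clone()

That would also suggest why your time.sleep(0.25) made the symptom disappear: the pause presumably gave the pending device copy time to finish before the converter reused its buffer for the next frame.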

@r1d1shka
Author

Thank you very much, I think the issue can be closed.

@RomanArzumanyan
Contributor

Hi @r1d1shka
I can't close it because I don't have moderator access, but I assume you can do it as the originator.

@r1d1shka
Author

closed
