From 44dbb84267cffad437bb141fb11c07b319ee6298 Mon Sep 17 00:00:00 2001
From: aangerma <avishag.angerman@intel.com>
Date: Sun, 25 Apr 2021 12:53:23 +0300
Subject: [PATCH] Added support for I411 on SSE.

---
 src/proc/color-formats-converter.cpp      | 244 +++++++++++++++++++++-
 src/proc/color-formats-converter.h        |   3 +
 src/sensor.cpp                            |   6 +
 unit-tests/algo/convertions/test-y411.cpp | 142 +++++++++++++
 4 files changed, 386 insertions(+), 9 deletions(-)
 create mode 100644 unit-tests/algo/convertions/test-y411.cpp
diff --git a/src/proc/color-formats-converter.cpp b/src/proc/color-formats-converter.cpp
index 771f720be73..9817081cf86 100644
--- a/src/proc/color-formats-converter.cpp
+++ b/src/proc/color-formats-converter.cpp
@@ -59,6 +59,7 @@ namespace librealsense
             rgb[2] = clamp( ( 298 * c + 516 * d + 128 ) >> 8 );
         #undef clamp
     }
+
     // The bytes alignment on y411:
     // Y is luminance and U,V are chrome
     // each U,V are duplicated for 4 pixels
@@ -73,13 +74,230 @@ namespace librealsense
     // [L1-Y3 U1 V1] [L1-Y2 U1 V1] [L1-Y1 U0 V0] [L1-Y0 U0 V0]
     // [L1-Y7 U3 V3] [L1-Y6 U3 V3] [L0-Y5 U2 V2] [L0-Y4 U2 V2]
     // [L1-Y7 U3 V3] [L1-Y6 U3 V3] [L1-Y5 U2 V2] [L1-Y4 U2 V2]
-    void unpack_y411( byte * const dest[], const byte * s, int w, int h, int actual_size )
+    //
+    //https://www.fourcc.org/pixel-format/yuv-y411/
+
+    void inline unpack_y411_sse(byte * const dest, const byte * s, int w, int h, int actual_size)
+    {
+        auto n = w * h;
+        // working each iteration on 8 y411 pixels, and extract 4 rgb pixels from each one
+        // so we get 32 rgb pixels
+        assert(n % 32 == 0); // All currently supported color resolutions are multiples of 32 pixels. Could easily extend support to other resolutions by copying final n<32 pixels into a zero-padded buffer and recursively calling self for final iteration.
+
+        auto src = reinterpret_cast<const __m128i *>(s);
+        auto dst = reinterpret_cast<__m128i *>(dest);
+
+        const __m128i zero = _mm_set1_epi8(0);
+        const __m128i n100 = _mm_set1_epi16(100 << 4);
+        const __m128i n208 = _mm_set1_epi16(208 << 4);
+        const __m128i n298 = _mm_set1_epi16(298 << 4);
+        const __m128i n409 = _mm_set1_epi16(409 << 4);
+        const __m128i n516 = _mm_set1_epi16(516 << 4);
+
+        // shuffle to y,u,v of pixels 1-2
+        const __m128i shuffle_y_1_2_0 = _mm_setr_epi8(1, 2, 4, 5, 7, 8, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0);    // to get   yyyyyyyy00000000
+        const __m128i shuffle_u_1_2_0 = _mm_setr_epi8(0, 0, 0, 0, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0);      // to get   uuuuuuuu00000000
+        const __m128i shuffle_v_1_2_0 = _mm_setr_epi8(3, 3, 3, 3, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0);      // to get   vvvvvvvv00000000
+
+        // shuffle to y,u,v of pixels 3-4 - combination of registers 0 and 1
+        const __m128i shuffle_y_3_4_0 = _mm_setr_epi8(13, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);        // to get   yy00000000000000
+        const __m128i mask_y_3_4_0 = _mm_setr_epi8(-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);           // to zero the other bytes
+        const __m128i shuffle_u_3_4_0 = _mm_setr_epi8(12, 12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);      // to get   uuuu000000000000
+        const __m128i shuffle_v_3_4_0 = _mm_setr_epi8(15, 15, 15, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);      // to get   vvvv000000000000
+        const __m128i shuffle_y_3_4_1 = _mm_setr_epi8(0, 0, 0, 1, 3, 4, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);          // to get   00yyyyyy00000000
+        const __m128i mask_y_3_4_1 = _mm_setr_epi8(0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0);
+        const __m128i shuffle_u_3_4_1 = _mm_setr_epi8(0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0);        // to get   0000uuuu00000000
+        const __m128i shuffle_v_3_4_1 = _mm_setr_epi8(0, 0, 0, 0, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0);        // to get   0000vvvv00000000
+
+        // shuffle to y,u,v of pixels 5-6- combination of registers 1 and 2
+        const __m128i shuffle_y_5_6_1 = _mm_setr_epi8(9, 10, 12, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);      // to get   yyyyy00000000000
+        const __m128i mask_y_5_6_1 = _mm_setr_epi8(-1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+        const __m128i shuffle_u_5_6_1 = _mm_setr_epi8(8, 8, 8, 8, 14, 14, 14, 14, 0, 0, 0, 0, 0, 0, 0, 0);      // to get   uuuuuuuu00000000
+        const __m128i shuffle_v_5_6_1 = _mm_setr_epi8(11, 11, 11, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);      // to get   vvvv000000000000
+        const __m128i shuffle_y_5_6_2 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0);          // to get   00000yyy00000000
+        const __m128i mask_y_5_6_2 = _mm_setr_epi8(0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0);
+        const __m128i shuffle_v_5_6_2 = _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);          // to get   0000vvvv00000000
+
+        // shuffle to y,u,v of pixels 7-8
+        const __m128i shuffle_y_7_8_2 = _mm_setr_epi8(5, 6, 8, 9, 11, 12, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0);        // to get   yyyyyyyy00000000
+        const __m128i shuffle_u_7_8_2 = _mm_setr_epi8(4, 4, 4, 4, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0);        // to get   uuuuuuuu00000000
+        const __m128i shuffle_v_7_8_2 = _mm_setr_epi8(7, 7, 7, 7, 13, 13, 13, 13, 0, 0, 0, 0, 0, 0, 0, 0);        // to get   vvvvvvvv00000000
+
+        const __m128i mask_uv_0 = _mm_setr_epi8(-1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+        const __m128i mask_uv_1 = _mm_setr_epi8(0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0);
+
+//#pragma omp parallel for
+        for (int i = 0; i < n / 32; i++)
+        {
+            // Load 8 y411 pixels into 3 16-byte registers
+            __m128i s0 = _mm_loadu_si128(&src[i * 3]);
+            __m128i s1 = _mm_loadu_si128(&src[i * 3 + 1]);
+            __m128i s2 = _mm_loadu_si128(&src[i * 3 + 2]);
+
+            // pixels 1-2
+            __m128i pixel_y_1_2 = _mm_shuffle_epi8(s0, shuffle_y_1_2_0);
+            __m128i pixel_u_1_2 = _mm_shuffle_epi8(s0, shuffle_u_1_2_0);
+            __m128i pixel_v_1_2 = _mm_shuffle_epi8(s0, shuffle_v_1_2_0);
+
+            // pixels 3-4
+            __m128i pixel_y_3_4_register_0 = _mm_shuffle_epi8(s0, shuffle_y_3_4_0);
+            __m128i pixel_y_3_4_register_1 = _mm_shuffle_epi8(s1, shuffle_y_3_4_1);
+            pixel_y_3_4_register_0 = _mm_and_si128(pixel_y_3_4_register_0, mask_y_3_4_0);
+            pixel_y_3_4_register_1 = _mm_and_si128(pixel_y_3_4_register_1, mask_y_3_4_1);
+            __m128i pixel_y_3_4 = _mm_or_si128(pixel_y_3_4_register_0, pixel_y_3_4_register_1);
+
+            __m128i pixel_u_3_4_register_0 = _mm_shuffle_epi8(s0, shuffle_u_3_4_0);
+            __m128i pixel_u_3_4_register_1 = _mm_shuffle_epi8(s1, shuffle_u_3_4_1);
+            pixel_u_3_4_register_0 = _mm_and_si128(pixel_u_3_4_register_0, mask_uv_0);
+            pixel_u_3_4_register_1 = _mm_and_si128(pixel_u_3_4_register_1, mask_uv_1);
+            __m128i pixel_u_3_4 = _mm_or_si128(pixel_u_3_4_register_0, pixel_u_3_4_register_1);
+
+            __m128i pixel_v_3_4_register_0 = _mm_shuffle_epi8(s0, shuffle_v_3_4_0);
+            __m128i pixel_v_3_4_register_1 = _mm_shuffle_epi8(s1, shuffle_v_3_4_1);
+            pixel_v_3_4_register_0 = _mm_and_si128(pixel_v_3_4_register_0, mask_uv_0);
+            pixel_v_3_4_register_1 = _mm_and_si128(pixel_v_3_4_register_1, mask_uv_1);
+            __m128i pixel_v_3_4 = _mm_or_si128(pixel_v_3_4_register_0, pixel_v_3_4_register_1);
+
+            // pixels 5-6
+            __m128i pixel_y_5_6_register_1 = _mm_shuffle_epi8(s1, shuffle_y_5_6_1);
+            __m128i pixel_y_5_6_register_2 = _mm_shuffle_epi8(s2, shuffle_y_5_6_2);
+            pixel_y_5_6_register_1 = _mm_and_si128(pixel_y_5_6_register_1, mask_y_5_6_1);
+            pixel_y_5_6_register_2 = _mm_and_si128(pixel_y_5_6_register_2, mask_y_5_6_2);
+            __m128i pixel_y_5_6 = _mm_or_si128(pixel_y_5_6_register_1, pixel_y_5_6_register_2);
+
+            __m128i pixel_u_5_6_register_1 = _mm_shuffle_epi8(s1, shuffle_u_5_6_1);
+            __m128i mask_uv = _mm_or_si128(mask_uv_0, mask_uv_1);
+            __m128i pixel_u_5_6 = _mm_and_si128(pixel_u_5_6_register_1, mask_uv);
+
+            __m128i pixel_v_5_6_register_1 = _mm_shuffle_epi8(s1, shuffle_v_5_6_1);
+            __m128i pixel_v_5_6_register_2 = _mm_shuffle_epi8(s2, shuffle_v_5_6_2);
+            pixel_v_5_6_register_1 = _mm_and_si128(pixel_v_5_6_register_1, mask_uv_0);
+            pixel_v_5_6_register_2 = _mm_and_si128(pixel_v_5_6_register_2, mask_uv_1);
+            __m128i pixel_v_5_6 = _mm_or_si128(pixel_v_5_6_register_1, pixel_v_5_6_register_2);
+
+            // pixels 7-8
+            __m128i pixel_y_7_8 = _mm_shuffle_epi8(s2, shuffle_y_7_8_2);
+            __m128i pixel_u_7_8 = _mm_shuffle_epi8(s2, shuffle_u_7_8_2);
+            __m128i pixel_v_7_8 = _mm_shuffle_epi8(s2, shuffle_v_7_8_2);
+
+            // Retrieve all 32 Y components as 16-bit values (8 components per register))
+            // Retrieve all 8 u components as 16-bit values (2 components per register))
+            // Retrieve all 8 v components as 16-bit values (2 components per register))
+            __m128i y16_pix_1_2 = _mm_unpacklo_epi8(pixel_y_1_2, zero);         // convert to 16 bit
+            __m128i u16_pix_1_2 = _mm_unpacklo_epi8(pixel_u_1_2, zero);         // convert to 16 bit
+            __m128i v16_pix_1_2 = _mm_unpacklo_epi8(pixel_v_1_2, zero);
+
+            __m128i y16_pix_3_4 = _mm_unpacklo_epi8(pixel_y_3_4, zero);         // convert to 16 bit
+            __m128i u16_pix_3_4 = _mm_unpacklo_epi8(pixel_u_3_4, zero);                         // convert to 16 bit
+            __m128i v16_pix_3_4 = _mm_unpacklo_epi8(pixel_v_3_4, zero);
+
+            __m128i y16_pix_5_6 = _mm_unpacklo_epi8(pixel_y_5_6, zero);         // convert to 16 bit
+            __m128i u16_pix_5_6 = _mm_unpacklo_epi8(pixel_u_5_6, zero);                         // convert to 16 bit
+            __m128i v16_pix_5_6 = _mm_unpacklo_epi8(pixel_v_5_6, zero);
+
+            __m128i y16_pix_7_8 = _mm_unpacklo_epi8(pixel_y_7_8, zero);         // convert to 16 bit
+            __m128i u16_pix_7_8 = _mm_unpacklo_epi8(pixel_u_7_8, zero);                         // convert to 16 bit
+            __m128i v16_pix_7_8 = _mm_unpacklo_epi8(pixel_v_7_8, zero);
+
+            // r,g,b
+            __m128i c16_pix_1_2 = _mm_slli_epi16(_mm_subs_epi16(y16_pix_1_2, _mm_set1_epi16(16)), 4);
+            __m128i d16_pix_1_2 = _mm_slli_epi16(_mm_subs_epi16(u16_pix_1_2, _mm_set1_epi16(128)), 4); // perhaps could have done these u,v to d,e before the duplication
+            __m128i e16_pix_1_2 = _mm_slli_epi16(_mm_subs_epi16(v16_pix_1_2, _mm_set1_epi16(128)), 4);
+            __m128i r16_pix_1_2 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_1_2, n298), _mm_mulhi_epi16(e16_pix_1_2, n409))))));                                                 // (298 * c + 409 * e + 128) ; //
+            __m128i g16_pix_1_2 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16_pix_1_2, n298), _mm_mulhi_epi16(d16_pix_1_2, n100)), _mm_mulhi_epi16(e16_pix_1_2, n208)))))); // (298 * c - 100 * d - 208 * e + 128)
+            __m128i b16_pix_1_2 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_1_2, n298), _mm_mulhi_epi16(d16_pix_1_2, n516))))));                                                 // clampbyte((298 * c + 516 * d + 128) >> 8);
+
+            __m128i c16_pix_3_4 = _mm_slli_epi16(_mm_subs_epi16(y16_pix_3_4, _mm_set1_epi16(16)), 4);
+            __m128i d16_pix_3_4 = _mm_slli_epi16(_mm_subs_epi16(u16_pix_3_4, _mm_set1_epi16(128)), 4); // perhaps could have done these u,v to d,e before the duplication
+            __m128i e16_pix_3_4 = _mm_slli_epi16(_mm_subs_epi16(v16_pix_3_4, _mm_set1_epi16(128)), 4);
+            __m128i r16_pix_3_4 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_3_4, n298), _mm_mulhi_epi16(e16_pix_3_4, n409))))));                                                 // (298 * c + 409 * e + 128) ; //
+            __m128i g16_pix_3_4 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16_pix_3_4, n298), _mm_mulhi_epi16(d16_pix_3_4, n100)), _mm_mulhi_epi16(e16_pix_3_4, n208)))))); // (298 * c - 100 * d - 208 * e + 128)
+            __m128i b16_pix_3_4 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_3_4, n298), _mm_mulhi_epi16(d16_pix_3_4, n516))))));                                                 // clampbyte((298 * c + 516 * d + 128) >> 8);
+
+            __m128i c16_pix_5_6 = _mm_slli_epi16(_mm_subs_epi16(y16_pix_5_6, _mm_set1_epi16(16)), 4);
+            __m128i d16_pix_5_6 = _mm_slli_epi16(_mm_subs_epi16(u16_pix_5_6, _mm_set1_epi16(128)), 4); // perhaps could have done these u,v to d,e before the duplication
+            __m128i e16_pix_5_6 = _mm_slli_epi16(_mm_subs_epi16(v16_pix_5_6, _mm_set1_epi16(128)), 4);
+            __m128i r16_pix_5_6 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_5_6, n298), _mm_mulhi_epi16(e16_pix_5_6, n409))))));                                                 // (298 * c + 409 * e + 128) ; //
+            __m128i g16_pix_5_6 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16_pix_5_6, n298), _mm_mulhi_epi16(d16_pix_5_6, n100)), _mm_mulhi_epi16(e16_pix_5_6, n208)))))); // (298 * c - 100 * d - 208 * e + 128)
+            __m128i b16_pix_5_6 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_5_6, n298), _mm_mulhi_epi16(d16_pix_5_6, n516))))));                                                 // clampbyte((298 * c + 516 * d + 128) >> 8);
+
+            __m128i c16_pix_7_8 = _mm_slli_epi16(_mm_subs_epi16(y16_pix_7_8, _mm_set1_epi16(16)), 4);
+            __m128i d16_pix_7_8 = _mm_slli_epi16(_mm_subs_epi16(u16_pix_7_8, _mm_set1_epi16(128)), 4); // perhaps could have done these u,v to d,e before the duplication
+            __m128i e16_pix_7_8 = _mm_slli_epi16(_mm_subs_epi16(v16_pix_7_8, _mm_set1_epi16(128)), 4);
+            __m128i r16_pix_7_8 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_7_8, n298), _mm_mulhi_epi16(e16_pix_7_8, n409))))));                                                 // (298 * c + 409 * e + 128) ; //
+            __m128i g16_pix_7_8 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16_pix_7_8, n298), _mm_mulhi_epi16(d16_pix_7_8, n100)), _mm_mulhi_epi16(e16_pix_7_8, n208)))))); // (298 * c - 100 * d - 208 * e + 128)
+            __m128i b16_pix_7_8 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_7_8, n298), _mm_mulhi_epi16(d16_pix_7_8, n516))))));                                                 // clampbyte((298 * c + 516 * d + 128) >> 8);
+
+            // Shuffle separate R, G, B values into four registers storing four pixels each in (R, G, B, A) order
+            const __m128i evens_odds = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
+
+            __m128i rg8_pix_1_2 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16_pix_1_2, evens_odds), _mm_shuffle_epi8(g16_pix_1_2, evens_odds)); // hi to take the odds which are the upper bytes we care about
+            __m128i ba8_pix_1_2 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16_pix_1_2, evens_odds), _mm_set1_epi8(-1));
+            __m128i rg8_pix_3_4 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16_pix_3_4, evens_odds), _mm_shuffle_epi8(g16_pix_3_4, evens_odds)); // hi to take the odds which are the upper bytes we care about
+            __m128i ba8_pix_3_4 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16_pix_3_4, evens_odds), _mm_set1_epi8(-1));
+            __m128i rg8_pix_5_6 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16_pix_5_6, evens_odds), _mm_shuffle_epi8(g16_pix_5_6, evens_odds)); // hi to take the odds which are the upper bytes we care about
+            __m128i ba8_pix_5_6 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16_pix_5_6, evens_odds), _mm_set1_epi8(-1));
+            __m128i rg8_pix_7_8 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16_pix_7_8, evens_odds), _mm_shuffle_epi8(g16_pix_7_8, evens_odds)); // hi to take the odds which are the upper bytes we care about
+            __m128i ba8_pix_7_8 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16_pix_7_8, evens_odds), _mm_set1_epi8(-1));
+
+            __m128i rgba_0_3 = _mm_unpacklo_epi16(rg8_pix_1_2, ba8_pix_1_2);
+            __m128i rgba_4_7 = _mm_unpackhi_epi16(rg8_pix_1_2, ba8_pix_1_2);
+            __m128i rgba_8_11 = _mm_unpacklo_epi16(rg8_pix_3_4, ba8_pix_3_4);
+            __m128i rgba_12_15 = _mm_unpackhi_epi16(rg8_pix_3_4, ba8_pix_3_4);
+            __m128i rgba_16_19 = _mm_unpacklo_epi16(rg8_pix_5_6, ba8_pix_5_6);
+            __m128i rgba_20_23 = _mm_unpackhi_epi16(rg8_pix_5_6, ba8_pix_5_6);
+            __m128i rgba_24_27 = _mm_unpacklo_epi16(rg8_pix_7_8, ba8_pix_7_8);
+            __m128i rgba_28_32 = _mm_unpackhi_epi16(rg8_pix_7_8, ba8_pix_7_8);
+
+            // Shuffle rgb triples to the start and end of each register
+            __m128i rgba_0_7_l0 = _mm_unpacklo_epi64(rgba_0_3, rgba_4_7);
+            __m128i rgba_0_7_l1 = _mm_unpackhi_epi64(rgba_0_3, rgba_4_7);
+            __m128i rgba_8_15_l0 = _mm_unpacklo_epi64(rgba_8_11, rgba_12_15);
+            __m128i rgba_8_15_l1 = _mm_unpackhi_epi64(rgba_8_11, rgba_12_15);
+            __m128i rgba_16_23_l0 = _mm_unpacklo_epi64(rgba_16_19, rgba_20_23);
+            __m128i rgba_16_23_l1 = _mm_unpackhi_epi64(rgba_16_19, rgba_20_23);
+            __m128i rgba_24_32_l0 = _mm_unpacklo_epi64(rgba_24_27, rgba_28_32);
+            __m128i rgba_24_32_l1 = _mm_unpackhi_epi64(rgba_24_27, rgba_28_32);
+
+            // Shuffle rgb triples to the start and end of each register
+            __m128i rgb0_l0 = _mm_shuffle_epi8(rgba_0_7_l0, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
+            __m128i rgb1_l0 = _mm_shuffle_epi8(rgba_8_15_l0, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
+            __m128i rgb2_l0 = _mm_shuffle_epi8(rgba_16_23_l0, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
+            __m128i rgb3_l0 = _mm_shuffle_epi8(rgba_24_32_l0, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));
+
+
+            // calculate the current line and column
+            auto num_on_regs_at_once = 3;
+            auto rgb_bpp = 3;
+            auto reg_num_on_line = w * rgb_bpp / 16;
+            auto line = (i*num_on_regs_at_once) / reg_num_on_line;
+            auto j = i % (reg_num_on_line / num_on_regs_at_once);
+
+            // Align registers and store 16 pixels (48 bytes) at once on the line above
+            _mm_storeu_si128(&dst[(line*2 ) *reg_num_on_line + j * 3], _mm_alignr_epi8(rgb1_l0, rgb0_l0, 4));
+            _mm_storeu_si128(&dst[(line*2 ) * reg_num_on_line + j * 3 + 1], _mm_alignr_epi8(rgb2_l0, rgb1_l0, 8));
+            _mm_storeu_si128(&dst[(line*2 ) * reg_num_on_line + j * 3 + 2], _mm_alignr_epi8(rgb3_l0, rgb2_l0, 12));
+
+            // Shuffle rgb triples to the start and end of each register
+            __m128i rgb0_l1 = _mm_shuffle_epi8(rgba_0_7_l1, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14));
+            __m128i rgb1_l1 = _mm_shuffle_epi8(rgba_8_15_l1, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14));
+            __m128i rgb2_l1 = _mm_shuffle_epi8(rgba_16_23_l1, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14));
+            __m128i rgb3_l1 = _mm_shuffle_epi8(rgba_24_32_l1, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15));
+
+            // Align registers and store 16 pixels(48 bytes) at once on the line bellow
+            _mm_storeu_si128(&dst[(line*2  + 1) *reg_num_on_line + j * 3], _mm_alignr_epi8(rgb1_l1, rgb0_l1, 4));
+            _mm_storeu_si128(&dst[(line*2  + 1) * reg_num_on_line + j * 3 + 1], _mm_alignr_epi8(rgb2_l1, rgb1_l1, 8));
+            _mm_storeu_si128(&dst[(line*2  + 1) * reg_num_on_line + j * 3 + 2], _mm_alignr_epi8(rgb3_l1, rgb2_l1, 12));
+        }
+    }
+    void unpack_y411_native(byte * const dest, const byte * s, int w, int h, int actual_size)
     {
-        auto out = dest[0];
+        auto out = dest;
+
         auto index_source = 0;
-        for( auto i = 0; i < h; i += 2 )
+        for (auto i = 0; i < h; i += 2)
         {
-            for( auto j = 0; j < w; j += 2 )
+            for (auto j = 0; j < w; j += 2)
             {
                 auto y411_pix = &s[index_source];
                 auto l0_u0 = y411_pix[0];
@@ -87,24 +305,32 @@ namespace librealsense
                 auto l0_y1 = y411_pix[2];
                 auto l0_v0 = y411_pix[3];
                 auto l1_y0 = y411_pix[4];
-                auto l1_y1 = y411_pix[4];
+                auto l1_y1 = y411_pix[5];
 
                 byte yuv0_0[3] = { l0_y0, l0_u0, l0_v0 };
-                convert_yuv_to_rgb( yuv0_0, &out[i * w * 3 + j * 3] );
+                convert_yuv_to_rgb(yuv0_0, &out[i * w * 3 + j * 3]);
 
                 byte yuv0_1[3] = { l0_y1, l0_u0, l0_v0 };
-                convert_yuv_to_rgb( yuv0_0, &out[i * w * 3 + j * 3 + 3 * 3] );
+                convert_yuv_to_rgb(yuv0_1, &out[i * w * 3 + j * 3 + 3]);
 
                 byte yuv1_0[3] = { l1_y0, l0_u0, l0_v0 };
-                convert_yuv_to_rgb( yuv1_0, &out[( i + 1 ) * w * 3 + j * 3] );
+                convert_yuv_to_rgb(yuv1_0, &out[(i + 1) * w * 3 + j * 3]);
 
                 byte yuv1_1[3] = { l1_y1, l0_u0, l0_v0 };
-                convert_yuv_to_rgb( yuv1_0, &out[( i + 1 ) * w * 3 + j * 3 + 3 * 3] );
+                convert_yuv_to_rgb(yuv1_1, &out[(i + 1) * w * 3 + j * 3 + 3]);
 
                 index_source += 6;
             }
         }
     }
+    void unpack_y411( byte * const dest[], const byte * s, int w, int h, int actual_size )
+    {
+#if defined __SSSE3__
+        unpack_y411_sse(dest[0], s, w, h, actual_size);
+#else
+        unpack_y411_native(dest[0], s, w, h, actual_size);
+#endif
+    }
 
     /////////////////////////////
     // YUY2 unpacking routines //
diff --git a/src/proc/color-formats-converter.h b/src/proc/color-formats-converter.h
index 66a5deb0155..7cbb3204c93 100644
--- a/src/proc/color-formats-converter.h
+++ b/src/proc/color-formats-converter.h
@@ -84,4 +84,7 @@ namespace librealsense
             int actual_size,
             int input_size) override;
     };
+    void unpack_y411(byte * const dest[], const byte * s, int w, int h, int actual_size);
+    void unpack_y411_sse(byte * const dest, const byte * s, int w, int h, int actual_size);
+    void unpack_y411_native(byte * const dest, const byte * s, int w, int h, int actual_size);
 }
diff --git a/src/sensor.cpp b/src/sensor.cpp
index 5d7bdacfab5..dea4b6fdddb 100644
--- a/src/sensor.cpp
+++ b/src/sensor.cpp
@@ -367,6 +367,12 @@ namespace librealsense
                             else
                                 LOG_DEBUG(expected_size << "expected size is smaller than " << sizeof(byte)*fr->data.size() << "actual size");
                         }
+
+                        memcpy((void*)fh->get_frame_data(), fr->data.data(), expected_size);
+                        auto&& video = (video_frame*)fh.frame;
+                        video->assign(width, height, width * bpp / 8, bpp);
+                        video->set_timestamp_domain(timestamp_domain);
+                        fh->set_stream(req_profile_base);
                     }
                     else
                     {
diff --git a/unit-tests/algo/convertions/test-y411.cpp b/unit-tests/algo/convertions/test-y411.cpp
new file mode 100644
index 00000000000..d723c89c0c4
--- /dev/null
+++ b/unit-tests/algo/convertions/test-y411.cpp
@@ -0,0 +1,142 @@
+// License: Apache 2.0. See LICENSE file in root directory.
+// Copyright(c) 2021 Intel Corporation. All Rights Reserved.
+
+//#cmake: static!
+//#cmake:add-file ../../../src/proc/color-formats-converter.cpp
+
+#include "../algo-common.h"
+#include <librealsense2/rsutil.h>
+#include "../src/types.h"
+#include "../src/proc/color-formats-converter.h"
+
+const int W = 16;
+const int H = 2;
+const int BPP_RGB = 24;
+const int BPP_YUV = 24;
+const int BPP_Y11 = 12;
+const int SOURCE_BYTES_Y411 = W * H*BPP_Y11 / 8;
+const int SOURCE_BYTES_YUV = W * H*BPP_YUV / 8;
+const int DEST_BYTES = W * H* BPP_RGB / 8;
+
+struct byte3
+{
+    byte data[3];
+    bool operator==(const byte3& other) const
+    {
+        return (int)data[0] == (int)other.data[0] && (int)data[1] == (int)other.data[1] && (int)data[2] == (int)other.data[2];
+    }
+};
+
+std::ostream & operator<<(std::ostream & os, const byte3& obj)
+{
+    os << (int)obj.data[0] << " " << (int)obj.data[1] << " " << (int)obj.data[2];
+    return os;
+}
+
+void convert_single_yuv_to_rgb(const byte3 yuv, byte3 *rgb)
+{
+    int32_t c = yuv.data[0] - 16;
+    int32_t d = yuv.data[1] - 128;
+    int32_t e = yuv.data[2] - 128;
+
+    int32_t t;
+#define clamp( x ) ( ( t = ( x ) ) > 255 ? 255 : t < 0 ? 0 : t )
+    rgb->data[0] = clamp((298 * c + 409 * e + 128) >> 8);
+    rgb->data[1] = clamp((298 * c - 100 * d - 208 * e + 128) >> 8);
+    rgb->data[2] = clamp((298 * c + 516 * d + 128) >> 8);
+#undef clamp
+}
+
+void convert_yuv_to_rgb(byte3 *rgb, const byte3 * yuv)
+{
+    for (auto i = 0; i < W*H; i++)
+    {
+        convert_single_yuv_to_rgb(yuv[i], &rgb[i]);
+    }
+}
+
+// to simulate _mm_mulhi_epi16
+int16_t mult_hi(int16_t x, int16_t y)
+{
+    auto x_i = (int)x;
+    auto y_i = (int)y;
+
+    auto tmp = x_i * y_i;
+
+    auto res = tmp >> 16;
+    return tmp >> 16;
+}
+
+void convert_single_yuv_to_rgb_sse(const byte3 yuv, byte3 *rgb)
+{
+    int16_t c = (yuv.data[0] - 16)<<4;
+    int16_t d = (yuv.data[1] - 128)<<4;
+    int16_t e = (yuv.data[2] - 128)<<4;
+
+    int16_t t;
+#define clamp( x ) ( ( t = ( x ) ) > 255 ? 255 : t < 0 ? 0 : t )
+    rgb->data[0] = clamp(mult_hi((298 << 4) , c) + mult_hi((409<<4) , e) );
+    rgb->data[1] = clamp(mult_hi((298<<4) , c) - mult_hi((100<<4), d) - mult_hi((208<<4), e) );
+    rgb->data[2] = clamp(mult_hi((298<<4) , c) + mult_hi((516<<4) , d));
+#undef clamp
+}
+
+void convert_yuv_to_rgb_sse(byte3 *rgb, const byte3 * yuv)
+{
+    for (auto i = 0; i < W*H; i++)
+    {
+        convert_single_yuv_to_rgb_sse(yuv[i], &rgb[i]);
+    }
+}
+
+byte3 const yuv[W*H] =
+{
+    {10, 0, 30}, {20, 0, 30},    {70, 60, 90},  {80, 60, 90},    {130, 120, 150}, {140, 120, 150},  {190, 180, 210}, {200, 180, 210},   {250, 240, 270}, {260, 240, 270},   {310, 300, 330}, {320, 300, 330},   {370, 360, 390}, {380, 360, 390},   {430, 420, 450}, {440, 420, 450},
+    {40, 0, 30}, {50, 0, 30},    {100, 60, 90}, {110, 60, 90},   {160, 120, 150}, {170, 120, 150},  {220, 180, 210}, {230, 180, 210},   {280, 240, 270}, {290, 240, 270},   {340, 300, 330}, {350, 300, 330},   {400, 360, 390}, {410, 360, 390},   {460, 420, 450}, {470, 420, 450}
+};
+
+#if defined __SSSE3__
+TEST_CASE("unpack_y411_sse")
+{
+    byte3 dest[DEST_BYTES];
+    byte source[SOURCE_BYTES_Y411];
+
+    for (auto i = 0; i < SOURCE_BYTES_Y411; i++)
+    {
+        source[i] = i*10;
+    }
+
+    librealsense::unpack_y411_sse(&((byte*)dest)[0], source, W, H, SOURCE_BYTES_Y411);
+    byte3 rgb[W*H] = { 0 };
+
+    convert_yuv_to_rgb_sse(&rgb[0], &yuv[0]);
+    for (auto i = 0; i < W*H; i++)
+    {
+        CAPTURE(i);
+        CHECK(dest[i] == rgb[i]);
+    }
+}
+
+#endif
+TEST_CASE("unpack_y411_native")
+{
+    byte3 dest[DEST_BYTES];
+    byte source[SOURCE_BYTES_Y411];
+
+    for (auto i = 0; i < SOURCE_BYTES_Y411; i++)
+    {
+        source[i] = i*10;
+    }
+
+    librealsense::unpack_y411_native(&((byte*)dest)[0], source, W, H, SOURCE_BYTES_Y411);
+
+    byte3 rgb[W*H] = { 0 };
+
+    convert_yuv_to_rgb(&rgb[0], &yuv[0]);
+
+    for (auto i = 0; i < W*H; i++)
+    {
+        CAPTURE(i);
+        CHECK(dest[i] == rgb[i]);
+    }
+}