From 44dbb84267cffad437bb141fb11c07b319ee6298 Mon Sep 17 00:00:00 2001 From: aangerma Date: Sun, 25 Apr 2021 12:53:23 +0300 Subject: [PATCH] Added support for I411 on SSE. --- src/proc/color-formats-converter.cpp | 244 +++++++++++++++++++++- src/proc/color-formats-converter.h | 3 + src/sensor.cpp | 6 + unit-tests/algo/convertions/test-y411.cpp | 142 +++++++++++++ 4 files changed, 386 insertions(+), 9 deletions(-) create mode 100644 unit-tests/algo/convertions/test-y411.cpp diff --git a/src/proc/color-formats-converter.cpp b/src/proc/color-formats-converter.cpp index 771f720be73..9817081cf86 100644 --- a/src/proc/color-formats-converter.cpp +++ b/src/proc/color-formats-converter.cpp @@ -59,6 +59,7 @@ namespace librealsense rgb[2] = clamp( ( 298 * c + 516 * d + 128 ) >> 8 ); #undef clamp } + // The bytes alignment on y411: // Y is luminance and U,V are chrome // each U,V are duplicated for 4 pixels @@ -73,13 +74,230 @@ namespace librealsense // [L1-Y3 U1 V1] [L1-Y2 U1 V1] [L1-Y1 U0 V0] [L1-Y0 U0 V0] // [L1-Y7 U3 V3] [L1-Y6 U3 V3] [L0-Y5 U2 V2] [L0-Y4 U2 V2] // [L1-Y7 U3 V3] [L1-Y6 U3 V3] [L1-Y5 U2 V2] [L1-Y4 U2 V2] - void unpack_y411( byte * const dest[], const byte * s, int w, int h, int actual_size ) + // + //https://www.fourcc.org/pixel-format/yuv-y411/ + + void inline unpack_y411_sse(byte * const dest, const byte * s, int w, int h, int actual_size) + { + auto n = w * h; + // working each iteration on 8 y411 pixels, and extract 4 rgb pixels from each one + // so we get 32 rgb pixels + assert(n % 32 == 0); // All currently supported color resolutions are multiples of 32 pixels. Could easily extend support to other resolutions by copying final n<32 pixels into a zero-padded buffer and recursively calling self for final iteration. + + auto src = reinterpret_cast(s); + auto dst = reinterpret_cast<__m128i *>(dest); + + const __m128i zero = _mm_set1_epi8(0); + const __m128i n100 = _mm_set1_epi16(100 << 4); + const __m128i n208 = _mm_set1_epi16(208 << 4); + const __m128i n298 = _mm_set1_epi16(298 << 4); + const __m128i n409 = _mm_set1_epi16(409 << 4); + const __m128i n516 = _mm_set1_epi16(516 << 4); + + // shuffle to y,u,v of pixels 1-2 + const __m128i shuffle_y_1_2_0 = _mm_setr_epi8(1, 2, 4, 5, 7, 8, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0); // to get yyyyyyyy00000000 + const __m128i shuffle_u_1_2_0 = _mm_setr_epi8(0, 0, 0, 0, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0); // to get uuuuuuuu00000000 + const __m128i shuffle_v_1_2_0 = _mm_setr_epi8(3, 3, 3, 3, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0); // to get vvvvvvvv00000000 + + // shuffle to y,u,v of pixels 3-4 - combination of registers 0 and 1 + const __m128i shuffle_y_3_4_0 = _mm_setr_epi8(13, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // to get yy00000000000000 + const __m128i mask_y_3_4_0 = _mm_setr_epi8(-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // to zero the other bytes + const __m128i shuffle_u_3_4_0 = _mm_setr_epi8(12, 12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // to get uuuu000000000000 + const __m128i shuffle_v_3_4_0 = _mm_setr_epi8(15, 15, 15, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // to get vvvv000000000000 + const __m128i shuffle_y_3_4_1 = _mm_setr_epi8(0, 0, 0, 1, 3, 4, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0); // to get 00yyyyyy00000000 + const __m128i mask_y_3_4_1 = _mm_setr_epi8(0, 0, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0); + const __m128i shuffle_u_3_4_1 = _mm_setr_epi8(0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0); // to get 0000uuuu00000000 + const __m128i shuffle_v_3_4_1 = _mm_setr_epi8(0, 0, 0, 0, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0); // to get 0000vvvv00000000 + + // shuffle to y,u,v of pixels 5-6- combination of registers 1 and 2 + const __m128i shuffle_y_5_6_1 = _mm_setr_epi8(9, 10, 12, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // to get yyyyy00000000000 + const __m128i mask_y_5_6_1 = _mm_setr_epi8(-1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + const __m128i shuffle_u_5_6_1 = _mm_setr_epi8(8, 8, 8, 8, 14, 14, 14, 14, 0, 0, 0, 0, 0, 0, 0, 0); // to get uuuuuuuu00000000 + const __m128i shuffle_v_5_6_1 = _mm_setr_epi8(11, 11, 11, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // to get vvvv000000000000 + const __m128i shuffle_y_5_6_2 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0); // to get 00000yyy00000000 + const __m128i mask_y_5_6_2 = _mm_setr_epi8(0, 0, 0, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0); + const __m128i shuffle_v_5_6_2 = _mm_setr_epi8(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0); // to get 0000vvvv00000000 + + // shuffle to y,u,v of pixels 7-8 + const __m128i shuffle_y_7_8_2 = _mm_setr_epi8(5, 6, 8, 9, 11, 12, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0); // to get yyyyyyyy00000000 + const __m128i shuffle_u_7_8_2 = _mm_setr_epi8(4, 4, 4, 4, 10, 10, 10, 10, 0, 0, 0, 0, 0, 0, 0, 0); // to get uuuuuuuu00000000 + const __m128i shuffle_v_7_8_2 = _mm_setr_epi8(7, 7, 7, 7, 13, 13, 13, 13, 0, 0, 0, 0, 0, 0, 0, 0); // to get vvvvvvvv00000000 + + const __m128i mask_uv_0 = _mm_setr_epi8(-1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + const __m128i mask_uv_1 = _mm_setr_epi8(0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0); + +//#pragma omp parallel for + for (int i = 0; i < n / 32; i++) + { + // Load 8 y411 pixels into 3 16-byte registers + __m128i s0 = _mm_loadu_si128(&src[i * 3]); + __m128i s1 = _mm_loadu_si128(&src[i * 3 + 1]); + __m128i s2 = _mm_loadu_si128(&src[i * 3 + 2]); + + // pixels 1-2 + __m128i pixel_y_1_2 = _mm_shuffle_epi8(s0, shuffle_y_1_2_0); + __m128i pixel_u_1_2 = _mm_shuffle_epi8(s0, shuffle_u_1_2_0); + __m128i pixel_v_1_2 = _mm_shuffle_epi8(s0, shuffle_v_1_2_0); + + // pixels 3-4 + __m128i pixel_y_3_4_register_0 = _mm_shuffle_epi8(s0, shuffle_y_3_4_0); + __m128i pixel_y_3_4_register_1 = _mm_shuffle_epi8(s1, shuffle_y_3_4_1); + pixel_y_3_4_register_0 = _mm_and_si128(pixel_y_3_4_register_0, mask_y_3_4_0); + pixel_y_3_4_register_1 = _mm_and_si128(pixel_y_3_4_register_1, mask_y_3_4_1); + __m128i pixel_y_3_4 = _mm_or_si128(pixel_y_3_4_register_0, pixel_y_3_4_register_1); + + __m128i pixel_u_3_4_register_0 = _mm_shuffle_epi8(s0, shuffle_u_3_4_0); + __m128i pixel_u_3_4_register_1 = _mm_shuffle_epi8(s1, shuffle_u_3_4_1); + pixel_u_3_4_register_0 = _mm_and_si128(pixel_u_3_4_register_0, mask_uv_0); + pixel_u_3_4_register_1 = _mm_and_si128(pixel_u_3_4_register_1, mask_uv_1); + __m128i pixel_u_3_4 = _mm_or_si128(pixel_u_3_4_register_0, pixel_u_3_4_register_1); + + __m128i pixel_v_3_4_register_0 = _mm_shuffle_epi8(s0, shuffle_v_3_4_0); + __m128i pixel_v_3_4_register_1 = _mm_shuffle_epi8(s1, shuffle_v_3_4_1); + pixel_v_3_4_register_0 = _mm_and_si128(pixel_v_3_4_register_0, mask_uv_0); + pixel_v_3_4_register_1 = _mm_and_si128(pixel_v_3_4_register_1, mask_uv_1); + __m128i pixel_v_3_4 = _mm_or_si128(pixel_v_3_4_register_0, pixel_v_3_4_register_1); + + // pixels 5-6 + __m128i pixel_y_5_6_register_1 = _mm_shuffle_epi8(s1, shuffle_y_5_6_1); + __m128i pixel_y_5_6_register_2 = _mm_shuffle_epi8(s2, shuffle_y_5_6_2); + pixel_y_5_6_register_1 = _mm_and_si128(pixel_y_5_6_register_1, mask_y_5_6_1); + pixel_y_5_6_register_2 = _mm_and_si128(pixel_y_5_6_register_2, mask_y_5_6_2); + __m128i pixel_y_5_6 = _mm_or_si128(pixel_y_5_6_register_1, pixel_y_5_6_register_2); + + __m128i pixel_u_5_6_register_1 = _mm_shuffle_epi8(s1, shuffle_u_5_6_1); + __m128i mask_uv = _mm_or_si128(mask_uv_0, mask_uv_1); + __m128i pixel_u_5_6 = _mm_and_si128(pixel_u_5_6_register_1, mask_uv); + + __m128i pixel_v_5_6_register_1 = _mm_shuffle_epi8(s1, shuffle_v_5_6_1); + __m128i pixel_v_5_6_register_2 = _mm_shuffle_epi8(s2, shuffle_v_5_6_2); + pixel_v_5_6_register_1 = _mm_and_si128(pixel_v_5_6_register_1, mask_uv_0); + pixel_v_5_6_register_2 = _mm_and_si128(pixel_v_5_6_register_2, mask_uv_1); + __m128i pixel_v_5_6 = _mm_or_si128(pixel_v_5_6_register_1, pixel_v_5_6_register_2); + + // pixels 7-8 + __m128i pixel_y_7_8 = _mm_shuffle_epi8(s2, shuffle_y_7_8_2); + __m128i pixel_u_7_8 = _mm_shuffle_epi8(s2, shuffle_u_7_8_2); + __m128i pixel_v_7_8 = _mm_shuffle_epi8(s2, shuffle_v_7_8_2); + + // Retrieve all 32 Y components as 16-bit values (8 components per register)) + // Retrieve all 8 u components as 16-bit values (2 components per register)) + // Retrieve all 8 v components as 16-bit values (2 components per register)) + __m128i y16_pix_1_2 = _mm_unpacklo_epi8(pixel_y_1_2, zero); // convert to 16 bit + __m128i u16_pix_1_2 = _mm_unpacklo_epi8(pixel_u_1_2, zero); // convert to 16 bit + __m128i v16_pix_1_2 = _mm_unpacklo_epi8(pixel_v_1_2, zero); + + __m128i y16_pix_3_4 = _mm_unpacklo_epi8(pixel_y_3_4, zero); // convert to 16 bit + __m128i u16_pix_3_4 = _mm_unpacklo_epi8(pixel_u_3_4, zero); // convert to 16 bit + __m128i v16_pix_3_4 = _mm_unpacklo_epi8(pixel_v_3_4, zero); + + __m128i y16_pix_5_6 = _mm_unpacklo_epi8(pixel_y_5_6, zero); // convert to 16 bit + __m128i u16_pix_5_6 = _mm_unpacklo_epi8(pixel_u_5_6, zero); // convert to 16 bit + __m128i v16_pix_5_6 = _mm_unpacklo_epi8(pixel_v_5_6, zero); + + __m128i y16_pix_7_8 = _mm_unpacklo_epi8(pixel_y_7_8, zero); // convert to 16 bit + __m128i u16_pix_7_8 = _mm_unpacklo_epi8(pixel_u_7_8, zero); // convert to 16 bit + __m128i v16_pix_7_8 = _mm_unpacklo_epi8(pixel_v_7_8, zero); + + // r,g,b + __m128i c16_pix_1_2 = _mm_slli_epi16(_mm_subs_epi16(y16_pix_1_2, _mm_set1_epi16(16)), 4); + __m128i d16_pix_1_2 = _mm_slli_epi16(_mm_subs_epi16(u16_pix_1_2, _mm_set1_epi16(128)), 4); // perhaps could have done these u,v to d,e before the duplication + __m128i e16_pix_1_2 = _mm_slli_epi16(_mm_subs_epi16(v16_pix_1_2, _mm_set1_epi16(128)), 4); + __m128i r16_pix_1_2 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_1_2, n298), _mm_mulhi_epi16(e16_pix_1_2, n409)))))); // (298 * c + 409 * e + 128) ; // + __m128i g16_pix_1_2 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16_pix_1_2, n298), _mm_mulhi_epi16(d16_pix_1_2, n100)), _mm_mulhi_epi16(e16_pix_1_2, n208)))))); // (298 * c - 100 * d - 208 * e + 128) + __m128i b16_pix_1_2 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_1_2, n298), _mm_mulhi_epi16(d16_pix_1_2, n516)))))); // clampbyte((298 * c + 516 * d + 128) >> 8); + + __m128i c16_pix_3_4 = _mm_slli_epi16(_mm_subs_epi16(y16_pix_3_4, _mm_set1_epi16(16)), 4); + __m128i d16_pix_3_4 = _mm_slli_epi16(_mm_subs_epi16(u16_pix_3_4, _mm_set1_epi16(128)), 4); // perhaps could have done these u,v to d,e before the duplication + __m128i e16_pix_3_4 = _mm_slli_epi16(_mm_subs_epi16(v16_pix_3_4, _mm_set1_epi16(128)), 4); + __m128i r16_pix_3_4 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_3_4, n298), _mm_mulhi_epi16(e16_pix_3_4, n409)))))); // (298 * c + 409 * e + 128) ; // + __m128i g16_pix_3_4 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16_pix_3_4, n298), _mm_mulhi_epi16(d16_pix_3_4, n100)), _mm_mulhi_epi16(e16_pix_3_4, n208)))))); // (298 * c - 100 * d - 208 * e + 128) + __m128i b16_pix_3_4 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_3_4, n298), _mm_mulhi_epi16(d16_pix_3_4, n516)))))); // clampbyte((298 * c + 516 * d + 128) >> 8); + + __m128i c16_pix_5_6 = _mm_slli_epi16(_mm_subs_epi16(y16_pix_5_6, _mm_set1_epi16(16)), 4); + __m128i d16_pix_5_6 = _mm_slli_epi16(_mm_subs_epi16(u16_pix_5_6, _mm_set1_epi16(128)), 4); // perhaps could have done these u,v to d,e before the duplication + __m128i e16_pix_5_6 = _mm_slli_epi16(_mm_subs_epi16(v16_pix_5_6, _mm_set1_epi16(128)), 4); + __m128i r16_pix_5_6 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_5_6, n298), _mm_mulhi_epi16(e16_pix_5_6, n409)))))); // (298 * c + 409 * e + 128) ; // + __m128i g16_pix_5_6 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16_pix_5_6, n298), _mm_mulhi_epi16(d16_pix_5_6, n100)), _mm_mulhi_epi16(e16_pix_5_6, n208)))))); // (298 * c - 100 * d - 208 * e + 128) + __m128i b16_pix_5_6 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_5_6, n298), _mm_mulhi_epi16(d16_pix_5_6, n516)))))); // clampbyte((298 * c + 516 * d + 128) >> 8); + + __m128i c16_pix_7_8 = _mm_slli_epi16(_mm_subs_epi16(y16_pix_7_8, _mm_set1_epi16(16)), 4); + __m128i d16_pix_7_8 = _mm_slli_epi16(_mm_subs_epi16(u16_pix_7_8, _mm_set1_epi16(128)), 4); // perhaps could have done these u,v to d,e before the duplication + __m128i e16_pix_7_8 = _mm_slli_epi16(_mm_subs_epi16(v16_pix_7_8, _mm_set1_epi16(128)), 4); + __m128i r16_pix_7_8 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_7_8, n298), _mm_mulhi_epi16(e16_pix_7_8, n409)))))); // (298 * c + 409 * e + 128) ; // + __m128i g16_pix_7_8 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_sub_epi16(_mm_sub_epi16(_mm_mulhi_epi16(c16_pix_7_8, n298), _mm_mulhi_epi16(d16_pix_7_8, n100)), _mm_mulhi_epi16(e16_pix_7_8, n208)))))); // (298 * c - 100 * d - 208 * e + 128) + __m128i b16_pix_7_8 = _mm_min_epi16(_mm_set1_epi16(255), _mm_max_epi16(zero, ((_mm_add_epi16(_mm_mulhi_epi16(c16_pix_7_8, n298), _mm_mulhi_epi16(d16_pix_7_8, n516)))))); // clampbyte((298 * c + 516 * d + 128) >> 8); + + // Shuffle separate R, G, B values into four registers storing four pixels each in (R, G, B, A) order + const __m128i evens_odds = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); + + __m128i rg8_pix_1_2 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16_pix_1_2, evens_odds), _mm_shuffle_epi8(g16_pix_1_2, evens_odds)); // hi to take the odds which are the upper bytes we care about + __m128i ba8_pix_1_2 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16_pix_1_2, evens_odds), _mm_set1_epi8(-1)); + __m128i rg8_pix_3_4 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16_pix_3_4, evens_odds), _mm_shuffle_epi8(g16_pix_3_4, evens_odds)); // hi to take the odds which are the upper bytes we care about + __m128i ba8_pix_3_4 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16_pix_3_4, evens_odds), _mm_set1_epi8(-1)); + __m128i rg8_pix_5_6 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16_pix_5_6, evens_odds), _mm_shuffle_epi8(g16_pix_5_6, evens_odds)); // hi to take the odds which are the upper bytes we care about + __m128i ba8_pix_5_6 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16_pix_5_6, evens_odds), _mm_set1_epi8(-1)); + __m128i rg8_pix_7_8 = _mm_unpacklo_epi8(_mm_shuffle_epi8(r16_pix_7_8, evens_odds), _mm_shuffle_epi8(g16_pix_7_8, evens_odds)); // hi to take the odds which are the upper bytes we care about + __m128i ba8_pix_7_8 = _mm_unpacklo_epi8(_mm_shuffle_epi8(b16_pix_7_8, evens_odds), _mm_set1_epi8(-1)); + + __m128i rgba_0_3 = _mm_unpacklo_epi16(rg8_pix_1_2, ba8_pix_1_2); + __m128i rgba_4_7 = _mm_unpackhi_epi16(rg8_pix_1_2, ba8_pix_1_2); + __m128i rgba_8_11 = _mm_unpacklo_epi16(rg8_pix_3_4, ba8_pix_3_4); + __m128i rgba_12_15 = _mm_unpackhi_epi16(rg8_pix_3_4, ba8_pix_3_4); + __m128i rgba_16_19 = _mm_unpacklo_epi16(rg8_pix_5_6, ba8_pix_5_6); + __m128i rgba_20_23 = _mm_unpackhi_epi16(rg8_pix_5_6, ba8_pix_5_6); + __m128i rgba_24_27 = _mm_unpacklo_epi16(rg8_pix_7_8, ba8_pix_7_8); + __m128i rgba_28_32 = _mm_unpackhi_epi16(rg8_pix_7_8, ba8_pix_7_8); + + // Shuffle rgb triples to the start and end of each register + __m128i rgba_0_7_l0 = _mm_unpacklo_epi64(rgba_0_3, rgba_4_7); + __m128i rgba_0_7_l1 = _mm_unpackhi_epi64(rgba_0_3, rgba_4_7); + __m128i rgba_8_15_l0 = _mm_unpacklo_epi64(rgba_8_11, rgba_12_15); + __m128i rgba_8_15_l1 = _mm_unpackhi_epi64(rgba_8_11, rgba_12_15); + __m128i rgba_16_23_l0 = _mm_unpacklo_epi64(rgba_16_19, rgba_20_23); + __m128i rgba_16_23_l1 = _mm_unpackhi_epi64(rgba_16_19, rgba_20_23); + __m128i rgba_24_32_l0 = _mm_unpacklo_epi64(rgba_24_27, rgba_28_32); + __m128i rgba_24_32_l1 = _mm_unpackhi_epi64(rgba_24_27, rgba_28_32); + + // Shuffle rgb triples to the start and end of each register + __m128i rgb0_l0 = _mm_shuffle_epi8(rgba_0_7_l0, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14)); + __m128i rgb1_l0 = _mm_shuffle_epi8(rgba_8_15_l0, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14)); + __m128i rgb2_l0 = _mm_shuffle_epi8(rgba_16_23_l0, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14)); + __m128i rgb3_l0 = _mm_shuffle_epi8(rgba_24_32_l0, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15)); + + + // calculate the current line and column + auto num_on_regs_at_once = 3; + auto rgb_bpp = 3; + auto reg_num_on_line = w * rgb_bpp / 16; + auto line = (i*num_on_regs_at_once) / reg_num_on_line; + auto j = i % (reg_num_on_line / num_on_regs_at_once); + + // Align registers and store 16 pixels (48 bytes) at once on the line above + _mm_storeu_si128(&dst[(line*2 ) *reg_num_on_line + j * 3], _mm_alignr_epi8(rgb1_l0, rgb0_l0, 4)); + _mm_storeu_si128(&dst[(line*2 ) * reg_num_on_line + j * 3 + 1], _mm_alignr_epi8(rgb2_l0, rgb1_l0, 8)); + _mm_storeu_si128(&dst[(line*2 ) * reg_num_on_line + j * 3 + 2], _mm_alignr_epi8(rgb3_l0, rgb2_l0, 12)); + + // Shuffle rgb triples to the start and end of each register + __m128i rgb0_l1 = _mm_shuffle_epi8(rgba_0_7_l1, _mm_setr_epi8(3, 7, 11, 15, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14)); + __m128i rgb1_l1 = _mm_shuffle_epi8(rgba_8_15_l1, _mm_setr_epi8(0, 1, 2, 4, 3, 7, 11, 15, 5, 6, 8, 9, 10, 12, 13, 14)); + __m128i rgb2_l1 = _mm_shuffle_epi8(rgba_16_23_l1, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 3, 7, 11, 15, 10, 12, 13, 14)); + __m128i rgb3_l1 = _mm_shuffle_epi8(rgba_24_32_l1, _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15)); + + // Align registers and store 16 pixels(48 bytes) at once on the line bellow + _mm_storeu_si128(&dst[(line*2 + 1) *reg_num_on_line + j * 3], _mm_alignr_epi8(rgb1_l1, rgb0_l1, 4)); + _mm_storeu_si128(&dst[(line*2 + 1) * reg_num_on_line + j * 3 + 1], _mm_alignr_epi8(rgb2_l1, rgb1_l1, 8)); + _mm_storeu_si128(&dst[(line*2 + 1) * reg_num_on_line + j * 3 + 2], _mm_alignr_epi8(rgb3_l1, rgb2_l1, 12)); + } + } + void unpack_y411_native(byte * const dest, const byte * s, int w, int h, int actual_size) { - auto out = dest[0]; + auto out = dest; + auto index_source = 0; - for( auto i = 0; i < h; i += 2 ) + for (auto i = 0; i < h; i += 2) { - for( auto j = 0; j < w; j += 2 ) + for (auto j = 0; j < w; j += 2) { auto y411_pix = &s[index_source]; auto l0_u0 = y411_pix[0]; @@ -87,24 +305,32 @@ namespace librealsense auto l0_y1 = y411_pix[2]; auto l0_v0 = y411_pix[3]; auto l1_y0 = y411_pix[4]; - auto l1_y1 = y411_pix[4]; + auto l1_y1 = y411_pix[5]; byte yuv0_0[3] = { l0_y0, l0_u0, l0_v0 }; - convert_yuv_to_rgb( yuv0_0, &out[i * w * 3 + j * 3] ); + convert_yuv_to_rgb(yuv0_0, &out[i * w * 3 + j * 3]); byte yuv0_1[3] = { l0_y1, l0_u0, l0_v0 }; - convert_yuv_to_rgb( yuv0_0, &out[i * w * 3 + j * 3 + 3 * 3] ); + convert_yuv_to_rgb(yuv0_1, &out[i * w * 3 + j * 3 + 3]); byte yuv1_0[3] = { l1_y0, l0_u0, l0_v0 }; - convert_yuv_to_rgb( yuv1_0, &out[( i + 1 ) * w * 3 + j * 3] ); + convert_yuv_to_rgb(yuv1_0, &out[(i + 1) * w * 3 + j * 3]); byte yuv1_1[3] = { l1_y1, l0_u0, l0_v0 }; - convert_yuv_to_rgb( yuv1_0, &out[( i + 1 ) * w * 3 + j * 3 + 3 * 3] ); + convert_yuv_to_rgb(yuv1_1, &out[(i + 1) * w * 3 + j * 3 + 3]); index_source += 6; } } } + void unpack_y411( byte * const dest[], const byte * s, int w, int h, int actual_size ) + { +#if defined __SSSE3__ + unpack_y411_sse(dest[0], s, w, h, actual_size); +#else + unpack_y411_native(dest[0], s, w, h, actual_size); +#endif + } ///////////////////////////// // YUY2 unpacking routines // diff --git a/src/proc/color-formats-converter.h b/src/proc/color-formats-converter.h index 66a5deb0155..7cbb3204c93 100644 --- a/src/proc/color-formats-converter.h +++ b/src/proc/color-formats-converter.h @@ -84,4 +84,7 @@ namespace librealsense int actual_size, int input_size) override; }; + void unpack_y411(byte * const dest[], const byte * s, int w, int h, int actual_size); + void unpack_y411_sse(byte * const dest, const byte * s, int w, int h, int actual_size); + void unpack_y411_native(byte * const dest, const byte * s, int w, int h, int actual_size); } diff --git a/src/sensor.cpp b/src/sensor.cpp index 5d7bdacfab5..dea4b6fdddb 100644 --- a/src/sensor.cpp +++ b/src/sensor.cpp @@ -367,6 +367,12 @@ namespace librealsense else LOG_DEBUG(expected_size << "expected size is smaller than " << sizeof(byte)*fr->data.size() << "actual size"); } + + memcpy((void*)fh->get_frame_data(), fr->data.data(), expected_size); + auto&& video = (video_frame*)fh.frame; + video->assign(width, height, width * bpp / 8, bpp); + video->set_timestamp_domain(timestamp_domain); + fh->set_stream(req_profile_base); } else { diff --git a/unit-tests/algo/convertions/test-y411.cpp b/unit-tests/algo/convertions/test-y411.cpp new file mode 100644 index 00000000000..d723c89c0c4 --- /dev/null +++ b/unit-tests/algo/convertions/test-y411.cpp @@ -0,0 +1,142 @@ +// License: Apache 2.0. See LICENSE file in root directory. +// Copyright(c) 2021 Intel Corporation. All Rights Reserved. + +//#cmake: static! +//#cmake:add-file ../../../src/proc/color-formats-converter.cpp + +#include "../algo-common.h" +#include +#include "../src/types.h" +#include "../src/proc/color-formats-converter.h" + +const int W = 16; +const int H = 2; +const int BPP_RGB = 24; +const int BPP_YUV = 24; +const int BPP_Y11 = 12; +const int SOURCE_BYTES_Y411 = W * H*BPP_Y11 / 8; +const int SOURCE_BYTES_YUV = W * H*BPP_YUV / 8; +const int DEST_BYTES = W * H* BPP_RGB / 8; + +struct byte3 +{ + byte data[3]; + bool operator==(const byte3& other) const + { + return (int)data[0] == (int)other.data[0] && (int)data[1] == (int)other.data[1] && (int)data[2] == (int)other.data[2]; + } +}; + +std::ostream & operator<<(std::ostream & os, const byte3& obj) +{ + os << (int)obj.data[0] << " " << (int)obj.data[1] << " " << (int)obj.data[2]; + return os; +} + +void convert_single_yuv_to_rgb(const byte3 yuv, byte3 *rgb) +{ + int32_t c = yuv.data[0] - 16; + int32_t d = yuv.data[1] - 128; + int32_t e = yuv.data[2] - 128; + + int32_t t; +#define clamp( x ) ( ( t = ( x ) ) > 255 ? 255 : t < 0 ? 0 : t ) + rgb->data[0] = clamp((298 * c + 409 * e + 128) >> 8); + rgb->data[1] = clamp((298 * c - 100 * d - 208 * e + 128) >> 8); + rgb->data[2] = clamp((298 * c + 516 * d + 128) >> 8); +#undef clamp +} + +void convert_yuv_to_rgb(byte3 *rgb, const byte3 * yuv) +{ + for (auto i = 0; i < W*H; i++) + { + convert_single_yuv_to_rgb(yuv[i], &rgb[i]); + } +} + +// to simulate _mm_mulhi_epi16 +int16_t mult_hi(int16_t x, int16_t y) +{ + auto x_i = (int)x; + auto y_i = (int)y; + + auto tmp = x_i * y_i; + + auto res = tmp >> 16; + return tmp >> 16; +} + +void convert_single_yuv_to_rgb_sse(const byte3 yuv, byte3 *rgb) +{ + int16_t c = (yuv.data[0] - 16)<<4; + int16_t d = (yuv.data[1] - 128)<<4; + int16_t e = (yuv.data[2] - 128)<<4; + + int16_t t; +#define clamp( x ) ( ( t = ( x ) ) > 255 ? 255 : t < 0 ? 0 : t ) + rgb->data[0] = clamp(mult_hi((298 << 4) , c) + mult_hi((409<<4) , e) ); + rgb->data[1] = clamp(mult_hi((298<<4) , c) - mult_hi((100<<4), d) - mult_hi((208<<4), e) ); + rgb->data[2] = clamp(mult_hi((298<<4) , c) + mult_hi((516<<4) , d)); +#undef clamp +} + +void convert_yuv_to_rgb_sse(byte3 *rgb, const byte3 * yuv) +{ + for (auto i = 0; i < W*H; i++) + { + convert_single_yuv_to_rgb_sse(yuv[i], &rgb[i]); + } +} + +byte3 const yuv[W*H] = +{ + {10, 0, 30}, {20, 0, 30}, {70, 60, 90}, {80, 60, 90}, {130, 120, 150}, {140, 120, 150}, {190, 180, 210}, {200, 180, 210}, {250, 240, 270}, {260, 240, 270}, {310, 300, 330}, {320, 300, 330}, {370, 360, 390}, {380, 360, 390}, {430, 420, 450}, {440, 420, 450}, + {40, 0, 30}, {50, 0, 30}, {100, 60, 90}, {110, 60, 90}, {160, 120, 150}, {170, 120, 150}, {220, 180, 210}, {230, 180, 210}, {280, 240, 270}, {290, 240, 270}, {340, 300, 330}, {350, 300, 330}, {400, 360, 390}, {410, 360, 390}, {460, 420, 450}, {470, 420, 450} +}; + +#if defined __SSSE3__ +TEST_CASE("unpack_y411_sse") +{ + byte3 dest[DEST_BYTES]; + byte source[SOURCE_BYTES_Y411]; + + for (auto i = 0; i < SOURCE_BYTES_Y411; i++) + { + source[i] = i*10; + } + + librealsense::unpack_y411_sse(&((byte*)dest)[0], source, W, H, SOURCE_BYTES_Y411); + byte3 rgb[W*H] = { 0 }; + + convert_yuv_to_rgb_sse(&rgb[0], &yuv[0]); + for (auto i = 0; i < W*H; i++) + { + CAPTURE(i); + CHECK(dest[i] == rgb[i]); + } +} + +#endif +TEST_CASE("unpack_y411_native") +{ + byte3 dest[DEST_BYTES]; + byte source[SOURCE_BYTES_Y411]; + + for (auto i = 0; i < SOURCE_BYTES_Y411; i++) + { + source[i] = i*10; + } + + librealsense::unpack_y411_native(&((byte*)dest)[0], source, W, H, SOURCE_BYTES_Y411); + + byte3 rgb[W*H] = { 0 }; + + convert_yuv_to_rgb(&rgb[0], &yuv[0]); + + for (auto i = 0; i < W*H; i++) + { + CAPTURE(i); + CHECK(dest[i] == rgb[i]); + } +}