// Combines four byte vectors (s00/s01 from one row, s10/s11 from another)
// into a single byte vector.
//
// Each input is split into its even- and odd-indexed bytes, widened to
// 16 bits via vec_mule / vec_mulo against K8_01 (presumably a vector of
// ones, making the multiply a plain widening -- confirm at its definition).
// The four-argument Average16 helper is defined elsewhere; it presumably
// averages its four operands element-wise.
SIMD_INLINE v128_u8 Average8(const v128_u8 & s00, const v128_u8 & s01, const v128_u8 & s10, const v128_u8 & s11)
 {
     // First column pair: s00 (one row) together with s10 (the other row).
     const v128_u16 even00 = vec_mule(s00, K8_01);
     const v128_u16 odd00 = vec_mulo(s00, K8_01);
     const v128_u16 even10 = vec_mule(s10, K8_01);
     const v128_u16 odd10 = vec_mulo(s10, K8_01);
     const v128_u16 first = Average16(even00, odd00, even10, odd10);

     // Second column pair: s01 together with s11.
     const v128_u16 even01 = vec_mule(s01, K8_01);
     const v128_u16 odd01 = vec_mulo(s01, K8_01);
     const v128_u16 even11 = vec_mule(s11, K8_01);
     const v128_u16 odd11 = vec_mulo(s11, K8_01);
     const v128_u16 second = Average16(even01, odd01, even11, odd11);

     // vec_pack narrows both 16-bit vectors back into one byte vector.
     return vec_pack(first, second);
 }
Beispiel #2
0
        // Converts one block of BGRA data to planar Y, U and V output.
        //
        // Reads start at byte offsets 0 and 64 of bgra (vector offsets 0 and 4);
        // writes 32 bytes to y and 16 bytes each to u and v. The `align`
        // template flag is forwarded to the Store / LoadAndConvertY8 helpers.
        //
        // LoadAndConvertY8 returns the packed Y bytes and fills the two scratch
        // arrays (presumably interleaved blue/red and green/1 16-bit values --
        // confirm against that helper). Average16 on a [2][2] array is defined
        // elsewhere and presumably averages horizontally neighbouring pixels
        // in place.
        template <bool align> SIMD_INLINE void BgraToYuv422p(const uint8_t * bgra, uint8_t * y, uint8_t * u, uint8_t * v)
        {
            __m128i * src = (__m128i*)bgra;
            __m128i * dstY = (__m128i*)y;

            __m128i blueRed16[2][2], greenOne16[2][2];

            // Luma first: each call also leaves the chroma inputs in the scratch arrays.
            Store<align>(dstY + 0, LoadAndConvertY8<align>(src + 0, blueRed16[0], greenOne16[0]));
            Store<align>(dstY + 1, LoadAndConvertY8<align>(src + 4, blueRed16[1], greenOne16[1]));

            Average16(blueRed16);
            Average16(greenOne16);

            // Chroma: two 16-bit halves per plane, packed to bytes with unsigned saturation.
            const __m128i u0 = ConvertU16(blueRed16[0], greenOne16[0]);
            const __m128i u1 = ConvertU16(blueRed16[1], greenOne16[1]);
            Store<align>((__m128i*)u, _mm_packus_epi16(u0, u1));

            const __m128i v0 = ConvertV16(blueRed16[0], greenOne16[0]);
            const __m128i v1 = ConvertV16(blueRed16[1], greenOne16[1]);
            Store<align>((__m128i*)v, _mm_packus_epi16(v0, v1));
        }
 // Combines four byte vectors (s00/s01 from one row, s10/s11 from another)
 // into a single byte vector.
 //
 // Every input is split into two 16-bit vectors: the bytes selected by
 // K16_00FF directly, and the bytes selected by the same mask after a
 // one-byte right shift of the whole register. K16_00FF is presumably
 // 0x00FF in every 16-bit lane, which would make these the even- and
 // odd-indexed bytes -- confirm at its definition. The four-argument
 // Average16 helper is defined elsewhere; it presumably averages its
 // operands lane-wise.
 SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
 {
     // First column pair: s00 (one row) together with s10 (the other row).
     const __m128i even00 = _mm_and_si128(s00, K16_00FF);
     const __m128i odd00 = _mm_and_si128(_mm_srli_si128(s00, 1), K16_00FF);
     const __m128i even10 = _mm_and_si128(s10, K16_00FF);
     const __m128i odd10 = _mm_and_si128(_mm_srli_si128(s10, 1), K16_00FF);
     const __m128i first = Average16(even00, odd00, even10, odd10);

     // Second column pair: s01 together with s11.
     const __m128i even01 = _mm_and_si128(s01, K16_00FF);
     const __m128i odd01 = _mm_and_si128(_mm_srli_si128(s01, 1), K16_00FF);
     const __m128i even11 = _mm_and_si128(s11, K16_00FF);
     const __m128i odd11 = _mm_and_si128(_mm_srli_si128(s11, 1), K16_00FF);
     const __m128i second = Average16(even01, odd01, even11, odd11);

     // Narrow both 16-bit vectors to bytes with unsigned saturation.
     return _mm_packus_epi16(first, second);
 }
Beispiel #4
0
        // Converts a two-row block of BGR data to planar Y, U and V output.
        //
        // The second source row starts at bgr0 + bgrStride and the second Y row
        // at y0 + yStride. For each row, data is loaded at vector offsets 0 and 3
        // (byte offsets 0 and 96) and 64 bytes of Y are stored; u and v each
        // receive one 32-byte store. The `align` template flag is forwarded to
        // the LoadBgr / Store helpers.
        //
        // LoadBgr, BgrToY8, Average16, BgrToU16, BgrToV16 and PackU16ToU8 are
        // defined elsewhere. Average16(a, b) is used here by value: its result
        // overwrites the row-0 entry with the combination of both rows.
        template <bool align> SIMD_INLINE void BgrToYuv420p(const uint8_t * bgr0, size_t bgrStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v)
        {
            __m256i * src[2] = { (__m256i*)bgr0, (__m256i*)(bgr0 + bgrStride) };
            __m256i * dstY[2] = { (__m256i*)y0, (__m256i*)(y0 + yStride) };

            // Indexed as [row][half].
            __m256i b[2][2], g[2][2], r[2][2];

            // Luma: load each half of each row, then store its Y bytes.
            for (size_t row = 0; row < 2; ++row)
            {
                for (size_t half = 0; half < 2; ++half)
                {
                    LoadBgr<align>(src[row] + 3 * half, b[row][half], g[row][half], r[row][half]);
                    Store<align>(dstY[row] + half, BgrToY8(b[row][half], g[row][half], r[row][half]));
                }
            }

            // Fold row 1 into row 0, channel by channel.
            for (size_t half = 0; half < 2; ++half)
                b[0][half] = Average16(b[0][half], b[1][half]);
            for (size_t half = 0; half < 2; ++half)
                g[0][half] = Average16(g[0][half], g[1][half]);
            for (size_t half = 0; half < 2; ++half)
                r[0][half] = Average16(r[0][half], r[1][half]);

            // Chroma is computed from the folded row-0 values only.
            Store<align>((__m256i*)u, PackU16ToU8(
                BgrToU16(b[0][0], g[0][0], r[0][0]),
                BgrToU16(b[0][1], g[0][1], r[0][1])));
            Store<align>((__m256i*)v, PackU16ToU8(
                BgrToV16(b[0][0], g[0][0], r[0][0]),
                BgrToV16(b[0][1], g[0][1], r[0][1])));
        }
Beispiel #5
0
        // Converts a two-row block of BGRA data to planar Y, U and V output.
        //
        // The second source row starts at bgra0 + bgraStride and the second Y
        // row at y0 + yStride. For each row, data is read starting at vector
        // offsets 0 and 4 (byte offsets 0 and 64) and 32 bytes of Y are stored;
        // u and v each receive one 16-byte store. The `align` template flag is
        // forwarded to the Store / LoadAndConvertY8 helpers.
        //
        // LoadAndConvertY8 returns packed Y bytes and fills the scratch arrays
        // (presumably interleaved blue/red and green/1 16-bit values -- confirm
        // against that helper).
        // NOTE(review): the two-argument Average16 result is not captured, so it
        // presumably updates its first argument in place through a reference --
        // confirm against its declaration.
        template <bool align> SIMD_INLINE void BgraToYuv420p(const uint8_t * bgra0, size_t bgraStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v)
        {
            __m128i * src[2] = { (__m128i*)bgra0, (__m128i*)(bgra0 + bgraStride) };
            __m128i * dstY[2] = { (__m128i*)y0, (__m128i*)(y0 + yStride) };

            // Indexed as [row][half][vector].
            __m128i blueRed16[2][2][2], greenOne16[2][2][2];

            // Luma: row by row, half by half.
            for (size_t row = 0; row < 2; ++row)
            {
                for (size_t half = 0; half < 2; ++half)
                {
                    Store<align>(dstY[row] + half,
                        LoadAndConvertY8<align>(src[row] + 4 * half, blueRed16[row][half], greenOne16[row][half]));
                }
            }

            // Fold row 1 into row 0 for every scratch vector.
            for (size_t half = 0; half < 2; ++half)
                for (size_t k = 0; k < 2; ++k)
                    Average16(blueRed16[0][half][k], blueRed16[1][half][k]);

            for (size_t half = 0; half < 2; ++half)
                for (size_t k = 0; k < 2; ++k)
                    Average16(greenOne16[0][half][k], greenOne16[1][half][k]);

            // Chroma is computed from the row-0 scratch values only, then packed
            // to bytes with unsigned saturation.
            const __m128i u0 = ConvertU16(blueRed16[0][0], greenOne16[0][0]);
            const __m128i u1 = ConvertU16(blueRed16[0][1], greenOne16[0][1]);
            Store<align>((__m128i*)u, _mm_packus_epi16(u0, u1));

            const __m128i v0 = ConvertV16(blueRed16[0][0], greenOne16[0][0]);
            const __m128i v1 = ConvertV16(blueRed16[0][1], greenOne16[0][1]);
            Store<align>((__m128i*)v, _mm_packus_epi16(v0, v1));
        }
Beispiel #6
0
        // Converts one block of BGR data to planar Y, U and V output.
        //
        // Data is loaded at vector offsets 0 and 3 of bgr (byte offsets 0 and
        // 48); 32 bytes are written to y and 16 bytes each to u and v. The
        // `align` template flag is forwarded to the LoadBgr / Store helpers.
        //
        // LoadBgr, BgrToY8, BgrToU16 and BgrToV16 are defined elsewhere.
        // NOTE(review): the one-argument Average16 result is not captured, so it
        // presumably averages neighbouring values in place through a reference
        // -- confirm against its declaration.
        template <bool align> SIMD_INLINE void BgrToYuv422p(const uint8_t * bgr, uint8_t * y, uint8_t * u, uint8_t * v)
        {
            __m128i * src = (__m128i*)bgr;
            __m128i * dstY = (__m128i*)y;

            // One entry per loaded half.
            __m128i b[2], g[2], r[2];

            // Luma: load each half, then store its Y bytes.
            for (size_t half = 0; half < 2; ++half)
            {
                LoadBgr<align>(src + 3 * half, b[half], g[half], r[half]);
                Store<align>(dstY + half, BgrToY8(b[half], g[half], r[half]));
            }

            // Per-channel averaging ahead of the chroma conversion.
            for (size_t half = 0; half < 2; ++half)
                Average16(b[half]);
            for (size_t half = 0; half < 2; ++half)
                Average16(g[half]);
            for (size_t half = 0; half < 2; ++half)
                Average16(r[half]);

            // Chroma: two 16-bit halves per plane, packed to bytes with unsigned saturation.
            const __m128i u0 = BgrToU16(b[0], g[0], r[0]);
            const __m128i u1 = BgrToU16(b[1], g[1], r[1]);
            Store<align>((__m128i*)u, _mm_packus_epi16(u0, u1));

            const __m128i v0 = BgrToV16(b[0], g[0], r[0]);
            const __m128i v1 = BgrToV16(b[1], g[1], r[1]);
            Store<align>((__m128i*)v, _mm_packus_epi16(v0, v1));
        }