// Collapses four byte vectors into a single byte vector.
//
// Every input is widened to 16 bits twice: vec_mule(x, K8_01) takes the even-numbered
// bytes and vec_mulo(x, K8_01) takes the odd-numbered ones. The widened halves of
// (s00, s10) go to one Average16 call, those of (s01, s11) to another, and the two
// 16-bit results are narrowed back to bytes with vec_pack, which keeps the low byte of
// each 16-bit element (first result fills the leading half, second the trailing half).
//
// NOTE(review): K8_01 is presumably a vector of byte 1s, so the multiply is only a
// widening step, and Average16 presumably yields the mean of its four arguments, i.e.
// a 2x2 block average when sX0/sX1 are adjacent vectors of row X - confirm both
// against their definitions.
SIMD_INLINE v128_u8 Average8(const v128_u8 & s00, const v128_u8 & s01, const v128_u8 & s10, const v128_u8 & s11)
{
    // Built from the first vector of each pair (s00, s10).
    const v128_u16 leading = Average16(
        vec_mule(s00, K8_01),
        vec_mulo(s00, K8_01),
        vec_mule(s10, K8_01),
        vec_mulo(s10, K8_01));

    // Built from the second vector of each pair (s01, s11).
    const v128_u16 trailing = Average16(
        vec_mule(s01, K8_01),
        vec_mulo(s01, K8_01),
        vec_mule(s11, K8_01),
        vec_mulo(s11, K8_01));

    return vec_pack(leading, trailing);
}
// Converts one chunk of a BGRA row into planar Y, U and V output.
//
// Reads from `bgra` starting at 128-bit vector offsets 0 and 4, writes two 128-bit
// vectors to `y` and one 128-bit vector each to `u` and `v`. `align` is forwarded to
// Store and LoadAndConvertY8.
//
// NOTE(review): LoadAndConvertY8 presumably consumes four vectors (16 BGRA pixels) per
// call and leaves intermediate blue/red and green/1 16-bit data in its two output
// arrays; Average16 on a [2][2] array presumably finishes the horizontal averaging
// in place before chroma is computed - confirm against those helpers.
template <bool align> SIMD_INLINE void BgraToYuv422p(const uint8_t * bgra, uint8_t * y, uint8_t * u, uint8_t * v)
{
    __m128i * const src = (__m128i*)bgra;
    __m128i * const luma = (__m128i*)y;

    // Per-half intermediates filled by LoadAndConvertY8 and reused for U/V below.
    __m128i blueRed[2][2], greenOne[2][2];

    // Luma is written straight away; the helper also fills the intermediates.
    Store<align>(luma + 0, LoadAndConvertY8<align>(src + 0, blueRed[0], greenOne[0]));
    Store<align>(luma + 1, LoadAndConvertY8<align>(src + 4, blueRed[1], greenOne[1]));

    // In-place update of the intermediates ahead of the chroma conversion.
    Average16(blueRed);
    Average16(greenOne);

    // Chroma: both halves are packed into one vector of unsigned-saturated bytes.
    const __m128i packedU = _mm_packus_epi16(
        ConvertU16(blueRed[0], greenOne[0]),
        ConvertU16(blueRed[1], greenOne[1]));
    Store<align>((__m128i*)u, packedU);

    const __m128i packedV = _mm_packus_epi16(
        ConvertV16(blueRed[0], greenOne[0]),
        ConvertV16(blueRed[1], greenOne[1]));
    Store<align>((__m128i*)v, packedV);
}
// Collapses four 128-bit byte vectors into one.
//
// Each source is split into two 16-bit vectors: the source masked with K16_00FF, and
// the source shifted right by one byte and then masked with K16_00FF. The four vectors
// derived from (s00, s10) are passed to one Average16 call, the four from (s01, s11)
// to another, and the two results are packed to bytes with unsigned saturation
// (first result in the low 8 bytes, second in the high 8 bytes).
//
// NOTE(review): K16_00FF is presumably 0x00FF in every 16-bit lane, which makes the two
// masked vectors the even and odd bytes of the source; Average16 presumably returns
// the mean of its four arguments (a 2x2 block average when sX0/sX1 are adjacent
// vectors of row X) - confirm against their definitions.
SIMD_INLINE __m128i Average8(const __m128i & s00, const __m128i & s01, const __m128i & s10, const __m128i & s11)
{
    // Source masked directly.
    const __m128i even00 = _mm_and_si128(s00, K16_00FF);
    const __m128i even01 = _mm_and_si128(s01, K16_00FF);
    const __m128i even10 = _mm_and_si128(s10, K16_00FF);
    const __m128i even11 = _mm_and_si128(s11, K16_00FF);

    // Source moved down by one byte before masking.
    const __m128i odd00 = _mm_and_si128(_mm_srli_si128(s00, 1), K16_00FF);
    const __m128i odd01 = _mm_and_si128(_mm_srli_si128(s01, 1), K16_00FF);
    const __m128i odd10 = _mm_and_si128(_mm_srli_si128(s10, 1), K16_00FF);
    const __m128i odd11 = _mm_and_si128(_mm_srli_si128(s11, 1), K16_00FF);

    const __m128i leading = Average16(even00, odd00, even10, odd10);
    const __m128i trailing = Average16(even01, odd01, even11, odd11);

    return _mm_packus_epi16(leading, trailing);
}
// Converts one chunk spanning two consecutive BGR rows into planar Y, U and V output.
//
// The second source row starts at bgr0 + bgrStride and the second luma row at
// y0 + yStride. For each row, LoadBgr is called at 256-bit vector offsets 0 and 3 of
// the source, and the BgrToY8 result is stored at vector offsets 0 and 1 of the luma
// row. The per-channel vectors of the two rows are then merged with Average16 and
// turned into one 256-bit vector of U and one of V. `align` is forwarded to LoadBgr
// and Store.
//
// NOTE(review): LoadBgr presumably de-interleaves three source vectors (32 BGR pixels)
// into separate blue/green/red vectors, and the two-argument Average16 presumably
// produces the 2x2 block mean as 16-bit values - confirm against those helpers.
template <bool align> SIMD_INLINE void BgrToYuv420p(const uint8_t * bgr0, size_t bgrStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v)
{
    const uint8_t * const srcRows[2] = { bgr0, bgr0 + bgrStride };
    uint8_t * const lumaRows[2] = { y0, y0 + yStride };

    // Indexed as [row][half].
    __m256i blue[2][2], green[2][2], red[2][2];

    // Luma pass: same load/store order as a hand-unrolled row 0 then row 1 sequence.
    for (int row = 0; row < 2; ++row)
    {
        __m256i * const src = (__m256i*)srcRows[row];
        __m256i * const dst = (__m256i*)lumaRows[row];
        for (int half = 0; half < 2; ++half)
        {
            LoadBgr<align>(src + 3 * half, blue[row][half], green[row][half], red[row][half]);
            Store<align>(dst + half, BgrToY8(blue[row][half], green[row][half], red[row][half]));
        }
    }

    // Fold row 1 into row 0; each assignment only touches its own channel/half.
    for (int half = 0; half < 2; ++half)
    {
        blue[0][half] = Average16(blue[0][half], blue[1][half]);
        green[0][half] = Average16(green[0][half], green[1][half]);
        red[0][half] = Average16(red[0][half], red[1][half]);
    }

    // Chroma pass: both halves are packed into a single output vector per plane.
    Store<align>((__m256i*)u, PackU16ToU8(
        BgrToU16(blue[0][0], green[0][0], red[0][0]),
        BgrToU16(blue[0][1], green[0][1], red[0][1])));
    Store<align>((__m256i*)v, PackU16ToU8(
        BgrToV16(blue[0][0], green[0][0], red[0][0]),
        BgrToV16(blue[0][1], green[0][1], red[0][1])));
}
// Converts one chunk spanning two consecutive BGRA rows into planar Y, U and V output.
//
// The second source row starts at bgra0 + bgraStride and the second luma row at
// y0 + yStride. For each row, LoadAndConvertY8 is called at 128-bit vector offsets 0
// and 4 of the source and its result is stored at vector offsets 0 and 1 of the luma
// row. The row-1 intermediates are then folded into the row-0 intermediates with
// Average16, and the row-0 data is converted to one 128-bit vector of U and one of V.
// `align` is forwarded to Store and LoadAndConvertY8.
//
// NOTE(review): LoadAndConvertY8 presumably consumes four vectors (16 BGRA pixels) per
// call and fills its two output arrays with blue/red and green/1 16-bit data; the
// two-argument Average16 presumably overwrites its first argument with the combined
// value (its return value, if any, is ignored here) - confirm against those helpers.
template <bool align> SIMD_INLINE void BgraToYuv420p(const uint8_t * bgra0, size_t bgraStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v)
{
    const uint8_t * const srcRows[2] = { bgra0, bgra0 + bgraStride };
    uint8_t * const lumaRows[2] = { y0, y0 + yStride };

    // Indexed as [row][half][part].
    __m128i blueRed[2][2][2], greenOne[2][2][2];

    // Luma pass: row 0 halves first, then row 1 halves.
    for (int row = 0; row < 2; ++row)
    {
        __m128i * const src = (__m128i*)srcRows[row];
        __m128i * const dst = (__m128i*)lumaRows[row];
        for (int half = 0; half < 2; ++half)
            Store<align>(dst + half, LoadAndConvertY8<align>(src + 4 * half, blueRed[row][half], greenOne[row][half]));
    }

    // Merge the two rows; the result lands in the row-0 slots.
    for (int half = 0; half < 2; ++half)
        for (int part = 0; part < 2; ++part)
            Average16(blueRed[0][half][part], blueRed[1][half][part]);
    for (int half = 0; half < 2; ++half)
        for (int part = 0; part < 2; ++part)
            Average16(greenOne[0][half][part], greenOne[1][half][part]);

    // Chroma pass: both halves are packed into one vector of unsigned-saturated bytes.
    const __m128i packedU = _mm_packus_epi16(
        ConvertU16(blueRed[0][0], greenOne[0][0]),
        ConvertU16(blueRed[0][1], greenOne[0][1]));
    Store<align>((__m128i*)u, packedU);

    const __m128i packedV = _mm_packus_epi16(
        ConvertV16(blueRed[0][0], greenOne[0][0]),
        ConvertV16(blueRed[0][1], greenOne[0][1]));
    Store<align>((__m128i*)v, packedV);
}
// Converts one chunk of a BGR row into planar Y, U and V output.
//
// LoadBgr is called at 128-bit vector offsets 0 and 3 of `bgr`; the BgrToY8 result of
// each call is stored at vector offsets 0 and 1 of `y`. Every channel vector is then
// updated in place by Average16 and the results are converted to one 128-bit vector
// of U and one of V. `align` is forwarded to LoadBgr and Store.
//
// NOTE(review): LoadBgr presumably de-interleaves three source vectors (16 BGR pixels)
// into separate blue/green/red vectors, and the one-argument Average16 presumably
// replaces its argument with the mean of horizontally adjacent pixels as 16-bit
// values - confirm against those helpers.
template <bool align> SIMD_INLINE void BgrToYuv422p(const uint8_t * bgr, uint8_t * y, uint8_t * u, uint8_t * v)
{
    __m128i * const src = (__m128i*)bgr;
    __m128i * const luma = (__m128i*)y;

    // One entry per half of the chunk.
    __m128i blue[2], green[2], red[2];

    // Luma pass.
    for (int half = 0; half < 2; ++half)
    {
        LoadBgr<align>(src + 3 * half, blue[half], green[half], red[half]);
        Store<align>(luma + half, BgrToY8(blue[half], green[half], red[half]));
    }

    // In-place update of every channel vector; the calls do not depend on each other.
    for (int half = 0; half < 2; ++half)
    {
        Average16(blue[half]);
        Average16(green[half]);
        Average16(red[half]);
    }

    // Chroma pass: both halves are packed into one vector of unsigned-saturated bytes.
    const __m128i packedU = _mm_packus_epi16(
        BgrToU16(blue[0], green[0], red[0]),
        BgrToU16(blue[1], green[1], red[1]));
    Store<align>((__m128i*)u, packedU);

    const __m128i packedV = _mm_packus_epi16(
        BgrToV16(blue[0], green[0], red[0]),
        BgrToV16(blue[1], green[1], red[1]));
    Store<align>((__m128i*)v, packedV);
}