// Converts two adjacent rows of packed BGR data into planar YUV 4:2:0 output.
//
//   bgr0, bgrStride - first source row and the byte distance to the second row.
//   y0, yStride     - first luma row and the byte distance to the second row.
//   u, v            - chroma destinations; one vector is stored to each.
//
// Two vectors of luma are written per row (to y0 and to y0 + yStride). The
// chroma inputs are produced by Average16 from matching vectors of both rows.
// NOTE(review): Average16 presumably also folds horizontally adjacent pixels
// into 16-bit lanes - confirm against its definition.
// The 'align' flag is forwarded to every LoadBgr / Store call.
template <bool align> SIMD_INLINE void BgrToYuv420p(const uint8_t * bgr0, size_t bgrStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v)
{
    const uint8_t * bgr1 = bgr0 + bgrStride;
    uint8_t * y1 = y0 + yStride;

    // Naming: <channel><row><group>, e.g. b01 = blue, row 0, second group.
    __m256i b00, g00, r00;
    __m256i b01, g01, r01;
    __m256i b10, g10, r10;
    __m256i b11, g11, r11;

    // Row 0: the second group starts three vectors (96 bytes) after the first.
    LoadBgr<align>((__m256i*)bgr0 + 0, b00, g00, r00);
    Store<align>((__m256i*)y0 + 0, BgrToY8(b00, g00, r00));

    LoadBgr<align>((__m256i*)bgr0 + 3, b01, g01, r01);
    Store<align>((__m256i*)y0 + 1, BgrToY8(b01, g01, r01));

    // Row 1: same layout, one stride further.
    LoadBgr<align>((__m256i*)bgr1 + 0, b10, g10, r10);
    Store<align>((__m256i*)y1 + 0, BgrToY8(b10, g10, r10));

    LoadBgr<align>((__m256i*)bgr1 + 3, b11, g11, r11);
    Store<align>((__m256i*)y1 + 1, BgrToY8(b11, g11, r11));

    // Combine row 0 with row 1 for each channel and each group.
    __m256i blueAvg0 = Average16(b00, b10);
    __m256i blueAvg1 = Average16(b01, b11);
    __m256i greenAvg0 = Average16(g00, g10);
    __m256i greenAvg1 = Average16(g01, g11);
    __m256i redAvg0 = Average16(r00, r10);
    __m256i redAvg1 = Average16(r01, r11);

    // Chroma: convert both groups, then narrow the pair into one byte vector.
    Store<align>((__m256i*)u, PackU16ToU8(
        BgrToU16(blueAvg0, greenAvg0, redAvg0),
        BgrToU16(blueAvg1, greenAvg1, redAvg1)));
    Store<align>((__m256i*)v, PackU16ToU8(
        BgrToV16(blueAvg0, greenAvg0, redAvg0),
        BgrToV16(blueAvg1, greenAvg1, redAvg1)));
}
// Produces one vector of output bytes for the main (non-border) part of a row
// and writes it to 'dst'.
//
// The vector-returning MainRowX5x5(buffer, offset) overload is evaluated at
// 'offset' and at 'offset + A'; the two results are narrowed to bytes with
// PackU16ToU8 and stored as a single vector. The store always uses
// Store<false>, independently of the 'align' template argument.
// NOTE(review): Store<false> is presumably the unaligned variant - confirm.
template <bool align, bool compensation> SIMD_INLINE void MainRowX5x5(Buffer & buffer, size_t offset, uint8_t * dst)
{
    const __m256i first = MainRowX5x5<align, compensation>(buffer, offset);
    const __m256i second = MainRowX5x5<align, compensation>(buffer, offset + A);

    __m256i * const out = reinterpret_cast<__m256i*>(dst);
    Store<false>(out, PackU16ToU8(first, second));
}
// Evaluates the pointer-based MainRowX5x5 overload at 'buffer.dst + offset'
// and at 'buffer.dst + offset + HA', packs both results into one byte vector
// and returns that vector masked with K16_00FF.
// NOTE(review): K16_00FF presumably keeps only the low byte of every 16-bit
// lane, i.e. every second packed byte - confirm against the constant.
template <bool align, bool compensation> SIMD_INLINE __m256i MainRowX5x5(Buffer & buffer, size_t offset)
{
    const __m256i head = MainRowX5x5<align, compensation>(buffer.dst + offset);
    const __m256i tail = MainRowX5x5<align, compensation>(buffer.dst + offset + HA);

    const __m256i packed = PackU16ToU8(head, tail);
    return _mm256_and_si256(packed, K16_00FF);
}
// Interpolates between two source rows and writes one vector of bytes to 'dst'.
//
//   bx0, bx1 - the two source rows; two consecutive vectors are read from each
//              by the vector-based InterpolateY overload.
//   alpha    - pair of weight vectors, forwarded unchanged.
//   dst      - destination; written with Store<false> regardless of 'align'.
template<bool align> SIMD_INLINE void InterpolateY(const uint8_t * bx0, const uint8_t * bx1, __m256i alpha[2], uint8_t * dst)
{
    __m256i * const row0 = (__m256i*)bx0;
    __m256i * const row1 = (__m256i*)bx1;

    const __m256i first = InterpolateY<align>(row0 + 0, row1 + 0, alpha);
    const __m256i second = InterpolateY<align>(row0 + 1, row1 + 1, alpha);

    // Narrow the two intermediate results into a single byte vector.
    __m256i * const out = reinterpret_cast<__m256i*>(dst);
    Store<false>(out, PackU16ToU8(first, second));
}
// Converts four vectors of BGRA data into a single vector of gray bytes.
//
// Each input vector is converted with BgraToGray32; the four results are
// narrowed pairwise with PackI32ToI16 and the two halves are then narrowed
// once more with PackU16ToU8.
SIMD_INLINE __m256i BgraToGray(__m256i bgra[4])
{
    const __m256i gray0 = BgraToGray32(bgra[0]);
    const __m256i gray1 = BgraToGray32(bgra[1]);
    const __m256i gray2 = BgraToGray32(bgra[2]);
    const __m256i gray3 = BgraToGray32(bgra[3]);

    const __m256i lower = PackI32ToI16(gray0, gray1);
    const __m256i upper = PackI32ToI16(gray2, gray3);
    return PackU16ToU8(lower, upper);
}
// Builds one vector of bytes from two ReduceRow16 results.
//
// ReduceRow16 is evaluated at 'offset' and at 'offset + HA'; the two results
// are narrowed into a single byte vector with PackU16ToU8.
template <bool align> SIMD_INLINE __m256i ReduceRow8(const Buffer & buffer, size_t offset)
{
    const __m256i first = ReduceRow16<align>(buffer, offset);
    const __m256i second = ReduceRow16<align>(buffer, offset + HA);
    return PackU16ToU8(first, second);
}