namespace Ssse3 { __m128i K8_SHUFFLE_GR = SIMD_MM_SETR_EPI8(0x1, -1, -1, -1, 0x6, -1, -1, -1, 0x9, -1, -1, -1, 0xE, -1, -1, -1); __m128i K8_SHUFFLE_BG = SIMD_MM_SETR_EPI8(0x0, -1, -1, -1, 0x5, -1, -1, -1, 0x8, -1, -1, -1, 0xD, -1, -1, -1); __m128i K8_SHUFFLE_GB = SIMD_MM_SETR_EPI8(0x1, -1, -1, -1, 0x4, -1, -1, -1, 0x9, -1, -1, -1, 0xC, -1, -1, -1); __m128i K8_SHUFFLE_RG = SIMD_MM_SETR_EPI8(0x2, -1, -1, -1, 0x5, -1, -1, -1, 0xA, -1, -1, -1, 0xD, -1, -1, -1); template <int format, int row, bool align> SIMD_INLINE void BgraToBayer(const uint8_t * bgra, uint8_t * bayer, const __m128i shuffle[4][2]) { const __m128i bayer0 = _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 0), shuffle[format][row]); const __m128i bayer1 = _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 1), shuffle[format][row]); const __m128i bayer2 = _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 2), shuffle[format][row]); const __m128i bayer3 = _mm_shuffle_epi8(Load<align>((__m128i*)bgra + 3), shuffle[format][row]); Store<align>((__m128i*)bayer, _mm_packus_epi16(_mm_packs_epi32(bayer0, bayer1), _mm_packs_epi32(bayer2, bayer3))); } template <int format, bool align> void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride) { assert(width >= A); if(align) assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bayer) && Aligned(bayerStride)); size_t alignedWidth = AlignLo(width, A); const __m128i shuffle[4][2] = { {K8_SHUFFLE_GR, K8_SHUFFLE_BG}, {K8_SHUFFLE_GB, K8_SHUFFLE_RG}, {K8_SHUFFLE_RG, K8_SHUFFLE_GB}, {K8_SHUFFLE_BG, K8_SHUFFLE_GR} }; for(size_t row = 0; row < height; row += 2) { for(size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += QA) BgraToBayer<format, 0, align>(bgra + offset, bayer + col, shuffle); if(alignedWidth != width) BgraToBayer<format, 0, false>(bgra + 4*(width - A), bayer + width - A, shuffle); bgra += bgraStride; bayer += bayerStride; for(size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += QA) BgraToBayer<format, 1, align>(bgra + offset, bayer + col, shuffle); if(alignedWidth != width) BgraToBayer<format, 1, false>(bgra + 4*(width - A), bayer + width - A, shuffle); bgra += bgraStride; bayer += bayerStride; } } template<bool align> void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat) { assert((width%2 == 0) && (height%2 == 0)); switch(bayerFormat) { case SimdPixelFormatBayerGrbg: BgraToBayer<0, align>(bgra, width, height, bgraStride, bayer, bayerStride); break; case SimdPixelFormatBayerGbrg: BgraToBayer<1, align>(bgra, width, height, bgraStride, bayer, bayerStride); break; case SimdPixelFormatBayerRggb: BgraToBayer<2, align>(bgra, width, height, bgraStride, bayer, bayerStride); break; case SimdPixelFormatBayerBggr: BgraToBayer<3, align>(bgra, width, height, bgraStride, bayer, bayerStride); break; default: assert(0); } } void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat) { if(Aligned(bgra) && Aligned(bgraStride) && Aligned(bayer) && Aligned(bayerStride)) BgraToBayer<true>(bgra, width, height, bgraStride, bayer, bayerStride, bayerFormat); else BgraToBayer<false>(bgra, width, height, bgraStride, bayer, bayerStride, bayerFormat); } }
namespace Ssse3 { const __m128i K8_SHUFFLE_BGRA_TO_B0R0 = SIMD_MM_SETR_EPI8(0x0, -1, 0x2, -1, 0x4, -1, 0x6, -1, 0x8, -1, 0xA, -1, 0xC, -1, 0xE, -1); const __m128i K8_SHUFFLE_BGRA_TO_G000 = SIMD_MM_SETR_EPI8(0x1, -1, -1, -1, 0x5, -1, -1, -1, 0x9, -1, -1, -1, 0xD, -1, -1, -1); template <bool align> SIMD_INLINE void LoadPreparedBgra16(const __m128i * bgra, __m128i & b16_r16, __m128i & g16_1) { __m128i _bgra = Load<align>(bgra); b16_r16 = _mm_shuffle_epi8(_bgra, K8_SHUFFLE_BGRA_TO_B0R0); g16_1 = _mm_or_si128(_mm_shuffle_epi8(_bgra, K8_SHUFFLE_BGRA_TO_G000), K32_00010000); } template <bool align> SIMD_INLINE __m128i LoadAndConvertY16(const __m128i * bgra, __m128i & b16_r16, __m128i & g16_1) { __m128i _b16_r16[2], _g16_1[2]; LoadPreparedBgra16<align>(bgra + 0, _b16_r16[0], _g16_1[0]); LoadPreparedBgra16<align>(bgra + 1, _b16_r16[1], _g16_1[1]); b16_r16 = _mm_hadd_epi32(_b16_r16[0], _b16_r16[1]); g16_1 = _mm_hadd_epi32(_g16_1[0], _g16_1[1]); return SaturateI16ToU8(_mm_add_epi16(K16_Y_ADJUST, _mm_packs_epi32(BgrToY32(_b16_r16[0], _g16_1[0]), BgrToY32(_b16_r16[1], _g16_1[1])))); } template <bool align> SIMD_INLINE __m128i LoadAndConvertY8(const __m128i * bgra, __m128i b16_r16[2], __m128i g16_1[2]) { return _mm_packus_epi16(LoadAndConvertY16<align>(bgra + 0, b16_r16[0], g16_1[0]), LoadAndConvertY16<align>(bgra + 2, b16_r16[1], g16_1[1])); } SIMD_INLINE void Average16(__m128i & a, const __m128i & b) { a = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(a, b), K16_0002), 2); } SIMD_INLINE __m128i ConvertU16(__m128i b16_r16[2], __m128i g16_1[2]) { return SaturateI16ToU8(_mm_add_epi16(K16_UV_ADJUST, _mm_packs_epi32(BgrToU32(b16_r16[0], g16_1[0]), BgrToU32(b16_r16[1], g16_1[1])))); } SIMD_INLINE __m128i ConvertV16(__m128i b16_r16[2], __m128i g16_1[2]) { return SaturateI16ToU8(_mm_add_epi16(K16_UV_ADJUST, _mm_packs_epi32(BgrToV32(b16_r16[0], g16_1[0]), BgrToV32(b16_r16[1], g16_1[1])))); } template <bool align> SIMD_INLINE void BgraToYuv420p(const uint8_t * bgra0, size_t bgraStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v) { const uint8_t * bgra1 = bgra0 + bgraStride; uint8_t * y1 = y0 + yStride; __m128i _b16_r16[2][2][2], _g16_1[2][2][2]; Store<align>((__m128i*)y0 + 0, LoadAndConvertY8<align>((__m128i*)bgra0 + 0, _b16_r16[0][0], _g16_1[0][0])); Store<align>((__m128i*)y0 + 1, LoadAndConvertY8<align>((__m128i*)bgra0 + 4, _b16_r16[0][1], _g16_1[0][1])); Store<align>((__m128i*)y1 + 0, LoadAndConvertY8<align>((__m128i*)bgra1 + 0, _b16_r16[1][0], _g16_1[1][0])); Store<align>((__m128i*)y1 + 1, LoadAndConvertY8<align>((__m128i*)bgra1 + 4, _b16_r16[1][1], _g16_1[1][1])); Average16(_b16_r16[0][0][0], _b16_r16[1][0][0]); Average16(_b16_r16[0][0][1], _b16_r16[1][0][1]); Average16(_b16_r16[0][1][0], _b16_r16[1][1][0]); Average16(_b16_r16[0][1][1], _b16_r16[1][1][1]); Average16(_g16_1[0][0][0], _g16_1[1][0][0]); Average16(_g16_1[0][0][1], _g16_1[1][0][1]); Average16(_g16_1[0][1][0], _g16_1[1][1][0]); Average16(_g16_1[0][1][1], _g16_1[1][1][1]); Store<align>((__m128i*)u, _mm_packus_epi16(ConvertU16(_b16_r16[0][0], _g16_1[0][0]), ConvertU16(_b16_r16[0][1], _g16_1[0][1]))); Store<align>((__m128i*)v, _mm_packus_epi16(ConvertV16(_b16_r16[0][0], _g16_1[0][0]), ConvertV16(_b16_r16[0][1], _g16_1[0][1]))); } template <bool align> void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) { assert((width%2 == 0) && (height%2 == 0) && (width >= DA) && (height >= 2)); if(align) { assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); } size_t alignedWidth = AlignLo(width, DA); const size_t A8 = A*8; for(size_t row = 0; row < height; row += 2) { for(size_t colUV = 0, colY = 0, colBgra = 0; colY < alignedWidth; colY += DA, colUV += A, colBgra += A8) BgraToYuv420p<align>(bgra + colBgra, bgraStride, y + colY, yStride, u + colUV, v + colUV); if(width != alignedWidth) { size_t offset = width - DA; BgraToYuv420p<false>(bgra + offset*4, bgraStride, y + offset, yStride, u + offset/2, v + offset/2); } y += 2*yStride; u += uStride; v += vStride; bgra += 2*bgraStride; } } void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) { if(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) BgraToYuv420p<true>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); else BgraToYuv420p<false>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); } SIMD_INLINE void Average16(__m128i a[2][2]) { a[0][0] = _mm_srli_epi16(_mm_add_epi16(a[0][0], K16_0001), 1); a[0][1] = _mm_srli_epi16(_mm_add_epi16(a[0][1], K16_0001), 1); a[1][0] = _mm_srli_epi16(_mm_add_epi16(a[1][0], K16_0001), 1); a[1][1] = _mm_srli_epi16(_mm_add_epi16(a[1][1], K16_0001), 1); } template <bool align> SIMD_INLINE void BgraToYuv422p(const uint8_t * bgra, uint8_t * y, uint8_t * u, uint8_t * v) { __m128i _b16_r16[2][2], _g16_1[2][2]; Store<align>((__m128i*)y + 0, LoadAndConvertY8<align>((__m128i*)bgra + 0, _b16_r16[0], _g16_1[0])); Store<align>((__m128i*)y + 1, LoadAndConvertY8<align>((__m128i*)bgra + 4, _b16_r16[1], _g16_1[1])); Average16(_b16_r16); Average16(_g16_1); Store<align>((__m128i*)u, _mm_packus_epi16(ConvertU16(_b16_r16[0], _g16_1[0]), ConvertU16(_b16_r16[1], _g16_1[1]))); Store<align>((__m128i*)v, _mm_packus_epi16(ConvertV16(_b16_r16[0], _g16_1[0]), ConvertV16(_b16_r16[1], _g16_1[1]))); } template <bool align> void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) { assert((width%2 == 0) && (width >= DA)); if(align) { assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); } size_t alignedWidth = AlignLo(width, DA); const size_t A8 = A*8; for(size_t row = 0; row < height; ++row) { for(size_t colUV = 0, colY = 0, colBgra = 0; colY < alignedWidth; colY += DA, colUV += A, colBgra += A8) BgraToYuv422p<align>(bgra + colBgra, y + colY, u + colUV, v + colUV); if(width != alignedWidth) { size_t offset = width - DA; BgraToYuv422p<false>(bgra + offset*4, y + offset, u + offset/2, v + offset/2); } y += yStride; u += uStride; v += vStride; bgra += bgraStride; } } void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) { if(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)) BgraToYuv422p<true>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); else BgraToYuv422p<false>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride); } }
namespace Ssse3 { __m128i K8_SHUFFLE_GR_0 = SIMD_MM_SETR_EPI8(0x1, 0x5, 0x7, 0xB, 0xD, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i K8_SHUFFLE_GR_1 = SIMD_MM_SETR_EPI8( -1, -1, -1, -1, -1, 0x1, 0x3, 0x7, 0x9, 0xD, 0xF, -1, -1, -1, -1, -1); __m128i K8_SHUFFLE_GR_2 = SIMD_MM_SETR_EPI8( -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x3, 0x5, 0x9, 0xB, 0xF); __m128i K8_SHUFFLE_BG_0 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x6, 0xA, 0xC, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i K8_SHUFFLE_BG_1 = SIMD_MM_SETR_EPI8( -1, -1, -1, -1, -1, 0x0, 0x2, 0x6, 0x8, 0xC, 0xE, -1, -1, -1, -1, -1); __m128i K8_SHUFFLE_BG_2 = SIMD_MM_SETR_EPI8( -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x2, 0x4, 0x8, 0xA, 0xE); __m128i K8_SHUFFLE_GB_0 = SIMD_MM_SETR_EPI8(0x1, 0x3, 0x7, 0x9, 0xD, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i K8_SHUFFLE_GB_1 = SIMD_MM_SETR_EPI8( -1, -1, -1, -1, -1, -1, 0x3, 0x5, 0x9, 0xB, 0xF, -1, -1, -1, -1, -1); __m128i K8_SHUFFLE_GB_2 = SIMD_MM_SETR_EPI8( -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x1, 0x5, 0x7, 0xB, 0xD); __m128i K8_SHUFFLE_RG_0 = SIMD_MM_SETR_EPI8(0x2, 0x4, 0x8, 0xA, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); __m128i K8_SHUFFLE_RG_1 = SIMD_MM_SETR_EPI8( -1, -1, -1, -1, -1, 0x0, 0x4, 0x6, 0xA, 0xC, -1, -1, -1, -1, -1, -1); __m128i K8_SHUFFLE_RG_2 = SIMD_MM_SETR_EPI8( -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x2, 0x6, 0x8, 0xC, 0xE); template <int format, int row, bool align> SIMD_INLINE void BgrToBayer(const uint8_t * bgr, uint8_t * bayer, const __m128i shuffle[4][2][3]) { const __m128i bayer0 = _mm_shuffle_epi8(Load<align>((__m128i*)bgr + 0), shuffle[format][row][0]); const __m128i bayer1 = _mm_shuffle_epi8(Load<align>((__m128i*)bgr + 1), shuffle[format][row][1]); const __m128i bayer2 = _mm_shuffle_epi8(Load<align>((__m128i*)bgr + 2), shuffle[format][row][2]); Store<align>((__m128i*)bayer, _mm_or_si128(_mm_or_si128(bayer0, bayer1), bayer2)); } template <int format, bool align> void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride) { assert(width >= A); if(align) assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(bayer) && Aligned(bayerStride)); size_t alignedWidth = AlignLo(width, A); const __m128i shuffle[4][2][3] = { {{K8_SHUFFLE_GR_0, K8_SHUFFLE_GR_1, K8_SHUFFLE_GR_2}, {K8_SHUFFLE_BG_0, K8_SHUFFLE_BG_1, K8_SHUFFLE_BG_2}}, {{K8_SHUFFLE_GB_0, K8_SHUFFLE_GB_1, K8_SHUFFLE_GB_2}, {K8_SHUFFLE_RG_0, K8_SHUFFLE_RG_1, K8_SHUFFLE_RG_2}}, {{K8_SHUFFLE_RG_0, K8_SHUFFLE_RG_1, K8_SHUFFLE_RG_2}, {K8_SHUFFLE_GB_0, K8_SHUFFLE_GB_1, K8_SHUFFLE_GB_2}}, {{K8_SHUFFLE_BG_0, K8_SHUFFLE_BG_1, K8_SHUFFLE_BG_2}, {K8_SHUFFLE_GR_0, K8_SHUFFLE_GR_1, K8_SHUFFLE_GR_2}} }; for(size_t row = 0; row < height; row += 2) { for(size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += 3*A) BgrToBayer<format, 0, align>(bgr + offset, bayer + col, shuffle); if(alignedWidth != width) BgrToBayer<format, 0, false>(bgr + 3*(width - A), bayer + width - A, shuffle); bgr += bgrStride; bayer += bayerStride; for(size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += 3*A) BgrToBayer<format, 1, align>(bgr + offset, bayer + col, shuffle); if(alignedWidth != width) BgrToBayer<format, 1, false>(bgr + 3*(width - A), bayer + width - A, shuffle); bgr += bgrStride; bayer += bayerStride; } } template<bool align> void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat) { assert((width%2 == 0) && (height%2 == 0)); switch(bayerFormat) { case SimdPixelFormatBayerGrbg: BgrToBayer<0, align>(bgr, width, height, bgrStride, bayer, bayerStride); break; case SimdPixelFormatBayerGbrg: BgrToBayer<1, align>(bgr, width, height, bgrStride, bayer, bayerStride); break; case SimdPixelFormatBayerRggb: BgrToBayer<2, align>(bgr, width, height, bgrStride, bayer, bayerStride); break; case SimdPixelFormatBayerBggr: BgrToBayer<3, align>(bgr, width, height, bgrStride, bayer, bayerStride); break; default: assert(0); } } void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat) { if(Aligned(bgr) && Aligned(bgrStride) && Aligned(bayer) && Aligned(bayerStride)) BgrToBayer<true>(bgr, width, height, bgrStride, bayer, bayerStride, bayerFormat); else BgrToBayer<false>(bgr, width, height, bgrStride, bayer, bayerStride, bayerFormat); } }