Example #1
0
    namespace Ssse3
    {
        // Byte-selection masks for _mm_shuffle_epi8, one per two-pixel Bayer pattern (GR, BG, GB, RG).
        // Every group of four entries covers one 4-byte source pixel: one source byte index goes to
        // the low byte of the 32-bit lane, the other three entries are -1 (high bit set), which makes
        // _mm_shuffle_epi8 write zero there. This assumes SIMD_MM_SETR_EPI8 lists bytes from lowest
        // to highest, like _mm_setr_epi8, and that offsets 0/1/2 inside a pixel are B/G/R as the
        // BGRA naming of the users of these masks implies.
        // Declared const: the masks are lookup tables that are only read, never written.
        const __m128i K8_SHUFFLE_GR = SIMD_MM_SETR_EPI8(0x1,  -1,  -1,  -1, 0x6,  -1,  -1,  -1, 0x9,  -1,  -1,  -1, 0xE,  -1,  -1,  -1);
        const __m128i K8_SHUFFLE_BG = SIMD_MM_SETR_EPI8(0x0,  -1,  -1,  -1, 0x5,  -1,  -1,  -1, 0x8,  -1,  -1,  -1, 0xD,  -1,  -1,  -1);
        const __m128i K8_SHUFFLE_GB = SIMD_MM_SETR_EPI8(0x1,  -1,  -1,  -1, 0x4,  -1,  -1,  -1, 0x9,  -1,  -1,  -1, 0xC,  -1,  -1,  -1);
        const __m128i K8_SHUFFLE_RG = SIMD_MM_SETR_EPI8(0x2,  -1,  -1,  -1, 0x5,  -1,  -1,  -1, 0xA,  -1,  -1,  -1, 0xD,  -1,  -1,  -1);

        // Converts one vector-width group of pixels: reads four consecutive 16-byte vectors from
        // 'bgra', keeps one byte per 4-byte pixel according to shuffle[format][row], and writes the
        // resulting 16 bytes to 'bayer'.
        // 'align' selects the aligned/unaligned flavour of the Load/Store helpers.
        template <int format, int row, bool align> 
        SIMD_INLINE void BgraToBayer(const uint8_t * bgra, uint8_t * bayer, const __m128i shuffle[4][2])
        {
            __m128i * const src = (__m128i*)bgra;
            const __m128i mask = shuffle[format][row];

            // After the shuffle each 32-bit lane holds a single selected byte (value 0..255).
            const __m128i px0 = _mm_shuffle_epi8(Load<align>(src + 0), mask);
            const __m128i px1 = _mm_shuffle_epi8(Load<align>(src + 1), mask);
            const __m128i px2 = _mm_shuffle_epi8(Load<align>(src + 2), mask);
            const __m128i px3 = _mm_shuffle_epi8(Load<align>(src + 3), mask);

            // Narrow 32 -> 16 -> 8 bits; the values fit in a byte, so the saturating packs are lossless.
            const __m128i lo16 = _mm_packs_epi32(px0, px1);
            const __m128i hi16 = _mm_packs_epi32(px2, px3);
            Store<align>((__m128i*)bayer, _mm_packus_epi16(lo16, hi16));
        }

        template <int format, bool align> 
        void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride)
        {
            assert(width >= A);
            if(align)
                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bayer) && Aligned(bayerStride));

            size_t alignedWidth = AlignLo(width, A);

            const __m128i shuffle[4][2] = 
            {
                {K8_SHUFFLE_GR, K8_SHUFFLE_BG}, 
                {K8_SHUFFLE_GB, K8_SHUFFLE_RG}, 
                {K8_SHUFFLE_RG, K8_SHUFFLE_GB}, 
                {K8_SHUFFLE_BG, K8_SHUFFLE_GR}
            };

            for(size_t row = 0; row < height; row += 2)
            {
                for(size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += QA)
                    BgraToBayer<format, 0, align>(bgra + offset, bayer + col, shuffle);
                if(alignedWidth != width)
                    BgraToBayer<format, 0, false>(bgra + 4*(width - A), bayer + width - A, shuffle);
                bgra += bgraStride;
                bayer += bayerStride;

                for(size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += QA)
                    BgraToBayer<format, 1, align>(bgra + offset, bayer + col, shuffle);
                if(alignedWidth != width)
                    BgraToBayer<format, 1, false>(bgra + 4*(width - A), bayer + width - A, shuffle);
                bgra += bgraStride;
                bayer += bayerStride;
            }        
        }

        // Maps the runtime Bayer format onto the compile-time 'format' index (0..3) of the
        // row-pair converter. Width and height must both be even; an unknown format trips an assert.
        template<bool align>
        void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat)
        {
            assert((width%2 == 0) && (height%2 == 0));

            if(bayerFormat == SimdPixelFormatBayerGrbg)
                BgraToBayer<0, align>(bgra, width, height, bgraStride, bayer, bayerStride);
            else if(bayerFormat == SimdPixelFormatBayerGbrg)
                BgraToBayer<1, align>(bgra, width, height, bgraStride, bayer, bayerStride);
            else if(bayerFormat == SimdPixelFormatBayerRggb)
                BgraToBayer<2, align>(bgra, width, height, bgraStride, bayer, bayerStride);
            else if(bayerFormat == SimdPixelFormatBayerBggr)
                BgraToBayer<3, align>(bgra, width, height, bgraStride, bayer, bayerStride);
            else
                assert(0);
        }

        // Entry point: picks the aligned code path only when both pointers and both strides pass
        // the Aligned() check, otherwise falls back to the unaligned path.
        void BgraToBayer(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat)
        {
            const bool allAligned = Aligned(bgra) && Aligned(bgraStride) && Aligned(bayer) && Aligned(bayerStride);
            if(!allAligned)
            {
                BgraToBayer<false>(bgra, width, height, bgraStride, bayer, bayerStride, bayerFormat);
                return;
            }
            BgraToBayer<true>(bgra, width, height, bgraStride, bayer, bayerStride, bayerFormat);
        }
    }
Example #2
0
    namespace Ssse3
    {
        // Masks for _mm_shuffle_epi8 applied to one vector of four 4-byte pixels; a -1 entry
        // (high bit set) produces a zero byte.
        // B0R0: keeps source bytes 0 and 2 of every pixel, each widened to a 16-bit lane
        //       (per the name: blue and red of a BGRA pixel).
        // G000: keeps source byte 1 of every pixel in the low 16-bit lane and zeroes the high one
        //       (per the name: green).
        const __m128i K8_SHUFFLE_BGRA_TO_B0R0 = SIMD_MM_SETR_EPI8(0x0,  -1, 0x2,  -1, 0x4,  -1, 0x6,  -1, 0x8,  -1, 0xA,  -1, 0xC,  -1, 0xE,  -1);
        const __m128i K8_SHUFFLE_BGRA_TO_G000 = SIMD_MM_SETR_EPI8(0x1,  -1,  -1,  -1, 0x5,  -1,  -1,  -1, 0x9,  -1,  -1,  -1, 0xD,  -1,  -1,  -1);

        // Loads one vector (four 4-byte pixels) and splits it into two vectors of 16-bit lanes:
        //   b16_r16 - bytes 0 and 2 of every pixel, zero-extended (blue/red per the mask name);
        //   g16_1   - byte 1 of every pixel in the low lane, OR-ed with K32_00010000
        //             (presumably a constant 1 in the high lane, as the output name suggests).
        template <bool align> SIMD_INLINE void LoadPreparedBgra16(const __m128i * bgra, __m128i & b16_r16, __m128i & g16_1)
        {
            const __m128i pixels = Load<align>(bgra);
            const __m128i blueRed = _mm_shuffle_epi8(pixels, K8_SHUFFLE_BGRA_TO_B0R0);
            const __m128i green = _mm_shuffle_epi8(pixels, K8_SHUFFLE_BGRA_TO_G000);
            b16_r16 = blueRed;
            g16_1 = _mm_or_si128(green, K32_00010000);
        }

        // Processes two consecutive vectors (eight 4-byte pixels).
        // Returns the result of BgrToY32 for each pixel, packed to 16 bits, offset by K16_Y_ADJUST
        // and passed through SaturateI16ToU8.
        // Outputs b16_r16 / g16_1 receive the horizontal sums of each pair of neighbouring pixels
        // (adjacent 32-bit lanes added by _mm_hadd_epi32), for later chroma averaging.
        template <bool align> SIMD_INLINE __m128i LoadAndConvertY16(const __m128i * bgra, __m128i & b16_r16, __m128i & g16_1)
        {
            __m128i blueRed0, blueRed1, greenOne0, greenOne1;
            LoadPreparedBgra16<align>(bgra + 0, blueRed0, greenOne0);
            LoadPreparedBgra16<align>(bgra + 1, blueRed1, greenOne1);

            b16_r16 = _mm_hadd_epi32(blueRed0, blueRed1);
            g16_1 = _mm_hadd_epi32(greenOne0, greenOne1);

            const __m128i y32lo = BgrToY32(blueRed0, greenOne0);
            const __m128i y32hi = BgrToY32(blueRed1, greenOne1);
            const __m128i y16 = _mm_add_epi16(K16_Y_ADJUST, _mm_packs_epi32(y32lo, y32hi));
            return SaturateI16ToU8(y16);
        }

        // Converts four consecutive vectors (sixteen 4-byte pixels) to sixteen 8-bit luma values.
        // The two halves are converted by LoadAndConvertY16, which also fills
        // b16_r16[0..1] / g16_1[0..1] with the pairwise pixel sums of each half.
        template <bool align> SIMD_INLINE __m128i LoadAndConvertY8(const __m128i * bgra, __m128i b16_r16[2], __m128i g16_1[2])
        {
            const __m128i firstHalf = LoadAndConvertY16<align>(bgra + 0, b16_r16[0], g16_1[0]);
            const __m128i secondHalf = LoadAndConvertY16<align>(bgra + 2, b16_r16[1], g16_1[1]);
            return _mm_packus_epi16(firstHalf, secondHalf);
        }

        // In-place per-16-bit-lane update: a = (a + b + K16_0002) >> 2 (logical shift).
        // Used on pairwise sums from two rows, i.e. presumably a rounded average of four samples
        // (assuming K16_0002 holds 2 in every lane, as its name suggests).
        SIMD_INLINE void Average16(__m128i & a, const __m128i & b)
        {
            const __m128i sum = _mm_add_epi16(a, b);
            const __m128i rounded = _mm_add_epi16(sum, K16_0002);
            a = _mm_srli_epi16(rounded, 2);
        }

        // Applies BgrToU32 to both prepared vector pairs, packs the 32-bit results to 16 bits,
        // adds K16_UV_ADJUST and returns the value passed through SaturateI16ToU8.
        SIMD_INLINE __m128i ConvertU16(__m128i b16_r16[2], __m128i g16_1[2])
        {
            const __m128i u32lo = BgrToU32(b16_r16[0], g16_1[0]);
            const __m128i u32hi = BgrToU32(b16_r16[1], g16_1[1]);
            const __m128i u16 = _mm_add_epi16(K16_UV_ADJUST, _mm_packs_epi32(u32lo, u32hi));
            return SaturateI16ToU8(u16);
        }

        // Applies BgrToV32 to both prepared vector pairs, packs the 32-bit results to 16 bits,
        // adds K16_UV_ADJUST and returns the value passed through SaturateI16ToU8.
        SIMD_INLINE __m128i ConvertV16(__m128i b16_r16[2], __m128i g16_1[2])
        {
            const __m128i v32lo = BgrToV32(b16_r16[0], g16_1[0]);
            const __m128i v32hi = BgrToV32(b16_r16[1], g16_1[1]);
            const __m128i v16 = _mm_add_epi16(K16_UV_ADJUST, _mm_packs_epi32(v32lo, v32hi));
            return SaturateI16ToU8(v16);
        }

        // Converts a block two rows high: eight input vectors per row are turned into two output
        // vectors of luma per row (stored to y0 and y0 + yStride), and one output vector each for
        // u and v computed from the values averaged over both rows and neighbouring pixel pairs.
        template <bool align> SIMD_INLINE void BgraToYuv420p(const uint8_t * bgra0, size_t bgraStride, uint8_t * y0, size_t yStride, uint8_t * u, uint8_t * v)
        {
            // Indexed as [row][half of the block][vector within the half].
            __m128i blueRed[2][2][2], greenOne[2][2][2];

            // Luma for both rows; the helpers also leave the pairwise horizontal sums in the arrays.
            for(size_t r = 0; r < 2; ++r)
            {
                const __m128i * const src = (const __m128i*)(bgra0 + r*bgraStride);
                __m128i * const dst = (__m128i*)(y0 + r*yStride);
                for(size_t half = 0; half < 2; ++half)
                    Store<align>(dst + half, LoadAndConvertY8<align>(src + 4*half, blueRed[r][half], greenOne[r][half]));
            }

            // Fold the second row into the first one.
            for(size_t half = 0; half < 2; ++half)
                for(size_t part = 0; part < 2; ++part)
                    Average16(blueRed[0][half][part], blueRed[1][half][part]);

            for(size_t half = 0; half < 2; ++half)
                for(size_t part = 0; part < 2; ++part)
                    Average16(greenOne[0][half][part], greenOne[1][half][part]);

            const __m128i uLo = ConvertU16(blueRed[0][0], greenOne[0][0]);
            const __m128i uHi = ConvertU16(blueRed[0][1], greenOne[0][1]);
            Store<align>((__m128i*)u, _mm_packus_epi16(uLo, uHi));

            const __m128i vLo = ConvertV16(blueRed[0][0], greenOne[0][0]);
            const __m128i vHi = ConvertV16(blueRed[0][1], greenOne[0][1]);
            Store<align>((__m128i*)v, _mm_packus_epi16(vLo, vHi));
        }

        template <bool align> void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride,
            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
        {
            assert((width%2 == 0) && (height%2 == 0) && (width >= DA) && (height >= 2));
            if(align)
            {
                assert(Aligned(y) && Aligned(yStride) && Aligned(u) &&  Aligned(uStride));
                assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride));
            }

            size_t alignedWidth = AlignLo(width, DA);
            const size_t A8 = A*8;
            for(size_t row = 0; row < height; row += 2)
            {
                for(size_t colUV = 0, colY = 0, colBgra = 0; colY < alignedWidth; colY += DA, colUV += A, colBgra += A8)
                    BgraToYuv420p<align>(bgra + colBgra, bgraStride, y + colY, yStride, u + colUV, v + colUV);
                if(width != alignedWidth)
                {
                    size_t offset = width - DA;
                    BgraToYuv420p<false>(bgra + offset*4, bgraStride, y + offset, yStride, u + offset/2, v + offset/2);
                }
                y += 2*yStride;
                u += uStride;
                v += vStride;
                bgra += 2*bgraStride;
            }
        }

        // Entry point: uses the aligned implementation only when every pointer and every stride
        // passes the Aligned() check; otherwise uses the unaligned one.
        void BgraToYuv420p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride,
            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
        {
            const bool allAligned = Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) 
                && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride);
            if(!allAligned)
            {
                BgraToYuv420p<false>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride);
                return;
            }
            BgraToYuv420p<true>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride);
        }

        // In-place per-16-bit-lane update of all four vectors: a = (a + K16_0001) >> 1
        // (logical shift). Applied to pairwise sums, i.e. presumably a rounded average of two
        // samples (assuming K16_0001 holds 1 in every lane, as its name suggests).
        SIMD_INLINE void Average16(__m128i a[2][2])
        {
            for(size_t i = 0; i < 2; ++i)
            {
                for(size_t j = 0; j < 2; ++j)
                {
                    const __m128i rounded = _mm_add_epi16(a[i][j], K16_0001);
                    a[i][j] = _mm_srli_epi16(rounded, 1);
                }
            }
        }

        // Converts one row block: eight input vectors become two output vectors of luma plus one
        // output vector each for u and v, the chroma being computed from horizontally averaged
        // neighbouring pixel pairs.
        template <bool align> SIMD_INLINE void BgraToYuv422p(const uint8_t * bgra, uint8_t * y, uint8_t * u, uint8_t * v)
        {
            // Indexed as [half of the block][vector within the half].
            __m128i blueRed[2][2], greenOne[2][2];

            const __m128i * const src = (const __m128i*)bgra;
            __m128i * const dstY = (__m128i*)y;
            for(size_t half = 0; half < 2; ++half)
                Store<align>(dstY + half, LoadAndConvertY8<align>(src + 4*half, blueRed[half], greenOne[half]));

            // Turn the pairwise sums left by the luma helpers into averages.
            Average16(blueRed);
            Average16(greenOne);

            const __m128i uLo = ConvertU16(blueRed[0], greenOne[0]);
            const __m128i uHi = ConvertU16(blueRed[1], greenOne[1]);
            Store<align>((__m128i*)u, _mm_packus_epi16(uLo, uHi));

            const __m128i vLo = ConvertV16(blueRed[0], greenOne[0]);
            const __m128i vHi = ConvertV16(blueRed[1], greenOne[1]);
            Store<align>((__m128i*)v, _mm_packus_epi16(vLo, vHi));
        }

        template <bool align> void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride,
            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
        {
            assert((width%2 == 0) && (width >= DA));
            if(align)
            {
                assert(Aligned(y) && Aligned(yStride) && Aligned(u) &&  Aligned(uStride));
                assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride));
            }

            size_t alignedWidth = AlignLo(width, DA);
            const size_t A8 = A*8;
            for(size_t row = 0; row < height; ++row)
            {
                for(size_t colUV = 0, colY = 0, colBgra = 0; colY < alignedWidth; colY += DA, colUV += A, colBgra += A8)
                    BgraToYuv422p<align>(bgra + colBgra, y + colY, u + colUV, v + colUV);
                if(width != alignedWidth)
                {
                    size_t offset = width - DA;
                    BgraToYuv422p<false>(bgra + offset*4, y + offset, u + offset/2, v + offset/2);
                }
                y += yStride;
                u += uStride;
                v += vStride;
                bgra += bgraStride;
            }
        }

        // Entry point: uses the aligned implementation only when every pointer and every stride
        // passes the Aligned() check; otherwise uses the unaligned one.
        void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride,
            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
        {
            const bool allAligned = Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride) 
                && Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride);
            if(!allAligned)
            {
                BgraToYuv422p<false>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride);
                return;
            }
            BgraToYuv422p<true>(bgra, width, height, bgraStride, y, yStride, u, uStride, v, vStride);
        }
    }
    namespace Ssse3
    {
        // Byte-selection masks for _mm_shuffle_epi8 used to turn three consecutive vectors of
        // 3-byte pixels into one vector of Bayer bytes. For every two-pixel pattern
        // (GR, BG, GB, RG) there are three masks: _0, _1 and _2 are applied to the first, second
        // and third source vector and fill disjoint output byte positions; -1 entries (high bit
        // set) yield zero, so the three shuffled vectors can be combined with a bitwise OR.
        // This assumes SIMD_MM_SETR_EPI8 lists bytes from lowest to highest, like _mm_setr_epi8,
        // and that offsets 0/1/2 inside a pixel are B/G/R as the BGR naming of the users implies.
        // Declared const: the masks are lookup tables that are only read, never written.
        const __m128i K8_SHUFFLE_GR_0 = SIMD_MM_SETR_EPI8(0x1, 0x5, 0x7, 0xB, 0xD,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1);
        const __m128i K8_SHUFFLE_GR_1 = SIMD_MM_SETR_EPI8( -1,  -1,  -1,  -1,  -1, 0x1, 0x3, 0x7, 0x9, 0xD, 0xF,  -1,  -1,  -1,  -1,  -1);
        const __m128i K8_SHUFFLE_GR_2 = SIMD_MM_SETR_EPI8( -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1, 0x3, 0x5, 0x9, 0xB, 0xF);

        const __m128i K8_SHUFFLE_BG_0 = SIMD_MM_SETR_EPI8(0x0, 0x4, 0x6, 0xA, 0xC,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1);
        const __m128i K8_SHUFFLE_BG_1 = SIMD_MM_SETR_EPI8( -1,  -1,  -1,  -1,  -1, 0x0, 0x2, 0x6, 0x8, 0xC, 0xE,  -1,  -1,  -1,  -1,  -1);
        const __m128i K8_SHUFFLE_BG_2 = SIMD_MM_SETR_EPI8( -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1, 0x2, 0x4, 0x8, 0xA, 0xE);

        const __m128i K8_SHUFFLE_GB_0 = SIMD_MM_SETR_EPI8(0x1, 0x3, 0x7, 0x9, 0xD, 0xF,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1);
        const __m128i K8_SHUFFLE_GB_1 = SIMD_MM_SETR_EPI8( -1,  -1,  -1,  -1,  -1,  -1, 0x3, 0x5, 0x9, 0xB, 0xF,  -1,  -1,  -1,  -1,  -1);
        const __m128i K8_SHUFFLE_GB_2 = SIMD_MM_SETR_EPI8( -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1, 0x1, 0x5, 0x7, 0xB, 0xD);

        const __m128i K8_SHUFFLE_RG_0 = SIMD_MM_SETR_EPI8(0x2, 0x4, 0x8, 0xA, 0xE,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1);
        const __m128i K8_SHUFFLE_RG_1 = SIMD_MM_SETR_EPI8( -1,  -1,  -1,  -1,  -1, 0x0, 0x4, 0x6, 0xA, 0xC,  -1,  -1,  -1,  -1,  -1,  -1);
        const __m128i K8_SHUFFLE_RG_2 = SIMD_MM_SETR_EPI8( -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1, 0x0, 0x2, 0x6, 0x8, 0xC, 0xE);

        // Converts one vector-width group of pixels: reads three consecutive 16-byte vectors from
        // 'bgr', shuffles each with its own mask from shuffle[format][row] and ORs the three
        // results (the masks fill disjoint byte positions) into the 16 bytes written to 'bayer'.
        // 'align' selects the aligned/unaligned flavour of the Load/Store helpers.
        template <int format, int row, bool align> 
        SIMD_INLINE void BgrToBayer(const uint8_t * bgr, uint8_t * bayer, const __m128i shuffle[4][2][3])
        {
            __m128i * const src = (__m128i*)bgr;
            const __m128i * const masks = shuffle[format][row];

            __m128i merged = _mm_shuffle_epi8(Load<align>(src + 0), masks[0]);
            merged = _mm_or_si128(merged, _mm_shuffle_epi8(Load<align>(src + 1), masks[1]));
            merged = _mm_or_si128(merged, _mm_shuffle_epi8(Load<align>(src + 2), masks[2]));

            Store<align>((__m128i*)bayer, merged);
        }

        template <int format, bool align> 
        void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride)
        {
            assert(width >= A);
            if(align)
                assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(bayer) && Aligned(bayerStride));

            size_t alignedWidth = AlignLo(width, A);

            const __m128i shuffle[4][2][3] = 
            {
                {{K8_SHUFFLE_GR_0, K8_SHUFFLE_GR_1, K8_SHUFFLE_GR_2}, {K8_SHUFFLE_BG_0, K8_SHUFFLE_BG_1, K8_SHUFFLE_BG_2}}, 
                {{K8_SHUFFLE_GB_0, K8_SHUFFLE_GB_1, K8_SHUFFLE_GB_2}, {K8_SHUFFLE_RG_0, K8_SHUFFLE_RG_1, K8_SHUFFLE_RG_2}}, 
                {{K8_SHUFFLE_RG_0, K8_SHUFFLE_RG_1, K8_SHUFFLE_RG_2}, {K8_SHUFFLE_GB_0, K8_SHUFFLE_GB_1, K8_SHUFFLE_GB_2}}, 
                {{K8_SHUFFLE_BG_0, K8_SHUFFLE_BG_1, K8_SHUFFLE_BG_2}, {K8_SHUFFLE_GR_0, K8_SHUFFLE_GR_1, K8_SHUFFLE_GR_2}} 
            };

            for(size_t row = 0; row < height; row += 2)
            {
                for(size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += 3*A)
                    BgrToBayer<format, 0, align>(bgr + offset, bayer + col, shuffle);
                if(alignedWidth != width)
                    BgrToBayer<format, 0, false>(bgr + 3*(width - A), bayer + width - A, shuffle);
                bgr += bgrStride;
                bayer += bayerStride;

                for(size_t col = 0, offset = 0; col < alignedWidth; col += A, offset += 3*A)
                    BgrToBayer<format, 1, align>(bgr + offset, bayer + col, shuffle);
                if(alignedWidth != width)
                    BgrToBayer<format, 1, false>(bgr + 3*(width - A), bayer + width - A, shuffle);
                bgr += bgrStride;
                bayer += bayerStride;
            }        
        }

        // Maps the runtime Bayer format onto the compile-time 'format' index (0..3) of the
        // row-pair converter. Width and height must both be even; an unknown format trips an assert.
        template<bool align>
        void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat)
        {
            assert((width%2 == 0) && (height%2 == 0));

            if(bayerFormat == SimdPixelFormatBayerGrbg)
                BgrToBayer<0, align>(bgr, width, height, bgrStride, bayer, bayerStride);
            else if(bayerFormat == SimdPixelFormatBayerGbrg)
                BgrToBayer<1, align>(bgr, width, height, bgrStride, bayer, bayerStride);
            else if(bayerFormat == SimdPixelFormatBayerRggb)
                BgrToBayer<2, align>(bgr, width, height, bgrStride, bayer, bayerStride);
            else if(bayerFormat == SimdPixelFormatBayerBggr)
                BgrToBayer<3, align>(bgr, width, height, bgrStride, bayer, bayerStride);
            else
                assert(0);
        }

        // Entry point: picks the aligned code path only when both pointers and both strides pass
        // the Aligned() check, otherwise falls back to the unaligned path.
        void BgrToBayer(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bayer, size_t bayerStride, SimdPixelFormatType bayerFormat)
        {
            const bool allAligned = Aligned(bgr) && Aligned(bgrStride) && Aligned(bayer) && Aligned(bayerStride);
            if(!allAligned)
            {
                BgrToBayer<false>(bgr, width, height, bgrStride, bayer, bayerStride, bayerFormat);
                return;
            }
            BgrToBayer<true>(bgr, width, height, bgrStride, bayer, bayerStride, bayerFormat);
        }
    }