static void GF_FUNC_ALIGN VS_CC
float_to_dst_gb_16bit(const float *srcp, uint8_t *d, int width, int height,
                      int src_stride, int dst_stride, float th, int bits)
{
    uint16_t *dstp = (uint16_t *)d;
    dst_stride /= 2;    /* bytes -> uint16_t elements */
    __m128i tmax = _mm_set1_epi32((1 << bits) - 1);    /* clip ceiling for the target bit depth */

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 8) {
            /* Round to nearest, clamp to tmax, then pack the two 4x32-bit
               vectors into one 8x16-bit store. _mm_packs_epi32 saturates as
               signed words, which is safe while bits <= 15; a full 16-bit
               depth would need an unsigned pack. */
            __m128i xmm0 = _mm_cvtps_epi32(_mm_load_ps(srcp + x));
            __m128i xmm1 = _mm_cvtps_epi32(_mm_load_ps(srcp + x + 4));
            xmm0 = _mm_packs_epi32(mm_min_epi32(tmax, xmm0),
                                   mm_min_epi32(tmax, xmm1));

            _mm_store_si128((__m128i *)(dstp + x), xmm0);
        }
        srcp += src_stride;
        dstp += dst_stride;
    }
}
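For reference, the loop above reduces to the following per-pixel scalar
operation. This is a minimal sketch, assuming non-negative inputs, bits <= 15
(so the signed pack cannot saturate), and strides already expressed in
elements; float_to_u16_scalar is a hypothetical name, not part of the
original source.

#include <math.h>      /* lrintf */
#include <stdint.h>

static void float_to_u16_scalar(const float *srcp, uint16_t *dstp,
                                int width, int height,
                                int src_stride, int dst_stride, int bits)
{
    const int32_t tmax = (1 << bits) - 1;         /* same ceiling as the SSE2 tmax */
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int32_t v = (int32_t)lrintf(srcp[x]); /* _mm_cvtps_epi32 rounds to nearest */
            dstp[x] = (uint16_t)(v > tmax ? tmax : v);
        }
        srcp += src_stride;
        dstp += dst_stride;
    }
}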
Example #2
inline FORCE_INLINE __m128i mm_cvtps_ph(__m128 x)
{
	/* 2^-112 as a float: multiplying by this rebases the exponent from
	   single-precision bias (127) to half-precision bias (15). */
	__m128 magic = _mm_castsi128_ps(_mm_set1_epi32((uint32_t)15 << 23));
	__m128i inf = _mm_set1_epi32((uint32_t)255UL << 23);   /* single-precision infinity */
	__m128i f16inf = _mm_set1_epi32((uint32_t)31UL << 23); /* half-precision infinity, pre-shift */
	__m128i sign_mask = _mm_set1_epi32(0x80000000UL);
	__m128i round_mask = _mm_set1_epi32(~0x0FFFU);         /* drops mantissa bits lost in f16 */

	__m128i ret_0x7E00 = _mm_set1_epi32(0x7E00);           /* canonical f16 NaN */
	__m128i ret_0x7C00 = _mm_set1_epi32(0x7C00);           /* f16 infinity */

	__m128i f, sign, ge_inf, eq_inf;

	f = _mm_castps_si128(x);
	sign = _mm_and_si128(f, sign_mask);
	f = _mm_xor_si128(f, sign);

	/* Integer compares are valid because the sign bit has been cleared:
	   f > inf only for NaN payloads, f == inf for exact infinities. */
	ge_inf = _mm_cmpgt_epi32(f, inf);
	eq_inf = _mm_cmpeq_epi32(f, inf);

	/* Drop sub-f16 mantissa bits, rescale the exponent via the magic
	   multiply, then add the rounding increment back (subtracting
	   round_mask is adding 0x1000). */
	f = _mm_and_si128(f, round_mask);
	f = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(f), magic));
	f = _mm_sub_epi32(f, round_mask);

	f = mm_min_epi32(f, f16inf); /* finite overflow clamps to f16 infinity */
	f = _mm_srli_epi32(f, 13);   /* align to the 16-bit half layout */

	/* NaN inputs become the canonical f16 NaN and exact infinities the
	   f16 infinity; SSE2 lacks an integer blend, so select with
	   and/andnot/or rather than a float-domain blendv. */
	f = _mm_or_si128(_mm_and_si128(ge_inf, ret_0x7E00),
	                 _mm_andnot_si128(ge_inf, f));
	f = _mm_or_si128(_mm_and_si128(eq_inf, ret_0x7C00),
	                 _mm_andnot_si128(eq_inf, f));

	sign = _mm_srli_epi32(sign, 16); /* move the sign bit to the f16 position */
	f = _mm_or_si128(f, sign);

	f = mm_packus_epi32(f, _mm_setzero_si128()); /* results in the low 64 bits */
	return f;
}
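A minimal usage sketch for the routine above: the four packed halves land in
the low 64 bits of the result, so a driver loop can store them with
_mm_storel_epi64. The name float_to_half_line and the assumption that width
is a multiple of 4 are illustrative, not from the original source.

static void float_to_half_line(const float *srcp, uint16_t *dstp, int width)
{
	for (int x = 0; x < width; x += 4) {
		__m128i h = mm_cvtps_ph(_mm_loadu_ps(srcp + x));
		_mm_storel_epi64((__m128i *)(dstp + x), h); /* 4 halves = 8 bytes */
	}
}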
Example #3
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
                uint8_t *d, const uint8_t *s, edge_t *eh, uint16_t plane_max)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;   /* bytes -> uint16_t elements */
    bstride /= 2;

    /* Five-line ring buffer over the padded work area; the +8 offset
       leaves headroom for the unaligned +/-2 horizontal loads. */
    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    /* Prime the ring; rows +2/+1 stand in for the off-image rows above,
       and p2 holds the current line. */
    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    /* alpha*max(|gx|,|gy|) + beta*min(|gx|,|gy|) approximates the gradient
       magnitude sqrt(gx*gx + gy*gy). */
    __m128 alpha = _mm_set1_ps(0.96043387f);
    __m128 beta = _mm_set1_ps(0.39782473f);
    __m128i pmax = _mm_set1_epi32(0xFFFF);
    __m128i min = _mm_set1_epi16((int16_t)eh->min);
    __m128i max = _mm_set1_epi16((int16_t)eh->max);

    for (int y = 0; y < height; y++) {
        /* Advance one line, mirroring at the bottom edge. */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);
        uint16_t *posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2}; /* horizontal taps */
        uint16_t *posv[] = {p0, p1, p3, p4};                 /* vertical taps */

        for (int x = 0; x < width; x += 8) {
            __m128 sumx[2] = {_mm_setzero_ps(), _mm_setzero_ps()};
            __m128 sumy[2] = {_mm_setzero_ps(), _mm_setzero_ps()};

            /* Accumulate weighted horizontal (sumx) and vertical (sumy)
               gradients, widening each 8x16-bit load to two float quads. */
            for (int i = 0; i < 4; i++) {
                __m128 xmul = _mm_load_ps(ar_mulxf[i]);
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x));
                __m128i xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sumx[0] = _mm_add_ps(sumx[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul));
                sumx[1] = _mm_add_ps(sumx[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul));

                xmul = _mm_load_ps(ar_mulyf[i]);
                xmm0 = _mm_load_si128((__m128i *)(posv[i] + x));
                xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sumy[0] = _mm_add_ps(sumy[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul));
                sumy[1] = _mm_add_ps(sumy[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul));
            }

            __m128i out[2];
            for (int i = 0; i < 2; i++) {
                sumx[i] = mm_abs_ps(sumx[i]);
                sumy[i] = mm_abs_ps(sumy[i]);
                __m128 t0 = _mm_max_ps(sumx[i], sumy[i]);
                __m128 t1 = _mm_min_ps(sumx[i], sumy[i]);
                /* |g| ~= alpha*max + beta*min, a cheap stand-in for sqrt() */
                t0 = _mm_add_ps(_mm_mul_ps(alpha, t0), _mm_mul_ps(beta, t1));
                out[i] = _mm_srli_epi32(_mm_cvtps_epi32(t0), eh->rshift);
                out[i] = mm_min_epi32(out[i], pmax); /* clamp to 16 bits */
            }
            out[0] = mm_cast_epi32(out[0], out[1]); /* pack to 8x16-bit */

            /* Threshold: lanes at or above eh->max saturate to all ones... */
            out[1] = MM_MIN_EPU16(out[0], max);
            out[1] = _mm_cmpeq_epi16(out[1], max);
            out[0] = _mm_or_si128(out[1], out[0]);

            /* ...and lanes at or below eh->min are zeroed. */
            out[1] = MM_MAX_EPU16(out[0], min);
            out[1] = _mm_cmpeq_epi16(out[1], min);
            out[0] = _mm_andnot_si128(out[1], out[0]);

            _mm_store_si128((__m128i *)(dstp + x), out[0]);
        }
        dstp += stride;
        /* Rotate the ring buffer; p4 wraps to the start when it reaches
           the end of the work area. */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
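The alpha/beta constants above implement the classic "alpha max plus beta
min" estimate of a vector's magnitude, which avoids the sqrt in
sqrt(gx*gx + gy*gy) at the cost of a few percent of error. A scalar sketch
(gradient_magnitude is a hypothetical name, not part of the original
source):

#include <math.h>   /* fabsf */

static inline float gradient_magnitude(float gx, float gy)
{
    const float alpha = 0.96043387f, beta = 0.39782473f;
    float ax = fabsf(gx), ay = fabsf(gy);
    float hi = ax > ay ? ax : ay;
    float lo = ax > ay ? ay : ax;
    return alpha * hi + beta * lo;   /* ~= sqrtf(gx*gx + gy*gy) */
}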
Example #4
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
                int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;   /* bytes -> uint16_t elements */
    bstride /= 2;

    /* Five-line ring buffer; the +8 offset leaves headroom for the
       unaligned +/-2 horizontal loads. */
    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    /* Prime the ring; rows +2/+1 stand in for the off-image rows above,
       and p2 holds the current line. */
    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    __m128i max = _mm_set1_epi32(0xFFFF);
    /* Broadcast each kernel tap once, outside the pixel loop. */
    __m128 matrix[25];
    for (int i = 0; i < 25; i++) {
        matrix[i] = _mm_set1_ps((float)ch->m[i]);
    }

    for (int y = 0; y < height; y++) {
        /* Advance one line, mirroring at the bottom edge. */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);
        /* The 25 taps of the 5x5 window centered on the current line p2. */
        uint16_t *array[] = {
            p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2,
            p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2,
            p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2,
            p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2,
            p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2
        };
        
        for (int x = 0; x < width; x += 8) {
            __m128 sum[2] = {_mm_setzero_ps(), _mm_setzero_ps()};

            /* Multiply-accumulate all 25 taps, widening each 8x16-bit load
               to two float quads. */
            for (int i = 0; i < 25; i++) {
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x));
                __m128 xmm1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero));
                __m128 xmm2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero));
                xmm1 = _mm_mul_ps(xmm1, matrix[i]);
                xmm2 = _mm_mul_ps(xmm2, matrix[i]);
                sum[0] = _mm_add_ps(sum[0], xmm1);
                sum[1] = _mm_add_ps(sum[1], xmm2);
            }

            __m128i sumi[2];
            for (int i = 0; i < 2; i++) {
                sum[i] = _mm_mul_ps(sum[i], rdiv);
                sum[i] = _mm_add_ps(sum[i], bias);
                if (!ch->saturate) {
                    sum[i] = mm_abs_ps(sum[i]); /* fold negatives back instead of clipping */
                }
                sumi[i] = _mm_cvtps_epi32(sum[i]);
                sumi[i] = mm_min_epi32(sumi[i], max);   /* clamp to the 16-bit ceiling */
                __m128i mask = _mm_cmpgt_epi32(sumi[i], zero);
                sumi[i] = _mm_and_si128(sumi[i], mask); /* zero out negative results */
            }

            sumi[0] = mm_cast_epi32(sumi[0], sumi[1]); /* pack to 8x16-bit */

            _mm_store_si128((__m128i *)(dstp + x), sumi[0]);
        }
        dstp += stride;
        /* Rotate the ring buffer; p4 wraps to the start when it reaches
           the end of the work area. */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
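For reference, one output pixel of the 5x5 convolution above reduces to the
scalar computation below. This is an illustrative sketch: conv5x5_pixel is a
hypothetical name, taps[] stands for the 25 per-row pointers in array[], and
m[] is ch->m converted to float.

#include <math.h>   /* fabsf, lrintf */

static inline uint16_t conv5x5_pixel(uint16_t *taps[25], int x,
                                     const float m[25],
                                     float rdiv, float bias, int saturate)
{
    float sum = 0.0f;
    for (int i = 0; i < 25; i++)
        sum += (float)taps[i][x] * m[i];
    sum = sum * rdiv + bias;
    if (!saturate)
        sum = fabsf(sum);            /* fold negatives back instead of clipping */
    long v = lrintf(sum);
    if (v < 0) v = 0;                /* matches the cmpgt/and mask */
    if (v > 0xFFFF) v = 0xFFFF;      /* matches mm_min_epi32 against 0xFFFF */
    return (uint16_t)v;
}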