Example #1
    // SSE2 kernel of OpenCV's per-tile moment functor: for one image row it
    // computes x0 = sum(p), x1 = sum(x*p), x2 = sum(x^2*p), x3 = sum(x^3*p)
    // and returns the number of pixels consumed by the vector loop (the
    // caller finishes the remaining pixels with scalar code).
    int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3)
    {
        int x = 0;

        if( useSIMD )
        {
            __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
            __m128i dx = _mm_set1_epi16(8);
            __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;

            for( ; x <= len - 8; x += 8 )
            {
                __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
                __m128i sx = _mm_mullo_epi16(qx, qx);

                qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
                qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
                qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
                qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx));

                qx = _mm_add_epi16(qx, dx);
            }

            int CV_DECL_ALIGNED(16) buf[4];
            _mm_store_si128((__m128i*)buf, qx0);
            x0 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx1);
            x1 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx2);
            x2 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx3);
            x3 = buf[0] + buf[1] + buf[2] + buf[3];
        }

        return x;
    }
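For reference, a scalar version of the same per-row accumulation (it mirrors the tail loop of Example #4 below; this is a sketch, not OpenCV's own API):

// Scalar equivalent of the SSE2 loop above: the four row sums that are
// later weighted by powers of y to form the image moments.
static void row_sums_scalar(const uchar* ptr, int len,
                            int& x0, int& x1, int& x2, int& x3)
{
    for (int x = 0; x < len; x++)
    {
        int p = ptr[x];
        int xp = x * p;      // x * p
        int xxp = xp * x;    // x^2 * p
        x0 += p;             // sum(p)
        x1 += xp;            // sum(x * p)
        x2 += xxp;           // sum(x^2 * p)
        x3 += xxp * x;       // sum(x^3 * p)
    }
}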
Example #2
static INLINE __m128i load_coefficients(const tran_low_t* coeff_ptr) {
#if CONFIG_VPX_HIGHBITDEPTH
  return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
                        (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
                        (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
                        (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
#else
  return _mm_load_si128((const __m128i*)coeff_ptr);
#endif
}
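The preprocessor branch exists because the element type behind coeff_ptr changes with the build configuration. A sketch of the assumed typedef (libvpx defines tran_low_t along these lines; shown here only to make the two paths concrete):

#if CONFIG_VPX_HIGHBITDEPTH
typedef int32_t tran_low_t;  /* eight coefficients span 32 bytes, so they are
                                narrowed lane by lane with _mm_setr_epi16 */
#else
typedef int16_t tran_low_t;  /* eight coefficients fill exactly one __m128i,
                                so a single aligned load suffices */
#endif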
Example #3
static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
  if (sizeof(tran_low_t) == 4) {
    return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
                          (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
                          (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
                          (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
  } else {
    return _mm_load_si128((const __m128i *)coeff_ptr);
  }
}
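Example #3 reaches the same result as Example #2 without the preprocessor: sizeof(tran_low_t) is a compile-time constant, so any optimizing compiler folds away the dead branch, and because both branches must still compile, the unused path keeps type-checking instead of silently rotting behind an #ifdef.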
Example #4
template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* moments )
{
    typedef uchar T;
    typedef int WT;
    typedef int MT;
    Size size = img.size();
    int y;
    MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
    bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);

    for( y = 0; y < size.height; y++ )
    {
        const T* ptr = img.ptr<T>(y);
        int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x = 0;

        if( useSIMD )
        {
            __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
            __m128i dx = _mm_set1_epi16(8);
            __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;

            for( ; x <= size.width - 8; x += 8 )
            {
                __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
                qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
                __m128i px = _mm_mullo_epi16(p, qx);
                __m128i sx = _mm_mullo_epi16(qx, qx);
                qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
                qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
                qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx));

                qx = _mm_add_epi16(qx, dx);
            }
            int CV_DECL_ALIGNED(16) buf[4];
            _mm_store_si128((__m128i*)buf, qx0);
            x0 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx1);
            x1 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx2);
            x2 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx3);
            x3 = buf[0] + buf[1] + buf[2] + buf[3];
        }

        for( ; x < size.width; x++ )
        {
            WT p = ptr[x];
            WT xp = x * p, xxp;

            x0 += p;
            x1 += xp;
            xxp = xp * x;
            x2 += xxp;
            x3 += xxp * x;
        }

        WT py = y * x0, sy = y*y;

        mom[9] += ((MT)py) * sy;  // m03
        mom[8] += ((MT)x1) * sy;  // m12
        mom[7] += ((MT)x2) * y;  // m21
        mom[6] += x3;             // m30
        mom[5] += x0 * sy;        // m02
        mom[4] += x1 * y;         // m11
        mom[3] += x2;             // m20
        mom[2] += py;             // m01
        mom[1] += x1;             // m10
        mom[0] += x0;             // m00
    }

    for(int x = 0; x < 10; x++ )
        moments[x] = (double)mom[x];
}
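Each row thus contributes the four sums from Example #1, which are then weighted by powers of y to build the ten raw image moments m_pq = sum_y sum_x x^p * y^q * I(x, y) for p + q <= 3 (m00 through m03), exactly as the labels on the mom[] updates indicate.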
Example #5
void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
                                      const uint8_t *b, unsigned int width,
                                      unsigned int height, int strength,
                                      int weight, uint32_t *accumulator,
                                      uint16_t *count) {
  unsigned int h;
  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;

  assert(strength >= 0);
  assert(strength <= 6);

  assert(weight >= 0);
  assert(weight <= 2);

  assert(width == 8 || width == 16);
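  /* Sliding-window structure (readable from the code below): sum_row_a/b/c
     hold per-row neighborhood sums produced by sum_8()/sum_16(); the h loop
     combines three consecutive rows and loads only one new row per
     iteration, so each output pixel effectively sees a 3x3 window. */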

  if (width == 8) {
    __m128i sum_row_a, sum_row_b, sum_row_c;
    __m128i mul_constants = _mm_setr_epi16(
        NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

    sum_8(a, b, &sum_row_a);
    sum_8(a + stride, b + width, &sum_row_b);
    sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_c, b, count, accumulator);

    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;

    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);

    for (h = 0; h < height - 2; ++h) {
      sum_8(a, b + width, &sum_row_c);
      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
      sum_row_a =
          average_8(sum_row_a, mul_constants, strength, rounding, weight);
      accumulate_and_store_8(sum_row_a, b, count, accumulator);

      a += stride;
      b += width;
      count += width;
      accumulator += width;

      sum_row_a = sum_row_b;
      sum_row_b = sum_row_c;
    }

    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_a, b, count, accumulator);

  } else {  // width == 16
    __m128i sum_row_a_0, sum_row_a_1;
    __m128i sum_row_b_0, sum_row_b_1;
    __m128i sum_row_c_0, sum_row_c_1;
    __m128i mul_constants_0 = _mm_setr_epi16(
                NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6),
            mul_constants_1 = _mm_setr_epi16(
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

    sum_16(a, b, &sum_row_a_0, &sum_row_a_1);
    sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);

    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);

    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);

    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;

    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
    for (h = 0; h < height - 2; ++h) {
      sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);

      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);

      average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1,
                 strength, rounding, weight);
      accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);

      a += stride;
      b += width;
      count += width;
      accumulator += width;

      sum_row_a_0 = sum_row_b_0;
      sum_row_a_1 = sum_row_b_1;
      sum_row_b_0 = sum_row_c_0;
      sum_row_b_1 = sum_row_c_1;
    }

    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);

    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
  }
}
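A note on the multipliers: NEIGHBOR_CONSTANT_4/6/9 are presumably fixed-point reciprocals of the neighborhood size consumed by average_8()/average_16(): a corner pixel accumulates 4 taps, an edge pixel 6, and an interior pixel 9, which is why the first and last lanes (and the first and last processed rows) get the smaller divisors.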
Example #6
mlib_status
mlib_VideoColorJFIFYCC2RGB444_S16_naligned(
	mlib_s16 *rgb,
	const mlib_s16 *y,
	const mlib_s16 *cb,
	const mlib_s16 *cr,
	mlib_s32 n)
{
	/* 0 & 1.402*16384 */
	const __m128i x_c1 = _mm_setr_epi16(0, 22970, 0, 22970,
		0, 22970, 0, 22970);

	/* -0.34414*16384 & -0.71414*16384 */
	const __m128i x_c2 = _mm_setr_epi16(-5638, -11700, -5638, -11700,
		-5638, -11700, -5638, -11700);

	/* 1.772*16384 & 0 */
	const __m128i x_c3 = _mm_setr_epi16(29032, 0, 29032, 0,
		29032, 0, 29032, 0);

	const __m128i x_coff = _mm_set1_epi16(2048);
	const __m128i x_cps1 = _mm_set1_epi32(0x8000);
	const __m128i x_cps2 = _mm_set1_epi16(0x8000);
	const __m128i x_zero = _mm_setzero_si128();
	const __m128i x_mask1 = _mm_setr_epi32(0xffffffff, 0xffff, 0, 0);
	const __m128i x_mask2 = _mm_setr_epi32(0, 0xffff0000, 0xffffffff, 0);

	/* __m128i variables */
	__m128i x_y, x_cb, x_cr, x_r, x_g, x_b, x_y1, x_y2;
	__m128i x_r1, x_r2, x_g1, x_g2, x_b1, x_b2, x_t1, x_t2;
	__m128i x_rgbl, x_rgbh, x_rgl, x_rgh, x_bbl, x_bbh;
	__m128i x_cbcr1, x_cbcr2;

	/* pointers */
	__m128i *px_y, *px_cb, *px_cr;
	mlib_s16 *prgb;

	/* other var */
	mlib_d64 fr, fg, fb, fy, fcb, fcr;
	mlib_s32 i;

	px_y = (__m128i *)y;
	px_cb = (__m128i *)cb;
	px_cr = (__m128i *)cr;
	prgb = rgb;
	i = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (; i <= n - 16; i += 8)	{
		x_y = _mm_loadu_si128(px_y);
		x_y1 = _mm_unpacklo_epi16(x_y, x_zero);
		x_y1 = _mm_slli_epi32(x_y1, 4);
		x_y2 = _mm_unpackhi_epi16(x_y, x_zero);
		x_y2 = _mm_slli_epi32(x_y2, 4);
		px_y++;
		x_cb = _mm_loadu_si128(px_cb);
		x_cb = _mm_sub_epi16(x_cb, x_coff);
		px_cb++;
		x_cr = _mm_loadu_si128(px_cr);
		x_cr = _mm_sub_epi16(x_cr, x_coff);
		px_cr++;
		x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr);
		x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr);

		/* calc r/g/b */
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c1);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_r1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c2);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_g1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c3);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_b1 = _mm_add_epi32(x_t1, x_y1);

		x_t2 = _mm_madd_epi16(x_cbcr2, x_c1);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_r2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c2);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_g2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c3);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_b2 = _mm_add_epi32(x_t2, x_y2);

		/* signed pack & shift */
		x_r1 = _mm_sub_epi32(x_r1, x_cps1);
		x_r2 = _mm_sub_epi32(x_r2, x_cps1);
		x_r = _mm_packs_epi32(x_r1, x_r2);
		x_r = _mm_add_epi16(x_r, x_cps2);
		x_r = _mm_srli_epi16(x_r, 4);

		x_g1 = _mm_sub_epi32(x_g1, x_cps1);
		x_g2 = _mm_sub_epi32(x_g2, x_cps1);
		x_g = _mm_packs_epi32(x_g1, x_g2);
		x_g = _mm_add_epi16(x_g, x_cps2);
		x_g = _mm_srli_epi16(x_g, 4);

		x_b1 = _mm_sub_epi32(x_b1, x_cps1);
		x_b2 = _mm_sub_epi32(x_b2, x_cps1);
		x_b = _mm_packs_epi32(x_b1, x_b2);
		x_b = _mm_add_epi16(x_b, x_cps2);
		x_b = _mm_srli_epi16(x_b, 4);

		/* create rgb sequences */
		x_rgl = _mm_unpacklo_epi16(x_r, x_g);
		x_rgh = _mm_unpackhi_epi16(x_r, x_g);
		x_bbl = _mm_unpacklo_epi16(x_b, x_b);
		x_bbh = _mm_unpackhi_epi16(x_b, x_b);

		/* save */
		x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbh);

		x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh);
		PACK_RGB1(x_rgbh);
	}

	if (i <= (n - 8)) {
		x_y = _mm_loadu_si128(px_y);
		x_y1 = _mm_unpacklo_epi16(x_y, x_zero);
		x_y1 = _mm_slli_epi32(x_y1, 4);
		x_y2 = _mm_unpackhi_epi16(x_y, x_zero);
		x_y2 = _mm_slli_epi32(x_y2, 4);
		px_y++;
		x_cb = _mm_loadu_si128(px_cb);
		x_cb = _mm_sub_epi16(x_cb, x_coff);
		px_cb++;
		x_cr = _mm_loadu_si128(px_cr);
		x_cr = _mm_sub_epi16(x_cr, x_coff);
		px_cr++;
		x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr);
		x_cbcr2 = _mm_unpackhi_epi16(x_cb, x_cr);

		/* calc r/g/b */
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c1);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_r1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c2);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_g1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c3);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_b1 = _mm_add_epi32(x_t1, x_y1);

		x_t2 = _mm_madd_epi16(x_cbcr2, x_c1);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_r2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c2);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_g2 = _mm_add_epi32(x_t2, x_y2);
		x_t2 = _mm_madd_epi16(x_cbcr2, x_c3);
		x_t2 = _mm_srai_epi32(x_t2, 10);
		x_b2 = _mm_add_epi32(x_t2, x_y2);

		/* signed pack & shift */
		x_r1 = _mm_sub_epi32(x_r1, x_cps1);
		x_r2 = _mm_sub_epi32(x_r2, x_cps1);
		x_r = _mm_packs_epi32(x_r1, x_r2);
		x_r = _mm_add_epi16(x_r, x_cps2);
		x_r = _mm_srli_epi16(x_r, 4);

		x_g1 = _mm_sub_epi32(x_g1, x_cps1);
		x_g2 = _mm_sub_epi32(x_g2, x_cps1);
		x_g = _mm_packs_epi32(x_g1, x_g2);
		x_g = _mm_add_epi16(x_g, x_cps2);
		x_g = _mm_srli_epi16(x_g, 4);

		x_b1 = _mm_sub_epi32(x_b1, x_cps1);
		x_b2 = _mm_sub_epi32(x_b2, x_cps1);
		x_b = _mm_packs_epi32(x_b1, x_b2);
		x_b = _mm_add_epi16(x_b, x_cps2);
		x_b = _mm_srli_epi16(x_b, 4);

		/* create rgb sequences */
		x_rgl = _mm_unpacklo_epi16(x_r, x_g);
		x_rgh = _mm_unpackhi_epi16(x_r, x_g);
		x_bbl = _mm_unpacklo_epi16(x_b, x_b);
		x_bbh = _mm_unpackhi_epi16(x_b, x_b);

		/* save */
		x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbh);

		x_rgbl = _mm_unpacklo_epi32(x_rgh, x_bbh);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgh, x_bbh);
		PACK_RGB2(x_rgbh);

		i += 8;
	}

	if (i <= (n - 4)) {
		x_y = _mm_loadl_epi64(px_y);
		x_y1 = _mm_unpacklo_epi16(x_y, x_zero);
		x_y1 = _mm_slli_epi32(x_y1, 4);
		px_y = (__m128i *)(((__m64 *)px_y) + 1);
		x_cb = _mm_loadl_epi64(px_cb);
		x_cb = _mm_sub_epi16(x_cb, x_coff);
		px_cb = (__m128i *)(((__m64 *)px_cb) + 1);
		x_cr = _mm_loadl_epi64(px_cr);
		x_cr = _mm_sub_epi16(x_cr, x_coff);
		px_cr = (__m128i *)(((__m64 *)px_cr) + 1);
		x_cbcr1 = _mm_unpacklo_epi16(x_cb, x_cr);

		/* calc r/g/b */
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c1);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_r1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c2);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_g1 = _mm_add_epi32(x_t1, x_y1);
		x_t1 = _mm_madd_epi16(x_cbcr1, x_c3);
		x_t1 = _mm_srai_epi32(x_t1, 10);
		x_b1 = _mm_add_epi32(x_t1, x_y1);

		/* signed pack & shift */
		x_r1 = _mm_sub_epi32(x_r1, x_cps1);
		x_r = _mm_packs_epi32(x_r1, x_zero);
		x_r = _mm_add_epi16(x_r, x_cps2);
		x_r = _mm_srli_epi16(x_r, 4);

		x_g1 = _mm_sub_epi32(x_g1, x_cps1);
		x_g = _mm_packs_epi32(x_g1, x_zero);
		x_g = _mm_add_epi16(x_g, x_cps2);
		x_g = _mm_srli_epi16(x_g, 4);

		x_b1 = _mm_sub_epi32(x_b1, x_cps1);
		x_b = _mm_packs_epi32(x_b1, x_zero);
		x_b = _mm_add_epi16(x_b, x_cps2);
		x_b = _mm_srli_epi16(x_b, 4);

		/* create rgb sequences */
		x_rgl = _mm_unpacklo_epi16(x_r, x_g);
		x_bbl = _mm_unpacklo_epi16(x_b, x_b);

		/* save */
		x_rgbl = _mm_unpacklo_epi32(x_rgl, x_bbl);
		PACK_RGB1(x_rgbl);

		x_rgbh = _mm_unpackhi_epi32(x_rgl, x_bbl);
		PACK_RGB2(x_rgbh);

		i += 4;
	}

	/* pure C implementation */
	for (; i < n; i++) {
		fy = y[i] * SCALE - SAT;
		fcb = (mlib_d64)((cb[i] - 2048) << 20);
		fcr = (mlib_d64)((cr[i] - 2048) << 20);
		fr = fy + 1.40200f * fcr;
		fg = fy - 0.34414f * fcb - 0.71414f * fcr;
		fb = fy + 1.77200f * fcb;
		rgb[3 * i] = CLAMP_U12(fr);
		rgb[3 * i + 1] = CLAMP_U12(fg);
		rgb[3 * i + 2] = CLAMP_U12(fb);
	}

	return (MLIB_SUCCESS);
}
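On the fixed-point layout of this routine: the coefficients are stored in Q14 (scaled by 16384, as the comments note), and each _mm_madd_epi16 product is shifted right by 10, leaving a residual factor of 16 that matches the Y samples pre-shifted left by 4; the final 4-bit right shift returns the results to the 12-bit output range. The 0x8000 biases (x_cps1/x_cps2) recenter the unsigned intermediates into signed range so _mm_packs_epi32 saturates correctly, after which the bias is added back.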
Example #7
void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  const int stride = pitch >> 1;
  int pass;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in0, in1, in2, in3;
  // Load inputs.
  {
    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
    in2  = _mm_loadl_epi64((const __m128i *)(input +  2 * stride));
    in3  = _mm_loadl_epi64((const __m128i *)(input +  3 * stride));
    // x = x << 4
    in0 = _mm_slli_epi16(in0, 4);
    in1 = _mm_slli_epi16(in1, 4);
    in2 = _mm_slli_epi16(in2, 4);
    in3 = _mm_slli_epi16(in3, 4);
    // if (i == 0 && input[0]) input[0] += 1;
    {
      // The mask will only record whether the first value is zero; every
      // other comparison fails, since a value shifted left by 4 (above)
      // can never equal one. To increment in the non-zero case, we
      // add the mask and one for the first element:
      //   - if zero, mask = -1, v = v - 1 + 1 = v
      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
      in0 = _mm_add_epi16(in0, mask);
      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
    }
  }
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // Transform 1/2: Add/subtract
    const __m128i r0 = _mm_add_epi16(in0, in3);
    const __m128i r1 = _mm_add_epi16(in1, in2);
    const __m128i r2 = _mm_sub_epi16(in1, in2);
    const __m128i r3 = _mm_sub_epi16(in0, in3);
    // Transform 1/2: Interleave to do the multiply by constants which gets us
    //                into 32 bits.
    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
    // Combine and transpose
    const __m128i res0 = _mm_packs_epi32(w0, w2);
    const __m128i res1 = _mm_packs_epi32(w4, w6);
    // 00 01 02 03 20 21 22 23
    // 10 11 12 13 30 31 32 33
    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
    // 00 10 01 11 02 12 03 13
    // 20 30 21 31 22 32 23 33
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
    // 00 10 20 30 01 11 21 31      in0 contains 0 followed by 1
    // 02 12 22 32 03 13 23 33      in2 contains 2 followed by 3
    if (0 == pass) {
      // Extract values in the high part for second pass as transform code
      // only uses the first four values.
      in1 = _mm_unpackhi_epi64(in0, in0);
      in3 = _mm_unpackhi_epi64(in2, in2);
    } else {
      // Post-condition output and store it (v + 1) >> 2, taking advantage
      // of the fact 1/3 are stored just after 0/2.
      __m128i out01 = _mm_add_epi16(in0, kOne);
      __m128i out23 = _mm_add_epi16(in2, kOne);
      out01 = _mm_srai_epi16(out01, 2);
      out23 = _mm_srai_epi16(out23, 2);
      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
    }
  }
}
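For context, pair_set_epi16 above packs the 16-bit pair (a, b) into every 32-bit lane so that _mm_madd_epi16 computes a*e0 + b*e1 per lane. A minimal sketch of such a helper, written from that described behavior rather than copied from libvpx:

static INLINE __m128i pair_set_epi16(int a, int b) {
  /* Every 32-bit lane holds a in its low 16 bits and b in its high 16 bits,
     i.e. the epi16 element pattern a, b, a, b, a, b, a, b. */
  return _mm_set1_epi32((int)(((uint16_t)a) | ((uint32_t)(uint16_t)b << 16)));
}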