Esempio n. 1
0
/*
 * In-place SSE2 conversion of one block of 4096 16-bit samples per plane
 * from YCbCr to RGB.
 *
 *   y_r_buffer  - Y on entry, R on return
 *   cb_g_buffer - Cb on entry, G on return
 *   cr_b_buffer - Cr on entry, B on return
 *
 * The planes are read and written with aligned 128-bit accesses
 * (_mm_load_si128 / _mm_store_si128), so each buffer must be 16-byte aligned.
 *
 * Per sample:
 *   y = (Y >> 5) + 128
 *   R = y + ((Cr >> 5) + (Cr >> 7) + (Cr >> 8) + (Cr >> 11) + (Cr >> 12) + (Cr >> 13))
 *   G = y - ((Cb >> 7) + (Cb >> 9) + (Cb >> 10))
 *         - ((Cr >> 6) + (Cr >> 8) + (Cr >> 9) + (Cr >> 11) + (Cr >> 12) + (Cr >> 13))
 *   B = y + ((Cb >> 5) + (Cb >> 6) + (Cb >> 7) + (Cb >> 11) + (Cb >> 13))
 *
 * NOTE(review): _mm_between_epi16 is defined outside this section; it is
 * presumably a macro that clamps its first argument in place to the given
 * bounds (here 0..255) - confirm against its definition.
 */
static void rfx_decode_ycbcr_to_rgb_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
{
	__m128i floor_val = _mm_setzero_si128();
	__m128i ceil_val = _mm_set1_epi16(255);
	__m128i bias = _mm_set1_epi16(128);

	__m128i* plane_y_r = (__m128i*) y_r_buffer;
	__m128i* plane_cb_g = (__m128i*) cb_g_buffer;
	__m128i* plane_cr_b = (__m128i*) cr_b_buffer;

	/* Number of 128-bit vectors per plane, and vectors per cache line. */
	const int vec_count = (int) (4096 * sizeof(sint16) / sizeof(__m128i));
	const int prefetch_stride = (int) (CACHE_LINE_BYTES / sizeof(__m128i));

	__m128i luma;
	__m128i chroma_b;
	__m128i chroma_r;
	__m128i term;
	__m128i out;

	int n;

	/* Issue one prefetch per cache line of every plane before converting. */
	n = 0;
	while (n < vec_count)
	{
		_mm_prefetch((char*) (plane_y_r + n), _MM_HINT_NTA);
		_mm_prefetch((char*) (plane_cb_g + n), _MM_HINT_NTA);
		_mm_prefetch((char*) (plane_cr_b + n), _MM_HINT_NTA);
		n += prefetch_stride;
	}

	for (n = 0; n < vec_count; n++)
	{
		/* luma = (Y >> 5) + 128 */
		luma = _mm_add_epi16(_mm_srai_epi16(_mm_load_si128(plane_y_r + n), 5), bias);

		chroma_r = _mm_load_si128(plane_cr_b + n);

		/* Red: luma plus the Cr shift sum; result replaces Y in plane 0. */
		term = _mm_add_epi16(_mm_srai_epi16(chroma_r, 5), _mm_srai_epi16(chroma_r, 7));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_r, 8));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_r, 11));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_r, 12));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_r, 13));
		out = _mm_add_epi16(luma, term);
		_mm_between_epi16(out, floor_val, ceil_val);
		_mm_store_si128(plane_y_r + n, out);

		chroma_b = _mm_load_si128(plane_cb_g + n);

		/* Green: luma minus the combined Cb and Cr shift sums; replaces Cb. */
		term = _mm_add_epi16(_mm_srai_epi16(chroma_b, 7), _mm_srai_epi16(chroma_b, 9));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_b, 10));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_r, 6));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_r, 8));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_r, 9));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_r, 11));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_r, 12));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_r, 13));
		out = _mm_sub_epi16(luma, term);
		_mm_between_epi16(out, floor_val, ceil_val);
		_mm_store_si128(plane_cb_g + n, out);

		/* Blue: luma plus the Cb shift sum; replaces Cr. */
		term = _mm_add_epi16(_mm_srai_epi16(chroma_b, 5), _mm_srai_epi16(chroma_b, 6));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_b, 7));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_b, 11));
		term = _mm_add_epi16(term, _mm_srai_epi16(chroma_b, 13));
		out = _mm_add_epi16(luma, term);
		_mm_between_epi16(out, floor_val, ceil_val);
		_mm_store_si128(plane_cr_b + n, out);
	}
}
Esempio n. 2
0
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point
 * numbers. See the general code above.
 *
 * SSE2 conversion of three planar 16-bit RGB planes to three planar 16-bit
 * YCbCr planes.
 *
 *   pSrc    - source planes: pSrc[0] = R, pSrc[1] = G, pSrc[2] = B
 *   srcStep - distance in bytes between consecutive source rows
 *   pDst    - destination planes: pDst[0] = Y, pDst[1] = Cb, pDst[2] = Cr
 *   dstStep - distance in bytes between consecutive destination rows
 *   roi     - region of interest; width (in samples) and height (in rows)
 *
 * Returns PRIMITIVES_SUCCESS from the SSE2 path.  If any plane pointer is not
 * 16-byte aligned, the width is not a multiple of 8 samples, or either step
 * is not a multiple of 128 bytes, the work is delegated to
 * general_RGBToYCbCr_16s16s_P3P3 and its result is returned instead.
 *
 * Source rows are read from the R/G/B planes and advanced by srcStep;
 * destination rows are advanced by dstStep.  For the in-place case
 * (pSrc == pDst, srcStep == dstStep) every input vector is loaded before
 * the corresponding output vector is stored.
 */
PRIM_STATIC pstatus_t sse2_RGBToYCbCr_16s16s_P3P3(
	const INT16 *pSrc[3],
	int srcStep,
	INT16 *pDst[3],
	int dstStep,
	const prim_size_t *roi)	/* region of interest */
{
	__m128i min, max, y_r, y_g, y_b, cb_r, cb_g, cb_b, cr_r, cr_g, cr_b;
	__m128i *r_buf, *g_buf, *b_buf, *y_buf, *cb_buf, *cr_buf;
	int srcbump, dstbump, yp, imax;

	if (((ULONG_PTR) (pSrc[0]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[1]) & 0x0f)
			|| ((ULONG_PTR) (pSrc[2]) & 0x0f)
			|| ((ULONG_PTR) (pDst[0]) & 0x0f)
			|| ((ULONG_PTR) (pDst[1]) & 0x0f)
			|| ((ULONG_PTR) (pDst[2]) & 0x0f)
			|| (roi->width & 0x07)
			|| (srcStep & 127)
			|| (dstStep & 127))
	{
		/* We can't maintain 16-byte alignment. */
		return general_RGBToYCbCr_16s16s_P3P3(pSrc, srcStep,
			pDst, dstStep, roi);
	}

	/* Output bounds in 11.5 fixed point: -128.0 and +127.0. */
	min = _mm_set1_epi16(-128 << 5);
	max = _mm_set1_epi16(127 << 5);

	r_buf  = (__m128i*) (pSrc[0]);
	g_buf  = (__m128i*) (pSrc[1]);
	b_buf  = (__m128i*) (pSrc[2]);
	y_buf  = (__m128i*) (pDst[0]);
	cb_buf = (__m128i*) (pDst[1]);
	cr_buf = (__m128i*) (pDst[2]);

	y_r  = _mm_set1_epi16(9798);   /*  0.299000 << 15 */
	y_g  = _mm_set1_epi16(19235);  /*  0.587000 << 15 */
	y_b  = _mm_set1_epi16(3735);   /*  0.114000 << 15 */
	cb_r = _mm_set1_epi16(-5535);  /* -0.168935 << 15 */
	cb_g = _mm_set1_epi16(-10868); /* -0.331665 << 15 */
	cb_b = _mm_set1_epi16(16403);  /*  0.500590 << 15 */
	cr_r = _mm_set1_epi16(16377);  /*  0.499813 << 15 */
	cr_g = _mm_set1_epi16(-13714); /* -0.418531 << 15 */
	cr_b = _mm_set1_epi16(-2663);  /* -0.081282 << 15 */

	/* Row strides expressed in 128-bit vectors rather than bytes. */
	srcbump = srcStep / sizeof(__m128i);
	dstbump = dstStep / sizeof(__m128i);

#ifdef DO_PREFETCH
	/* Prefetch RGB's. */
	for (yp=0; yp<roi->height; yp++)
	{
		int i;
		for (i=0; i<roi->width * sizeof(INT16) / sizeof(__m128i);
			i += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(&r_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&g_buf[i]), _MM_HINT_NTA);
			_mm_prefetch((char*)(&b_buf[i]), _MM_HINT_NTA);
		}
		r_buf += srcbump;
		g_buf += srcbump;
		b_buf += srcbump;
	}
	/* Rewind the source pointers for the conversion pass. */
	r_buf = (__m128i*) (pSrc[0]);
	g_buf = (__m128i*) (pSrc[1]);
	b_buf = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */

	/* Number of 128-bit vectors (8 samples each) per row. */
	imax = roi->width * sizeof(INT16) / sizeof(__m128i);
	for (yp=0; yp<roi->height; ++yp)
	{
		int i;
		for (i=0; i<imax; i++)
		{
			/* In order to use SSE2 signed 16-bit integer multiplication we
			 * need to convert the floating point factors to signed int
			 * without losing information.  The result of this multiplication
			 * is 32 bit and using SSE2 we get either the product's hi or lo
			 * word.  Thus we will multiply the factors by the highest
			 * possible 2^n and take the upper 16 bits of the signed 32-bit
			 * result (_mm_mulhi_epi16).  Since the final result needs to
			 * be scaled by << 5 and also in order to keep the precision
			 * within the upper 16 bits we will also have to scale the RGB
			 * values used in the multiplication by << 5+(16-n).
			 */
			__m128i r, g, b, y, cb, cr;
			/* Read all three inputs from the SOURCE planes before any
			 * store, so in-place conversion remains valid. */
			r = _mm_load_si128(r_buf+i);
			g = _mm_load_si128(g_buf+i);
			b = _mm_load_si128(b_buf+i);

			/* r<<6; g<<6; b<<6 */
			r = _mm_slli_epi16(r, 6);
			g = _mm_slli_epi16(g, 6);
			b = _mm_slli_epi16(b, 6);

			/* y = HIWORD(r*y_r) + HIWORD(g*y_g) + HIWORD(b*y_b) + min */
			y = _mm_mulhi_epi16(r, y_r);
			y = _mm_add_epi16(y, _mm_mulhi_epi16(g, y_g));
			y = _mm_add_epi16(y, _mm_mulhi_epi16(b, y_b));
			y = _mm_add_epi16(y, min);
			/* y_buf[i] = MINMAX(y, 0, (255 << 5)) - (128 << 5); */
			_mm_between_epi16(y, min, max);
			_mm_store_si128(y_buf+i, y);

			/* cb = HIWORD(r*cb_r) + HIWORD(g*cb_g) + HIWORD(b*cb_b) */
			cb = _mm_mulhi_epi16(r, cb_r);
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(g, cb_g));
			cb = _mm_add_epi16(cb, _mm_mulhi_epi16(b, cb_b));
			/* cb_buf[i] = MINMAX(cb, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cb, min, max);
			_mm_store_si128(cb_buf+i, cb);

			/* cr = HIWORD(r*cr_r) + HIWORD(g*cr_g) + HIWORD(b*cr_b) */
			cr = _mm_mulhi_epi16(r, cr_r);
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(g, cr_g));
			cr = _mm_add_epi16(cr, _mm_mulhi_epi16(b, cr_b));
			/* cr_buf[i] = MINMAX(cr, (-128 << 5), (127 << 5)); */
			_mm_between_epi16(cr, min, max);
			_mm_store_si128(cr_buf+i, cr);
		}
		/* Y/Cb/Cr are the destination planes, R/G/B the source planes. */
		y_buf  += dstbump;
		cb_buf += dstbump;
		cr_buf += dstbump;
		r_buf += srcbump;
		g_buf += srcbump;
		b_buf += srcbump;
	}

	return PRIMITIVES_SUCCESS;
}
Esempio n. 3
0
/* The encoded YCbCr coefficients are represented as 11.5 fixed-point numbers. See rfx_encode.c */
/*
 * In-place SSE2 conversion of one block of 4096 16-bit samples per plane
 * from RGB to YCbCr.
 *
 *   y_r_buffer  - R on entry, Y on return
 *   cb_g_buffer - G on entry, Cb on return
 *   cr_b_buffer - B on entry, Cr on return
 *
 * The planes are accessed with aligned 128-bit loads and stores, so each
 * buffer must be 16-byte aligned.
 *
 * Per sample:
 *   Y  = ((r << 3) + r + (r >> 1) + (r >> 4) + (r >> 7))
 *      + ((g << 4) + (g << 1) + (g >> 1) + (g >> 2) + (g >> 5))
 *      + ((b << 1) + b + (b >> 1) + (b >> 3) + (b >> 6) + (b >> 7))
 *      + (-128 << 5)
 *   Cb = ((b << 4) + (b >> 6))
 *      - ((r << 2) + r + (r >> 2) + (r >> 3) + (r >> 5))
 *      - ((g << 3) + (g << 1) + (g >> 1) + (g >> 4) + (g >> 5) + (g >> 6))
 *   Cr = ((r << 4) - (r >> 7))
 *      - ((g << 3) + (g << 2) + g + (g >> 2) + (g >> 3) + (g >> 6))
 *      - ((b << 1) + (b >> 1) + (b >> 4) + (b >> 5) + (b >> 7))
 *
 * NOTE(review): _mm_between_epi16 is defined outside this section; it is
 * presumably a macro that clamps its first argument in place to the given
 * bounds (here (-128 << 5)..(127 << 5)) - confirm against its definition.
 */
static void rfx_encode_rgb_to_ycbcr_sse2(sint16* y_r_buffer, sint16* cb_g_buffer, sint16* cr_b_buffer)
{
	__m128i floor_val = _mm_set1_epi16(-128 << 5);
	__m128i ceil_val = _mm_set1_epi16(127 << 5);

	__m128i* plane_y_r = (__m128i*) y_r_buffer;
	__m128i* plane_cb_g = (__m128i*) cb_g_buffer;
	__m128i* plane_cr_b = (__m128i*) cr_b_buffer;

	/* Number of 128-bit vectors per plane, and vectors per cache line. */
	const int vec_count = (int) (4096 * sizeof(sint16) / sizeof(__m128i));
	const int prefetch_stride = (int) (CACHE_LINE_BYTES / sizeof(__m128i));

	__m128i red;
	__m128i green;
	__m128i blue;
	__m128i part;
	__m128i out;

	int n;

	/* Issue one prefetch per cache line of every plane before converting. */
	n = 0;
	while (n < vec_count)
	{
		_mm_prefetch((char*) (plane_y_r + n), _MM_HINT_NTA);
		_mm_prefetch((char*) (plane_cb_g + n), _MM_HINT_NTA);
		_mm_prefetch((char*) (plane_cr_b + n), _MM_HINT_NTA);
		n += prefetch_stride;
	}

	for (n = 0; n < vec_count; n++)
	{
		/* All three inputs are read before any output overwrites them. */
		red = _mm_load_si128(plane_y_r + n);
		green = _mm_load_si128(plane_cb_g + n);
		blue = _mm_load_si128(plane_cr_b + n);

		/* ---- Y: sum of the three per-channel shift sums, then the offset ---- */
		out = _mm_add_epi16(_mm_slli_epi16(red, 3), red);
		out = _mm_add_epi16(out, _mm_srai_epi16(red, 1));
		out = _mm_add_epi16(out, _mm_srai_epi16(red, 4));
		out = _mm_add_epi16(out, _mm_srai_epi16(red, 7));

		part = _mm_add_epi16(_mm_slli_epi16(green, 4), _mm_slli_epi16(green, 1));
		part = _mm_add_epi16(part, _mm_srai_epi16(green, 1));
		part = _mm_add_epi16(part, _mm_srai_epi16(green, 2));
		part = _mm_add_epi16(part, _mm_srai_epi16(green, 5));
		out = _mm_add_epi16(out, part);

		part = _mm_add_epi16(_mm_slli_epi16(blue, 1), blue);
		part = _mm_add_epi16(part, _mm_srai_epi16(blue, 1));
		part = _mm_add_epi16(part, _mm_srai_epi16(blue, 3));
		part = _mm_add_epi16(part, _mm_srai_epi16(blue, 6));
		part = _mm_add_epi16(part, _mm_srai_epi16(blue, 7));
		out = _mm_add_epi16(out, part);

		out = _mm_add_epi16(out, floor_val);
		_mm_between_epi16(out, floor_val, ceil_val);
		_mm_store_si128(plane_y_r + n, out);

		/* ---- Cb: blue sum minus the red and green sums ---- */
		out = _mm_add_epi16(_mm_slli_epi16(blue, 4), _mm_srai_epi16(blue, 6));

		part = _mm_add_epi16(_mm_slli_epi16(red, 2), red);
		part = _mm_add_epi16(part, _mm_srai_epi16(red, 2));
		part = _mm_add_epi16(part, _mm_srai_epi16(red, 3));
		part = _mm_add_epi16(part, _mm_srai_epi16(red, 5));
		out = _mm_sub_epi16(out, part);

		part = _mm_add_epi16(_mm_slli_epi16(green, 3), _mm_slli_epi16(green, 1));
		part = _mm_add_epi16(part, _mm_srai_epi16(green, 1));
		part = _mm_add_epi16(part, _mm_srai_epi16(green, 4));
		part = _mm_add_epi16(part, _mm_srai_epi16(green, 5));
		part = _mm_add_epi16(part, _mm_srai_epi16(green, 6));
		out = _mm_sub_epi16(out, part);

		_mm_between_epi16(out, floor_val, ceil_val);
		_mm_store_si128(plane_cb_g + n, out);

		/* ---- Cr: red term minus the green and blue sums ---- */
		out = _mm_sub_epi16(_mm_slli_epi16(red, 4), _mm_srai_epi16(red, 7));

		part = _mm_add_epi16(_mm_slli_epi16(green, 3), _mm_slli_epi16(green, 2));
		part = _mm_add_epi16(part, green);
		part = _mm_add_epi16(part, _mm_srai_epi16(green, 2));
		part = _mm_add_epi16(part, _mm_srai_epi16(green, 3));
		part = _mm_add_epi16(part, _mm_srai_epi16(green, 6));
		out = _mm_sub_epi16(out, part);

		part = _mm_add_epi16(_mm_slli_epi16(blue, 1), _mm_srai_epi16(blue, 1));
		part = _mm_add_epi16(part, _mm_srai_epi16(blue, 4));
		part = _mm_add_epi16(part, _mm_srai_epi16(blue, 5));
		part = _mm_add_epi16(part, _mm_srai_epi16(blue, 7));
		out = _mm_sub_epi16(out, part);

		_mm_between_epi16(out, floor_val, ceil_val);
		_mm_store_si128(plane_cr_b + n, out);
	}
}
Esempio n. 4
0
/*---------------------------------------------------------------------------*/
/*
 * SSE2 conversion of three planar 16-bit YCbCr planes to three planar 16-bit
 * RGB planes.
 *
 *   pSrc    - source planes: pSrc[0] = Y, pSrc[1] = Cb, pSrc[2] = Cr
 *   srcStep - distance in bytes between consecutive source rows
 *   pDst    - destination planes: pDst[0] = R, pDst[1] = G, pDst[2] = B
 *   dstStep - distance in bytes between consecutive destination rows
 *   roi     - region of interest; width (in samples) and height (in rows)
 *
 * Returns PRIMITIVES_SUCCESS from the SSE2 path.  If any plane pointer is not
 * 16-byte aligned, the width is not a multiple of 8 samples, or either step
 * is not a multiple of 128 bytes, the call is forwarded to
 * general_yCbCrToRGB_16s16s_P3P3 and its result is returned.
 *
 * NOTE(review): _mm_between_epi16 is defined outside this section; it is
 * presumably a macro that clamps its first argument in place to the given
 * bounds (here 0..255) - confirm against its definition.
 */
PRIM_STATIC pstatus_t sse2_yCbCrToRGB_16s16s_P3P3(
	const INT16 *pSrc[3],
	int srcStep,
	INT16 *pDst[3],
	int dstStep,
	const prim_size_t *roi)	/* region of interest */
{
	__m128i lo_limit, hi_limit, k_r_cr, k_g_cb, k_g_cr, k_b_cb, y_bias;
	__m128i *src_y, *src_cb, *src_cr, *dst_r, *dst_g, *dst_b;
	int src_stride, dst_stride, row, row_vecs;
	ULONG_PTR addr_bits;

	/* Fold all six plane addresses together: a set low bit in any of them
	 * survives the OR, so one mask test covers every pointer. */
	addr_bits = (ULONG_PTR) (pSrc[0]) | (ULONG_PTR) (pSrc[1])
		| (ULONG_PTR) (pSrc[2]) | (ULONG_PTR) (pDst[0])
		| (ULONG_PTR) (pDst[1]) | (ULONG_PTR) (pDst[2]);

	if ((addr_bits & 0x0f)
			|| (roi->width & 0x07)
			|| ((srcStep | dstStep) & 127))
	{
		/* Aligned 128-bit access cannot be guaranteed; use the generic path. */
		return general_yCbCrToRGB_16s16s_P3P3(pSrc, srcStep,
			pDst, dstStep, roi);
	}

	lo_limit = _mm_setzero_si128();
	hi_limit = _mm_set1_epi16(255);

	src_y  = (__m128i*) (pSrc[0]);
	src_cb = (__m128i*) (pSrc[1]);
	src_cr = (__m128i*) (pSrc[2]);
	dst_r  = (__m128i*) (pDst[0]);
	dst_g  = (__m128i*) (pDst[1]);
	dst_b  = (__m128i*) (pDst[2]);

	k_r_cr = _mm_set1_epi16(22986);	/*  1.403 << 14 */
	k_g_cb = _mm_set1_epi16(-5636);	/* -0.344 << 14 */
	k_g_cr = _mm_set1_epi16(-11698);	/* -0.714 << 14 */
	k_b_cb = _mm_set1_epi16(28999);	/*  1.770 << 14 */
	y_bias = _mm_set1_epi16(4096);

	/* Row strides expressed in 128-bit vectors rather than bytes. */
	src_stride = srcStep / sizeof(__m128i);
	dst_stride = dstStep / sizeof(__m128i);

#ifdef DO_PREFETCH
	/* Walk every source row once, prefetching one vector per cache line. */
	for (row=0; row<roi->height; row++)
	{
		int col;
		for (col=0; col<roi->width * sizeof(INT16) / sizeof(__m128i);
			col += (CACHE_LINE_BYTES / sizeof(__m128i)))
		{
			_mm_prefetch((char*)(src_y + col),  _MM_HINT_NTA);
			_mm_prefetch((char*)(src_cb + col), _MM_HINT_NTA);
			_mm_prefetch((char*)(src_cr + col), _MM_HINT_NTA);
		}
		src_y  += src_stride;
		src_cb += src_stride;
		src_cr += src_stride;
	}
	/* Rewind the source pointers for the conversion pass. */
	src_y  = (__m128i*) (pSrc[0]);
	src_cb = (__m128i*) (pSrc[1]);
	src_cr = (__m128i*) (pSrc[2]);
#endif /* DO_PREFETCH */

	/* Number of 128-bit vectors (8 samples each) per row. */
	row_vecs = roi->width * sizeof(INT16) / sizeof(__m128i);
	for (row=0; row<roi->height; ++row)
	{
		int col;
		for (col=0; col<row_vecs; col++)
		{
			/* The floating-point matrix factors are pre-scaled by 2^14
			 * (the largest power of two for which they all still fit in a
			 * signed 16-bit lane) and applied with _mm_mulhi_epi16, which
			 * keeps the upper 16 bits of the 32-bit product.  The missing
			 * factor 2^(16-14) is folded into the luma term instead of
			 * being applied to every product:
			 *
			 *   r = (y>>5) + 128 + (cr*1.403)>>5
			 *     = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5
			 *     = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
			 *
			 * G and B follow the same pattern with their own factors.
			 */
			__m128i luma, cb, cr, red, green, blue;

			/* luma = (Y + 4096) >> 2 */
			luma = _mm_srai_epi16(
				_mm_add_epi16(_mm_load_si128(src_y + col), y_bias), 2);
			cb = _mm_load_si128(src_cb + col);
			cr = _mm_load_si128(src_cr + col);

			/* R = (luma + HIWORD(cr*22986)) >> 3, limited to 0..255 */
			red = _mm_srai_epi16(
				_mm_add_epi16(luma, _mm_mulhi_epi16(cr, k_r_cr)), 3);
			_mm_between_epi16(red, lo_limit, hi_limit);
			_mm_store_si128(dst_r + col, red);

			/* G = (luma + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
			green = _mm_add_epi16(_mm_mulhi_epi16(cb, k_g_cb),
				_mm_mulhi_epi16(cr, k_g_cr));
			green = _mm_srai_epi16(_mm_add_epi16(luma, green), 3);
			_mm_between_epi16(green, lo_limit, hi_limit);
			_mm_store_si128(dst_g + col, green);

			/* B = (luma + HIWORD(cb*28999)) >> 3, limited to 0..255 */
			blue = _mm_srai_epi16(
				_mm_add_epi16(luma, _mm_mulhi_epi16(cb, k_b_cb)), 3);
			_mm_between_epi16(blue, lo_limit, hi_limit);
			_mm_store_si128(dst_b + col, blue);
		}

		/* Step every plane to its next row. */
		src_y  += src_stride;
		src_cb += src_stride;
		src_cr += src_stride;
		dst_r += dst_stride;
		dst_g += dst_stride;
		dst_b += dst_stride;
	}

	return PRIMITIVES_SUCCESS;
}