Пример #1
0
OD_SIMD_INLINE __m64 od_load_convert_subtract_x4(const unsigned char *src_p,
 const unsigned char *ref_p) {
  __m64 src_vec;
  __m64 ref_vec;
  src_vec = _mm_cvtsi32_si64(*((uint32_t *)src_p));
  ref_vec = _mm_cvtsi32_si64(*((uint32_t *)ref_p));
  src_vec = _mm_unpacklo_pi8(src_vec, ref_vec);
  ref_vec = _mm_unpacklo_pi8(ref_vec, ref_vec);
  return _mm_sub_pi16(src_vec, ref_vec);
}
Пример #2
0
void DrawAAPMMX(PixelBlock& w, int x, int y, Color c)
{
	if(!Rect(w.GetSize()).Contains(Rect(x, y, x + 6, y + 11)))
		return;
	dword *a = w.PointAdr(x, y);
	int d = w.LineDelta();
	__m64 zero = _mm_setzero_si64();
	__m64 mc = _mm_unpacklo_pi8(_mm_cvtsi32_si64(c.GetRaw()), zero);
	__m64 mask = _mm_set1_pi16(0xff);
	const byte *s = aa_packed;
	dword *t = a;
	__m64 alpha;
	__m64 h;
	__m64 m;
	for(;;) {
		dword c = *s++;
		if(c == 0)
			break;
		t += (c >> 3) & 15;
		switch(c & 7) {
		case 7:
			AAPMMX_(6);
		case 6:
			AAPMMX_(5);
		case 5:
			AAPMMX_(4);
		case 4:
			AAPMMX_(3);
		case 3:
			AAPMMX_(2);
		case 2:
			AAPMMX_(1);
		case 1:
			AAPMMX_(0);
		}
		t += c & 7;
		s += c & 7;
		if(c & 0x80) {
			a += d;
			t = a;
		}
	}
	_mm_empty();
}
Пример #3
0
void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
   const unsigned char
      *u = src + ((y-1) & 7)*sc2lines_width,
      *m = src + ((y+0) & 7)*sc2lines_width,
      *l = src + ((y+1) & 7)*sc2lines_width;

   // process 4pix per iteration
   for (unsigned i = 0; i < nPix; i += 4)
   {
      unsigned dw = *(unsigned*)(m+i);
      __m64 v1 = _mm_cvtsi32_si64(dw); // v1   =     0|    0|    0|    0|dw[3]|dw[2]|dw[1]|dw[0]
      v1 = _mm_unpacklo_pi8(v1,v1);    // v1   = dw[3]|dw[3]|dw[2]|dw[2]|dw[1]|dw[1]|dw[0]|dw[0]
      *(__m64*)(dst1 + 2*i) = v1;      // e0e1 = dw[3]|dw[3]|dw[2]|dw[2]|dw[1]|dw[1]|dw[0]|dw[0]
      *(__m64*)(dst2 + 2*i) = v1;      // e2e3 = dw[3]|dw[3]|dw[2]|dw[2]|dw[1]|dw[1]|dw[0]|dw[0]

      dw = *(unsigned*)(u+i) ^ *(unsigned*)(l+i);
      if (!dw)
          continue; // u == l

   #define process_pix(n)                                       \
      if ((dw & (0xFF << (8*n))) && m[i+n-1] != m[i+n+1])       \
      {                                                         \
         if (u[i+n] == m[i+n-1])                                \
             dst1[2*(i+n)] = u[i+n];                            \
         if (u[i+n] == m[i+n+1])                                \
             dst1[2*(i+n)+1] = u[i+n];                          \
         if (l[i+n] == m[i+n-1])                                \
             dst2[2*(i+n)] = l[i+n];                            \
         if (l[i+n] == m[i+n+1])                                \
             dst2[2*(i+n)+1] = l[i+n];                          \
      }

      process_pix(0);
      process_pix(1);
      process_pix(2);
      process_pix(3);
   #undef process_pix
   }
}
Пример #4
0
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YV12 to RGB16.
VOID Yv12ToRgb16_mmx(PBYTE pbDstX, INT iDstXStride,
					 PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV, INT iSrcYStride, INT iSrcUvStride,
					 UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcYDif;
	INT iSrcUvDif;
	INT yy, bu, guv, rv, r, g, b;

	M64 y0, y1, u0, v0, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbDstX += (iHeight - 1) * iDstXStride;
		iDstXStride = -iDstXStride;
	}

	iDstXDif = iDstXStride - (uWidth * 2);
	iSrcYDif = iSrcYStride - uWidth;
	iSrcUvDif = iSrcUvStride - (uWidth / 2);

	mz = _mm_setzero_si64();
	for (y = iHeight / 2; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			// Calculate coefficient.
			u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU));	// [	| u3 u2 u1 u0]
			v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV));	// [	| v3 v2 v1 v0]

			u0 = _mm_unpacklo_pi8(u0, mz);				// u3 u2 u1 u0
			v0 = _mm_unpacklo_pi8(v0, mz);				// v3 v2 v1 v0
			u0 = _mm_subs_pi16(u0, g_mSub80);
			v0 = _mm_subs_pi16(v0, g_mSub80);

			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);

			guv0 = _mm_adds_pi16(gu0, gv0);

			guv1 = _mm_unpackhi_pi16(guv0, guv0);		// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);		// guv1 guv1 guv0 guv0

			bu1 = _mm_unpackhi_pi16(bu0, bu0);			// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);			// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);			// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);			// rv1 rv1 rv0 rv0

			// Process for row 0.
			y0 = *((PM64) pbSrcY);						// [YYYY | YYYY]; row 0

			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);			// 8 bits (0 - 7)
			b0 = _mm_srai_pi16(b0, SCALEBITS);

			b1 = _mm_packs_pu16(b1, mz);				// 0 0 0 0 b7 b6 b5 b4
			b0 = _mm_packs_pu16(b0, mz);				// 0 0 0 0 b3 b2 b1 b0
			b1 = _mm_unpacklo_pi8(b1, mz);				// 0 b7 0b 6 0 b5 0 b4
			b0 = _mm_unpacklo_pi8(b0, mz);

			b1 = _mm_srli_pi16(b1, 3);
			b0 = _mm_srli_pi16(b0, 3);					// 5 bits (0 - 4)

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);			// 8 bits (0 - 7)
			g0 = _mm_srai_pi16(g0, SCALEBITS);

			g1 = _mm_packs_pu16(g1, mz);				// 0 0 0 0 g7 g6 g5 g4
			g0 = _mm_packs_pu16(g0, mz);				// 0 0 0 0 g3 g2 g1 g0
			g1 = _mm_unpacklo_pi8(g1, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);

			g1 = _mm_srli_pi16(g1, 2);					// 6 bits (0 - 5)
			g0 = _mm_srli_pi16(g0, 2);
			g1 = _mm_slli_pi16(g1, 5);					// 6 bits (5 - 10)
			g0 = _mm_slli_pi16(g0, 5);					// 6 bits (5 - 10)

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);

			r1 = _mm_packs_pu16(r1, mz);				// 0 0 0 0 r7 r6 r5 r4
			r0 = _mm_packs_pu16(r0, mz);				// 0 0 0 0 r3 r2 r1 r0
			r1 = _mm_unpacklo_pi8(r1, mz);				// 0 r7 0 r6 0 r5 0 r4
			r0 = _mm_unpacklo_pi8(r0, mz);

			r1 = _mm_srli_pi16(r1, 3);					// 5 bits (0 - 4)
			r0 = _mm_srli_pi16(r0, 3);
			r1 = _mm_slli_pi16(r1, 11);					// 5 bits (11 - 15)
			r0 = _mm_slli_pi16(r0, 11);					// 5 bits (11 - 15)

			// Combine RGB.
			b0 = _mm_or_si64(g0, b0);
			b0 = _mm_or_si64(r0, b0);					// 16 bits rgb
			b1 = _mm_or_si64(g1, b1);
			b1 = _mm_or_si64(r1, b1);					// 16 bits rgb

			// Write out row 0.
			((PM64) pbDstX)[0] = b0;
			((PM64) pbDstX)[1] = b1;

			// Process for row 1.
			y0 = *((PM64) (pbSrcY + iSrcYStride));		// [YYYY | YYYY]; row 0
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);			// 8 bits (0 - 7)
			b0 = _mm_srai_pi16(b0, SCALEBITS);

			b1 = _mm_packs_pu16(b1, mz);				// 0 0 0 0 b7 b6 b5 b4
			b0 = _mm_packs_pu16(b0, mz);				// 0 0 0 0 b3 b2 b1 b0
			b1 = _mm_unpacklo_pi8(b1, mz);				// 0 b7 0b 6 0 b5 0 b4
			b0 = _mm_unpacklo_pi8(b0, mz);

			b1 = _mm_srli_pi16(b1, 3);
			b0 = _mm_srli_pi16(b0, 3);					// 5 bits (0 - 4)

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);			// 8 bits (0 - 7)
			g0 = _mm_srai_pi16(g0, SCALEBITS);

			g1 = _mm_packs_pu16(g1, mz);				// 0 0 0 0 g7 g6 g5 g4
			g0 = _mm_packs_pu16(g0, mz);				// 0 0 0 0 g3 g2 g1 g0
			g1 = _mm_unpacklo_pi8(g1, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);

			g1 = _mm_srli_pi16(g1, 2);					// 6 bits (0 - 5)
			g0 = _mm_srli_pi16(g0, 2);
			g1 = _mm_slli_pi16(g1, 5);					// 6 bits (5 - 10)
			g0 = _mm_slli_pi16(g0, 5);					// 6 bits (5 - 10)

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);

			r1 = _mm_packs_pu16(r1, mz);				// 0 0 0 0 r7 r6 r5 r4
			r0 = _mm_packs_pu16(r0, mz);				// 0 0 0 0 r3 r2 r1 r0
			r1 = _mm_unpacklo_pi8(r1, mz);				// 0 r7 0 r6 0 r5 0 r4
			r0 = _mm_unpacklo_pi8(r0, mz);

			r1 = _mm_srli_pi16(r1, 3);					// 5 bits (0 - 4)
			r0 = _mm_srli_pi16(r0, 3);
			r1 = _mm_slli_pi16(r1, 11);					// 5 bits (11 - 15)
			r0 = _mm_slli_pi16(r0, 11);					// 5 bits (11 - 15)

			// Combine RGB.
			b0 = _mm_or_si64(g0, b0);
			b0 = _mm_or_si64(r0, b0);					// 16 bits rgb
			b1 = _mm_or_si64(g1, b1);
			b1 = _mm_or_si64(r1, b1);					// 16 bits rgb

			// Write out row 1.
			((PM64) (pbDstX + iDstXStride))[0] = b0;
			((PM64) (pbDstX + iDstXStride))[1] = b1;

			pbDstX += 16;
			pbSrcY += 8;
			pbSrcU += 4;
			pbSrcV += 4;
		}

		for (x = (uWidth & 7) / 2; x; x--)
		{
			bu = g_iBUTab[pbSrcU[0]];
			guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]];
			rv = g_iRVTab[pbSrcV[0]];

			yy = g_iYYTab[pbSrcY[0]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) pbDstX)[0] = _MakeRgb16(r, g, b);

			yy = g_iYYTab[pbSrcY[1]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) pbDstX)[1] = _MakeRgb16(r, g, b);

			yy = g_iYYTab[pbSrcY[iSrcYStride]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) (pbDstX + iDstXStride))[0] = _MakeRgb16(r, g, b);

			yy = g_iYYTab[pbSrcY[iSrcYStride + 1]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) (pbDstX + iDstXStride))[1] = _MakeRgb16(r, g, b);

			pbDstX += 4;
			pbSrcY += 2;
			pbSrcU++;
			pbSrcV++;
		}

		pbDstX += iDstXDif + iDstXStride;
		pbSrcY += iSrcYDif + iSrcYStride;
		pbSrcU += iSrcUvDif;
		pbSrcV += iSrcUvDif;
	}

	_mm_empty();
}
Пример #5
0
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YV12 to RGB24.
VOID Yv12ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride,
					 PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV, INT iSrcYStride, INT iSrcUvStride,
					 UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcYDif;
	INT iSrcUvDif;
	INT yy, bu, guv, rv;

	M64 y0, y1, u0, v0, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 rgb0, rgb1, rgb2, rgb3;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbDstX += (iHeight - 1) * iDstXStride;
		iDstXStride = -iDstXStride;
	}

	iDstXDif = iDstXStride - (uWidth * 3);
	iSrcYDif = iSrcYStride - uWidth;
	iSrcUvDif = iSrcUvStride - (uWidth / 2);

	mz = _mm_setzero_si64();
	for (y = iHeight / 2; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			// Calculate coefficient.
			u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU));	// [	| u3 u2 u1 u0]
			v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV));	// [	| v3 v2 v1 v0]

			u0 = _mm_unpacklo_pi8(u0, mz);				// u3 u2 u1 u0
			v0 = _mm_unpacklo_pi8(v0, mz);				// v3 v2 v1 v0
			u0 = _mm_subs_pi16(u0, g_mSub80);
			v0 = _mm_subs_pi16(v0, g_mSub80);

			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);

			guv0 = _mm_adds_pi16(gu0, gv0);

			guv1 = _mm_unpackhi_pi16(guv0, guv0);		// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);		// guv1 guv1 guv0 guv0

			bu1 = _mm_unpackhi_pi16(bu0, bu0);			// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);			// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);			// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);			// rv1 rv1 rv0 rv0

			// Process for row 0.
			y0 = *((PM64) pbSrcY);						// [YYYY | YYYY]; row 0
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);				// g7 g6 ...g1 g0

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);

			r1 = _mm_unpackhi_pi8(b0, r0);				// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);				// r3 b3 r2 b2 r1 b1 r0 b0

			g1 = _mm_unpackhi_pi8(g0, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);				// 0 g3 0 g2 0 g1 0 g0

			rgb0 = _mm_unpacklo_pi8(r0, g0);			// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);			// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);			// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);			// 0 r7 g7 b7 0 r6 g6 b6

			// Write out row 0.
			*((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3);

			// Process for row 1.
			y0 = *((PM64) (pbSrcY + iSrcYStride));		// [YYYY | YYYY]; row 1
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);				// g7 g6 ...g1 g0

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);

			r1 = _mm_unpackhi_pi8(b0, r0);				// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);				// r3 b3 r2 b2 r1 b1 r0 b0

			g1 = _mm_unpackhi_pi8(g0, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);				// 0 g3 0 g2 0 g1 0 g0

			rgb0 = _mm_unpacklo_pi8(r0, g0);			// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);			// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);			// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);			// 0 r7 g7 b7 0 r6 g6 b6

			// Write out row 1.
			*((PDWORD) (pbDstX + iDstXStride + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + iDstXStride + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + iDstXStride + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + iDstXStride + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + iDstXStride + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + iDstXStride + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + iDstXStride + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + iDstXStride + 21)) = _mm_cvtsi64_si32(rgb3);

			pbDstX += 24;
			pbSrcY += 8;
			pbSrcU += 4;
			pbSrcV += 4;
		}

		for (x = (uWidth & 7) / 2; x; x--)
		{
			bu = g_iBUTab[pbSrcU[0]];
			guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]];
			rv = g_iRVTab[pbSrcV[0]];

			yy = g_iYYTab[pbSrcY[0]];
			pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcY[1]];
			pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcY[iSrcYStride]];
			pbDstX[iDstXStride + 0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 2] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcY[iSrcYStride + 1]];
			pbDstX[iDstXStride + 3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 5] = _Clip((yy + rv) >> SCALEBITS_OUT);

			pbDstX += 6;
			pbSrcY += 2;
			pbSrcU++;
			pbSrcV++;
		}

		pbDstX += iDstXDif + iDstXStride;
		pbSrcY += iSrcYDif + iSrcYStride;
		pbSrcU += iSrcUvDif;
		pbSrcV += iSrcUvDif;
	}

	_mm_empty();
}
Пример #6
0
 static inline m64 _load_alpha(uint x, const m64 &mmx_0x0000) {
     m64 t = _mm_unpacklo_pi8(_mm_cvtsi32_si64(x), mmx_0x0000);
     return _mm_shuffle_pi16(t, _MM_SHUFFLE(0, 0, 0, 0));
 }
Пример #7
0
__m64 test_mm_cvtsi32_si64(int a) {
  // CHECK-LABEL: test_mm_cvtsi32_si64
  // CHECK: insertelement <2 x i32>
  return _mm_cvtsi32_si64(a);
}