Example 1
int32_t od_mc_compute_satd8_4x4_sse2(const unsigned char *src, int systride,
 const unsigned char *ref, int rystride) {
  int32_t satd;
  __m64 sums;
  __m64 a;
  __m64 b;
  __m64 c;
  __m64 d;
  a = od_load_convert_subtract_x4(src + 0*systride, ref + 0*rystride);
  b = od_load_convert_subtract_x4(src + 1*systride, ref + 1*rystride);
  c = od_load_convert_subtract_x4(src + 2*systride, ref + 2*rystride);
  d = od_load_convert_subtract_x4(src + 3*systride, ref + 3*rystride);
  /*Vertical 1D transform.*/
  od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
  od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
  od_transpose16x4(&a, &b, &c, &d);
  /*Horizontal 1D transform.*/
  od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
  /*Use the fact that (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) to merge the
     final butterfly stage with the calculation of the absolute values and the
     first stage of accumulation.
    Calculates (abs(a+b)+abs(a-b))/2-0x7FFF.
    An offset must be added to the final sum before rounding to account for
     subtracting 0x7FFF.*/
  a = _mm_sub_pi16(_mm_max_pi16(a, b), _mm_adds_pi16(_mm_add_pi16(a, b),
   _mm_set1_pi16(0x7FFF)));
  c = _mm_sub_pi16(_mm_max_pi16(c, d), _mm_adds_pi16(_mm_add_pi16(c, d),
   _mm_set1_pi16(0x7FFF)));
  /*Take the sum of all the absolute values.*/
  sums = _mm_add_pi16(a, c);
  /*Sum the elements of the vector.*/
  sums = _mm_add_pi16(sums, _mm_shuffle_pi16(sums, _MM_SHUFFLE(0, 1, 2, 3)));
  sums = _mm_add_pi16(sums, _mm_shuffle_pi16(sums, _MM_SHUFFLE(2, 3, 0, 1)));
  sums = _mm_unpacklo_pi16(sums, _mm_setzero_si64());
  satd = _mm_cvtsi64_si32(sums);
  /*Subtract the offset (8) and round.*/
  satd = (satd + 1 - 8) >> 1;
#if defined(OD_CHECKASM)
  {
    int32_t c_satd;
    c_satd = od_mc_compute_satd8_4x4_c(src, systride, ref, rystride);
    if (satd != c_satd) {
      fprintf(stderr, "od_mc_compute_satd %ix%i check failed: %i!=%i\n",
       4, 4, satd, c_satd);
    }
  }
#endif
  return satd;
}
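
The merged butterfly above relies on the identity (|a + b| + |a - b|)/2 = max(|a|, |b|). A minimal scalar sketch of that identity (a hypothetical helper, not part of the original source):

#include <assert.h>
#include <stdlib.h>

/*Hypothetical scalar check of the identity used above:
   (abs(a + b) + abs(a - b))/2 == max(abs(a), abs(b)).*/
static int max_abs_sketch(int a, int b) {
  return (abs(a + b) + abs(a - b)) >> 1;
}

/*E.g. max_abs_sketch(-3, 7) == 7, the same as taking abs() then max().*/
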
Example 2
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YV12 to RGB16.
VOID Yv12ToRgb16_mmx(PBYTE pbDstX, INT iDstXStride,
					 PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV, INT iSrcYStride, INT iSrcUvStride,
					 UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcYDif;
	INT iSrcUvDif;
	INT yy, bu, guv, rv, r, g, b;

	M64 y0, y1, u0, v0, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbDstX += (iHeight - 1) * iDstXStride;
		iDstXStride = -iDstXStride;
	}

	iDstXDif = iDstXStride - (uWidth * 2);
	iSrcYDif = iSrcYStride - uWidth;
	iSrcUvDif = iSrcUvStride - (uWidth / 2);

	mz = _mm_setzero_si64();
	for (y = iHeight / 2; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			// Calculate coefficient.
			u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU));	// [	| u3 u2 u1 u0]
			v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV));	// [	| v3 v2 v1 v0]

			u0 = _mm_unpacklo_pi8(u0, mz);				// u3 u2 u1 u0
			v0 = _mm_unpacklo_pi8(v0, mz);				// v3 v2 v1 v0
			u0 = _mm_subs_pi16(u0, g_mSub80);
			v0 = _mm_subs_pi16(v0, g_mSub80);

			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);

			guv0 = _mm_adds_pi16(gu0, gv0);

			guv1 = _mm_unpackhi_pi16(guv0, guv0);		// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);		// guv1 guv1 guv0 guv0

			bu1 = _mm_unpackhi_pi16(bu0, bu0);			// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);			// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);			// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);			// rv1 rv1 rv0 rv0

			// Process for row 0.
			y0 = *((PM64) pbSrcY);						// [YYYY | YYYY]; row 0

			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);			// 8 bits (0 - 7)
			b0 = _mm_srai_pi16(b0, SCALEBITS);

			b1 = _mm_packs_pu16(b1, mz);				// 0 0 0 0 b7 b6 b5 b4
			b0 = _mm_packs_pu16(b0, mz);				// 0 0 0 0 b3 b2 b1 b0
			b1 = _mm_unpacklo_pi8(b1, mz);				// 0 b7 0 b6 0 b5 0 b4
			b0 = _mm_unpacklo_pi8(b0, mz);

			b1 = _mm_srli_pi16(b1, 3);
			b0 = _mm_srli_pi16(b0, 3);					// 5 bits (0 - 4)

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);			// 8 bits (0 - 7)
			g0 = _mm_srai_pi16(g0, SCALEBITS);

			g1 = _mm_packs_pu16(g1, mz);				// 0 0 0 0 g7 g6 g5 g4
			g0 = _mm_packs_pu16(g0, mz);				// 0 0 0 0 g3 g2 g1 g0
			g1 = _mm_unpacklo_pi8(g1, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);

			g1 = _mm_srli_pi16(g1, 2);					// 6 bits (0 - 5)
			g0 = _mm_srli_pi16(g0, 2);
			g1 = _mm_slli_pi16(g1, 5);					// 6 bits (5 - 10)
			g0 = _mm_slli_pi16(g0, 5);					// 6 bits (5 - 10)

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);

			r1 = _mm_packs_pu16(r1, mz);				// 0 0 0 0 r7 r6 r5 r4
			r0 = _mm_packs_pu16(r0, mz);				// 0 0 0 0 r3 r2 r1 r0
			r1 = _mm_unpacklo_pi8(r1, mz);				// 0 r7 0 r6 0 r5 0 r4
			r0 = _mm_unpacklo_pi8(r0, mz);

			r1 = _mm_srli_pi16(r1, 3);					// 5 bits (0 - 4)
			r0 = _mm_srli_pi16(r0, 3);
			r1 = _mm_slli_pi16(r1, 11);					// 5 bits (11 - 15)
			r0 = _mm_slli_pi16(r0, 11);					// 5 bits (11 - 15)

			// Combine RGB.
			b0 = _mm_or_si64(g0, b0);
			b0 = _mm_or_si64(r0, b0);					// 16 bits rgb
			b1 = _mm_or_si64(g1, b1);
			b1 = _mm_or_si64(r1, b1);					// 16 bits rgb

			// Write out row 0.
			((PM64) pbDstX)[0] = b0;
			((PM64) pbDstX)[1] = b1;

			// Process for row 1.
			y0 = *((PM64) (pbSrcY + iSrcYStride));		// [YYYY | YYYY]; row 1
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);			// 8 bits (0 - 7)
			b0 = _mm_srai_pi16(b0, SCALEBITS);

			b1 = _mm_packs_pu16(b1, mz);				// 0 0 0 0 b7 b6 b5 b4
			b0 = _mm_packs_pu16(b0, mz);				// 0 0 0 0 b3 b2 b1 b0
			b1 = _mm_unpacklo_pi8(b1, mz);				// 0 b7 0 b6 0 b5 0 b4
			b0 = _mm_unpacklo_pi8(b0, mz);

			b1 = _mm_srli_pi16(b1, 3);
			b0 = _mm_srli_pi16(b0, 3);					// 5 bits (0 - 4)

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);			// 8 bits (0 - 7)
			g0 = _mm_srai_pi16(g0, SCALEBITS);

			g1 = _mm_packs_pu16(g1, mz);				// 0 0 0 0 g7 g6 g5 g4
			g0 = _mm_packs_pu16(g0, mz);				// 0 0 0 0 g3 g2 g1 g0
			g1 = _mm_unpacklo_pi8(g1, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);

			g1 = _mm_srli_pi16(g1, 2);					// 6 bits (0 - 5)
			g0 = _mm_srli_pi16(g0, 2);
			g1 = _mm_slli_pi16(g1, 5);					// 6 bits (5 - 10)
			g0 = _mm_slli_pi16(g0, 5);					// 6 bits (5 - 10)

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);

			r1 = _mm_packs_pu16(r1, mz);				// 0 0 0 0 r7 r6 r5 r4
			r0 = _mm_packs_pu16(r0, mz);				// 0 0 0 0 r3 r2 r1 r0
			r1 = _mm_unpacklo_pi8(r1, mz);				// 0 r7 0 r6 0 r5 0 r4
			r0 = _mm_unpacklo_pi8(r0, mz);

			r1 = _mm_srli_pi16(r1, 3);					// 5 bits (0 - 4)
			r0 = _mm_srli_pi16(r0, 3);
			r1 = _mm_slli_pi16(r1, 11);					// 5 bits (11 - 15)
			r0 = _mm_slli_pi16(r0, 11);					// 5 bits (11 - 15)

			// Combine RGB.
			b0 = _mm_or_si64(g0, b0);
			b0 = _mm_or_si64(r0, b0);					// 16 bits rgb
			b1 = _mm_or_si64(g1, b1);
			b1 = _mm_or_si64(r1, b1);					// 16 bits rgb

			// Write out row 1.
			((PM64) (pbDstX + iDstXStride))[0] = b0;
			((PM64) (pbDstX + iDstXStride))[1] = b1;

			pbDstX += 16;
			pbSrcY += 8;
			pbSrcU += 4;
			pbSrcV += 4;
		}

		for (x = (uWidth & 7) / 2; x; x--)
		{
			bu = g_iBUTab[pbSrcU[0]];
			guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]];
			rv = g_iRVTab[pbSrcV[0]];

			yy = g_iYYTab[pbSrcY[0]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) pbDstX)[0] = _MakeRgb16(r, g, b);

			yy = g_iYYTab[pbSrcY[1]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) pbDstX)[1] = _MakeRgb16(r, g, b);

			yy = g_iYYTab[pbSrcY[iSrcYStride]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) (pbDstX + iDstXStride))[0] = _MakeRgb16(r, g, b);

			yy = g_iYYTab[pbSrcY[iSrcYStride + 1]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) (pbDstX + iDstXStride))[1] = _MakeRgb16(r, g, b);

			pbDstX += 4;
			pbSrcY += 2;
			pbSrcU++;
			pbSrcV++;
		}

		pbDstX += iDstXDif + iDstXStride;
		pbSrcY += iSrcYDif + iSrcYStride;
		pbSrcU += iSrcUvDif;
		pbSrcV += iSrcUvDif;
	}

	_mm_empty();
}
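
After the shifts above, red occupies bits 11-15, green bits 5-10, and blue bits 0-4, i.e. RGB565. A plausible scalar equivalent of the _MakeRgb16 macro used in the tail loop (an assumption; the real macro is defined elsewhere in the original source):

// Assumed definition (illustrative only): pack clipped 8-bit channels
// as RGB565; the original macro lives elsewhere in the source.
#define _MakeRgb16(r, g, b) \
	((WORD) ((((r) >> 3) << 11) | (((g) >> 2) << 5) | ((b) >> 3)))
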
Example 3
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YV12 to RGB24.
VOID Yv12ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride,
					 PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV, INT iSrcYStride, INT iSrcUvStride,
					 UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcYDif;
	INT iSrcUvDif;
	INT yy, bu, guv, rv;

	M64 y0, y1, u0, v0, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 rgb0, rgb1, rgb2, rgb3;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbDstX += (iHeight - 1) * iDstXStride;
		iDstXStride = -iDstXStride;
	}

	iDstXDif = iDstXStride - (uWidth * 3);
	iSrcYDif = iSrcYStride - uWidth;
	iSrcUvDif = iSrcUvStride - (uWidth / 2);

	mz = _mm_setzero_si64();
	for (y = iHeight / 2; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			// Calculate coefficient.
			u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU));	// [	| u3 u2 u1 u0]
			v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV));	// [	| v3 v2 v1 v0]

			u0 = _mm_unpacklo_pi8(u0, mz);				// u3 u2 u1 u0
			v0 = _mm_unpacklo_pi8(v0, mz);				// v3 v2 v1 v0
			u0 = _mm_subs_pi16(u0, g_mSub80);
			v0 = _mm_subs_pi16(v0, g_mSub80);

			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);

			guv0 = _mm_adds_pi16(gu0, gv0);

			guv1 = _mm_unpackhi_pi16(guv0, guv0);		// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);		// guv1 guv1 guv0 guv0

			bu1 = _mm_unpackhi_pi16(bu0, bu0);			// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);			// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);			// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);			// rv1 rv1 rv0 rv0

			// Process for row 0.
			y0 = *((PM64) pbSrcY);						// [YYYY | YYYY]; row 0
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);				// g7 g6 ...g1 g0

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);

			r1 = _mm_unpackhi_pi8(b0, r0);				// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);				// r3 b3 r2 b2 r1 b1 r0 b0

			g1 = _mm_unpackhi_pi8(g0, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);				// 0 g3 0 g2 0 g1 0 g0

			rgb0 = _mm_unpacklo_pi8(r0, g0);			// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);			// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);			// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);			// 0 r7 g7 b7 0 r6 g6 b6

			// Write out row 0.
			*((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3);

			// Process for row 1.
			y0 = *((PM64) (pbSrcY + iSrcYStride));		// [YYYY | YYYY]; row 1
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);				// g7 g6 ...g1 g0

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);

			r1 = _mm_unpackhi_pi8(b0, r0);				// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);				// r3 b3 r2 b2 r1 b1 r0 b0

			g1 = _mm_unpackhi_pi8(g0, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);				// 0 g3 0 g2 0 g1 0 g0

			rgb0 = _mm_unpacklo_pi8(r0, g0);			// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);			// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);			// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);			// 0 r7 g7 b7 0 r6 g6 b6

			// Write out row 1.
			*((PDWORD) (pbDstX + iDstXStride + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + iDstXStride + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + iDstXStride + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + iDstXStride + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + iDstXStride + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + iDstXStride + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + iDstXStride + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + iDstXStride + 21)) = _mm_cvtsi64_si32(rgb3);

			pbDstX += 24;
			pbSrcY += 8;
			pbSrcU += 4;
			pbSrcV += 4;
		}

		for (x = (uWidth & 7) / 2; x; x--)
		{
			bu = g_iBUTab[pbSrcU[0]];
			guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]];
			rv = g_iRVTab[pbSrcV[0]];

			yy = g_iYYTab[pbSrcY[0]];
			pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcY[1]];
			pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcY[iSrcYStride]];
			pbDstX[iDstXStride + 0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 2] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcY[iSrcYStride + 1]];
			pbDstX[iDstXStride + 3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 5] = _Clip((yy + rv) >> SCALEBITS_OUT);

			pbDstX += 6;
			pbSrcY += 2;
			pbSrcU++;
			pbSrcV++;
		}

		pbDstX += iDstXDif + iDstXStride;
		pbSrcY += iSrcYDif + iSrcYStride;
		pbSrcU += iSrcUvDif;
		pbSrcV += iSrcUvDif;
	}

	_mm_empty();
}
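
The MMX body and the scalar tail loop above compute the same fixed-point conversion. A per-pixel scalar sketch, assuming (per the constants' names) that g_mSub10 holds the Y black level 16, g_mSub80 the chroma bias 128, and that the g_*Tab tables hold the matching fixed-point products:

// Scalar sketch of one YV12 pixel, mirroring the tail loop above.
VOID Yv12PixelToRgb24(BYTE y, BYTE u, BYTE v, PBYTE pbDst)
{
	INT yy  = g_iYYTab[y];					// (y - 16) * luma gain
	INT bu  = g_iBUTab[u];					// (u - 128) * blue gain
	INT guv = g_iGUTab[u] + g_iGVTab[v];	// green correction
	INT rv  = g_iRVTab[v];					// (v - 128) * red gain

	pbDst[0] = _Clip((yy + bu) >> SCALEBITS_OUT);	// B
	pbDst[1] = _Clip((yy - guv) >> SCALEBITS_OUT);	// G
	pbDst[2] = _Clip((yy + rv) >> SCALEBITS_OUT);	// R
}
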
Example 4
void rtv_lucent4cols_MMX(byte *source, argb_t *dest, int bga, int fga)
{
	// MMX temporaries:
	const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff);
	const __m64 fgAlpha = _mm_set_pi16(0, fga, fga, fga);
	const __m64 bgAlpha = _mm_set_pi16(0, bga, bga, bga);

#if 1
	const __m64 bgColors01 = _mm_setr_pi32(dest[0], dest[1]);
#else
	const __m64 bgColors01 = *((__m64 *)&dest[0]);
#endif
	const __m64 fgColors01 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[0]),
		rt_mapcolor<argb_t>(dcol.colormap, source[1])
	);

	const __m64 finalColors01 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		)
	);

#if 1
	const __m64 bgColors23 = _mm_setr_pi32(dest[2], dest[3]);
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	const __m64 bgColors23 = *((__m64 *)&dest[2]);
#endif
	const __m64 fgColors23 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[2]),
		rt_mapcolor<argb_t>(dcol.colormap, source[3])
	);

	const __m64 finalColors23 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		)
	);
	
#if 1
	dest[0] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*0));
	dest[1] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*1));
	dest[2] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*0));
	dest[3] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*1));
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	*((__m64 *)&dest[0]) = finalColors01;
	*((__m64 *)&dest[2]) = finalColors23;
#endif

	// Required to reset FP:
	_mm_empty();
}
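
Per channel, the blend above computes dest = (bg*bga + fg*fga) >> 8; the MMX version does the multiplies in 16-bit lanes and saturates the sum. A scalar model of one channel (hypothetical helper, assuming weights small enough that the 16-bit products do not overflow, as the MMX path also requires):

// Scalar model of one blended channel. bg/fg are 8-bit channel values,
// bga/fga the blend weights; hypothetical, for illustration only.
static inline int blend_channel(int bg, int fg, int bga, int fga)
{
	int sum = bg * bga + fg * fga;
	if (sum > 32767)
		sum = 32767;	// _mm_adds_pi16 saturates the 16-bit sum
	return sum >> 8;	// _mm_srli_pi16, then _mm_packs_pu16
}
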
Example 5
__m64 test38(__m64 a, __m64 b) {
  // CHECK: paddsw
  return _mm_adds_pi16(a, b);
}
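
paddsw adds each pair of 16-bit lanes with signed saturation: a result that would overflow clamps to 32767 or -32768 instead of wrapping. A scalar model of one lane (hypothetical helper):

#include <stdint.h>

static int16_t adds_pi16_lane(int16_t a, int16_t b) {
  int32_t s = (int32_t) a + b;
  if (s > INT16_MAX) s = INT16_MAX;  // saturate high
  if (s < INT16_MIN) s = INT16_MIN;  // saturate low
  return (int16_t) s;
}
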
Example 6
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YUY2 to RGB24.
VOID Yuy2ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcX, INT iSrcXStride, UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcXDif;
	INT yy, bu, guv, rv;

	M64 y0, y1, u0, v0, uv_temp1, uv_temp2, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 rgb0, rgb1, rgb2, rgb3;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbSrcX += (iHeight - 1) * iSrcXStride;
		iSrcXStride = -iSrcXStride;
	}

	iDstXDif = iDstXStride - (uWidth * 3);
	iSrcXDif = iSrcXStride - (uWidth * 2);

	mz = _mm_setzero_si64();
	for (y = iHeight; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			y0 = ((PM64) pbSrcX)[0];
			y1 = ((PM64) pbSrcX)[1];

			u0 = y0;
			v0 = y1;

			y0 = _mm_and_si64(y0, g_mWord00FF);
			y1 = _mm_and_si64(y1, g_mWord00FF);

			u0 = _mm_srli_pi16(u0, 8);
			v0 = _mm_srli_pi16(v0, 8);

			uv_temp1 = _mm_srli_pi32(u0, 16);
			u0 = _mm_slli_pi32(u0, 16);
			u0 = _mm_srli_pi32(u0, 16);

			uv_temp2 = _mm_srli_pi32(v0, 16);
			v0 = _mm_slli_pi32(v0, 16);
			v0 = _mm_srli_pi32(v0, 16);

			u0 = _mm_packs_pi32(u0, v0);
			v0 = _mm_packs_pi32(uv_temp1, uv_temp2);
			// Calculate coefficient.
			u0 = _mm_subs_pi16(u0, g_mSub80);
			v0 = _mm_subs_pi16(v0, g_mSub80);

			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);

			guv0 = _mm_adds_pi16(gu0, gv0);

			guv1 = _mm_unpackhi_pi16(guv0, guv0);		// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);		// guv1 guv1 guv0 guv0

			bu1 = _mm_unpackhi_pi16(bu0, bu0);			// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);			// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);			// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);			// rv1 rv1 rv0 rv0

			// Process the row.
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);				// g7 g6 ...g1 g0

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);

			r1 = _mm_unpackhi_pi8(b0, r0);				// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);				// r3 b3 r2 b2 r1 b1 r0 b0

			g1 = _mm_unpackhi_pi8(g0, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);				// 0 g3 0 g2 0 g1 0 g0

			rgb0 = _mm_unpacklo_pi8(r0, g0);			// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);			// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);			// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);			// 0 r7 g7 b7 0 r6 g6 b6

			// Write out the row.
			*((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3);

			pbDstX += 24;
			pbSrcX += 16;
		}

		for (x = (uWidth & 7) / 2; x; x--)
		{
			bu = g_iBUTab[pbSrcX[1]];
			guv = g_iGUTab[pbSrcX[1]] + g_iGVTab[pbSrcX[3]];
			rv = g_iRVTab[pbSrcX[3]];

			yy = g_iYYTab[pbSrcX[0]];
			pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcX[2]];
			pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT);

			pbDstX += 6;
			pbSrcX += 4;
		}

		pbDstX += iDstXDif;
		pbSrcX += iSrcXDif;
	}

	_mm_empty();
}
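
The mask/shift/pack sequence at the top of the inner loop deinterleaves packed YUY2, whose byte order the tail loop makes explicit: each 4-byte macropixel holds two pixels sharing one U/V pair. A scalar view of one macropixel (illustrative only):

// One YUY2 macropixel = 4 bytes = 2 pixels sharing a U/V pair.
BYTE yEven = pbSrcX[0];	// Y of the even pixel
BYTE u     = pbSrcX[1];	// U, shared by both pixels
BYTE yOdd  = pbSrcX[2];	// Y of the odd pixel
BYTE v     = pbSrcX[3];	// V, shared by both pixels
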
Example 7
void AudioMixer::addBufferToMixForListeningNodeWithBuffer(PositionalAudioRingBuffer* bufferToAdd,
                                                          AvatarAudioRingBuffer* listeningNodeBuffer) {
    float bearingRelativeAngleToSource = 0.0f;
    float attenuationCoefficient = 1.0f;
    int numSamplesDelay = 0;
    float weakChannelAmplitudeRatio = 1.0f;
    
    if (bufferToAdd != listeningNodeBuffer) {
        // if the two buffer pointers do not match then these are different buffers

        glm::vec3 relativePosition = bufferToAdd->getPosition() - listeningNodeBuffer->getPosition();
        glm::quat inverseOrientation = glm::inverse(listeningNodeBuffer->getOrientation());

        float distanceSquareToSource = glm::dot(relativePosition, relativePosition);
        float radius = 0.0f;

        if (bufferToAdd->getType() == PositionalAudioRingBuffer::Injector) {
            InjectedAudioRingBuffer* injectedBuffer = (InjectedAudioRingBuffer*) bufferToAdd;
            radius = injectedBuffer->getRadius();
            attenuationCoefficient *= injectedBuffer->getAttenuationRatio();
        }

        if (radius == 0 || (distanceSquareToSource > radius * radius)) {
            // this is either not a spherical source, or the listener is outside the sphere

            if (radius > 0) {
                // this is a spherical source - the distance used for the coefficient
                // needs to be the closest point on the boundary to the source

                // override the distance to the node with the distance to the point on the
                // boundary of the sphere
                distanceSquareToSource -= (radius * radius);

            } else {
                // calculate the angle of delivery for off-axis attenuation
                glm::vec3 rotatedListenerPosition = glm::inverse(bufferToAdd->getOrientation()) * relativePosition;

                float angleOfDelivery = glm::angle(glm::vec3(0.0f, 0.0f, -1.0f),
                                                   glm::normalize(rotatedListenerPosition));

                const float MAX_OFF_AXIS_ATTENUATION = 0.2f;
                const float OFF_AXIS_ATTENUATION_FORMULA_STEP = (1 - MAX_OFF_AXIS_ATTENUATION) / 2.0f;

                float offAxisCoefficient = MAX_OFF_AXIS_ATTENUATION +
                    (OFF_AXIS_ATTENUATION_FORMULA_STEP * (angleOfDelivery / PI_OVER_TWO));

                // multiply the current attenuation coefficient by the calculated off axis coefficient
                attenuationCoefficient *= offAxisCoefficient;
            }

            glm::vec3 rotatedSourcePosition = inverseOrientation * relativePosition;

            const float DISTANCE_SCALE = 2.5f;
            const float GEOMETRIC_AMPLITUDE_SCALAR = 0.3f;
            const float DISTANCE_LOG_BASE = 2.5f;
            const float DISTANCE_SCALE_LOG = logf(DISTANCE_SCALE) / logf(DISTANCE_LOG_BASE);

            // calculate the distance coefficient using the distance to this node
            float distanceCoefficient = powf(GEOMETRIC_AMPLITUDE_SCALAR,
                                             DISTANCE_SCALE_LOG +
                                             (0.5f * logf(distanceSquareToSource) / logf(DISTANCE_LOG_BASE)) - 1);
            distanceCoefficient = std::min(1.0f, distanceCoefficient);

            // multiply the current attenuation coefficient by the distance coefficient
            attenuationCoefficient *= distanceCoefficient;

            // project the rotated source position vector onto the XZ plane
            rotatedSourcePosition.y = 0.0f;

            // produce an oriented angle about the y-axis
            bearingRelativeAngleToSource = glm::orientedAngle(glm::vec3(0.0f, 0.0f, -1.0f),
                                                              glm::normalize(rotatedSourcePosition),
                                                              glm::vec3(0.0f, 1.0f, 0.0f));

            const float PHASE_AMPLITUDE_RATIO_AT_90 = 0.5;

            // figure out the number of samples of delay and the ratio of the amplitude
            // in the weak channel for audio spatialization
            float sinRatio = fabsf(sinf(bearingRelativeAngleToSource));
            numSamplesDelay = SAMPLE_PHASE_DELAY_AT_90 * sinRatio;
            weakChannelAmplitudeRatio = 1 - (PHASE_AMPLITUDE_RATIO_AT_90 * sinRatio);
        }
    }

    // if the bearing relative angle to source is > 0 then the delayed channel is the right one
    int delayedChannelOffset = (bearingRelativeAngleToSource > 0.0f) ? 1 : 0;
    int goodChannelOffset = delayedChannelOffset == 0 ? 1 : 0;
    
    const int16_t* nextOutputStart = bufferToAdd->getNextOutput();
   
    const int16_t* bufferStart = bufferToAdd->getBuffer();
    int ringBufferSampleCapacity = bufferToAdd->getSampleCapacity();

    int16_t correctBufferSample[2], delayBufferSample[2];
    int delayedChannelIndex = 0;
    
    const int SINGLE_STEREO_OFFSET = 2;
    
    for (int s = 0; s < NETWORK_BUFFER_LENGTH_SAMPLES_STEREO; s += 4) {
        
        // setup the int16_t variables for the two sample sets
        correctBufferSample[0] = nextOutputStart[s / 2] * attenuationCoefficient;
        correctBufferSample[1] = nextOutputStart[(s / 2) + 1] * attenuationCoefficient;
        
        delayedChannelIndex = s + (numSamplesDelay * 2) + delayedChannelOffset;
        
        delayBufferSample[0] = correctBufferSample[0] * weakChannelAmplitudeRatio;
        delayBufferSample[1] = correctBufferSample[1] * weakChannelAmplitudeRatio;
        
        __m64 bufferSamples = _mm_set_pi16(_clientSamples[s + goodChannelOffset],
                                           _clientSamples[s + goodChannelOffset + SINGLE_STEREO_OFFSET],
                                           _clientSamples[delayedChannelIndex],
                                           _clientSamples[delayedChannelIndex + SINGLE_STEREO_OFFSET]);
        __m64 addedSamples = _mm_set_pi16(correctBufferSample[0], correctBufferSample[1],
                                          delayBufferSample[0], delayBufferSample[1]);
        
        // perform the MMX add (with saturation) of two correct and delayed samples
        __m64 mmxResult = _mm_adds_pi16(bufferSamples, addedSamples);
        int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);
        
        // assign the results from the result of the mmx arithmetic
        _clientSamples[s + goodChannelOffset] = shortResults[3];
        _clientSamples[s + goodChannelOffset + SINGLE_STEREO_OFFSET] = shortResults[2];
        _clientSamples[delayedChannelIndex] = shortResults[1];
        _clientSamples[delayedChannelIndex + SINGLE_STEREO_OFFSET] = shortResults[0];
    }
    
    // The following code is pretty gross and redundant, but AFAIK it's the best way to avoid
    // too many conditionals in handling the delay samples at the beginning of _clientSamples.
    // Basically we try to take the samples in batches of four, and then handle the remainder
    // conditionally to get rid of the rest.
    
    const int DOUBLE_STEREO_OFFSET = 4;
    const int TRIPLE_STEREO_OFFSET = 6;
    
    if (numSamplesDelay > 0) {
        // if there was a sample delay for this buffer, we need to pull samples prior to the nextOutput
        // to stick at the beginning
        float attenuationAndWeakChannelRatio = attenuationCoefficient * weakChannelAmplitudeRatio;
        const int16_t* delayNextOutputStart = nextOutputStart - numSamplesDelay;
        if (delayNextOutputStart < bufferStart) {
            delayNextOutputStart = bufferStart + ringBufferSampleCapacity - numSamplesDelay;
        }
        
        int i = 0;
        
        while (i + 3 < numSamplesDelay) {
            // handle the first cases where we can MMX add four samples at once
            int parentIndex = i * 2;
            __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset],
                                               _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset],
                                               _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset],
                                               _clientSamples[parentIndex + TRIPLE_STEREO_OFFSET + delayedChannelOffset]);
            __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 1] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 2] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 3] * attenuationAndWeakChannelRatio);
            __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples);
            int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);
            
            _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3];
            _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[2];
            _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[1];
            _clientSamples[parentIndex + TRIPLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[0];
            
            // push the index
            i += 4;
        }
        
        int parentIndex = i * 2;
        
        if (i + 2 < numSamplesDelay) {
            // MMX add only three delayed samples
            
            __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset],
                                               _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset],
                                               _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset],
                                               0);
            __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 1] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 2] * attenuationAndWeakChannelRatio,
                                            0);
            __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples);
            int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);
            
            _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3];
            _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[2];
            _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[1];
            
        } else if (i + 1 < numSamplesDelay) {
            // MMX add two delayed samples
            __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset],
                                               _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset], 0, 0);
            __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 1] * attenuationAndWeakChannelRatio, 0, 0);
            
            __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples);
            int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);
            
            _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3];
            _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[2];
            
        } else if (i < numSamplesDelay) {
            // MMX add a single delayed sample
            __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset], 0, 0, 0);
            __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio, 0, 0, 0);
            
            __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples);
            int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);
            
            _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3];
        }
    }
}
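
Note that _mm_set_pi16 places its first argument in the highest 16-bit lane, which is why each store above reads shortResults[3] for the first packed sample and shortResults[0] for the last. A minimal illustration (hypothetical):

__m64 v = _mm_set_pi16(10, 20, 30, 40);
int16_t* lanes = reinterpret_cast<int16_t*>(&v);
// lanes[0] == 40, lanes[1] == 30, lanes[2] == 20, lanes[3] == 10
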
Example 8
void reverb::comb_allpass4(signed short *sp,
														signed short *dp,
														const comb_param &comb_delay,
														const int comb_gain,
														const int allpass_delay,
														const int allpass_gain,
														const int *rvol,
														const unsigned int sz)
{
#ifdef use_intrinsics
	__m64   cg=_mm_set1_pi16(comb_gain),
				ag=_mm_set1_pi16(allpass_gain),
				rv[2];
	rv[0]=_mm_set1_pi16(rvol[0]);
	rv[1]=_mm_set1_pi16(rvol[1]);

	for (unsigned int i=0; i<(sz>>4); i++, sp+=2<<2, dp+=2<<2)
	{
		__m64 dv[2];

		for (int c=0; c<2; c++)
		{
			// Comb

			__m64 v=_mm_setzero_si64();

			for (int f=0; f<4; f++)
			{
				int yck=(yp-comb_delay[c][f])&(max_delay-1);
				__m64 xv=*(__m64 *)(&x[c][yck]),
							yv=*(__m64 *)(&y[c][f][yck]);
				yv=_mm_mulhi_pi16(yv,cg);
				yv=_mm_adds_pi16(yv,yv);
				yv=_mm_adds_pi16(xv,yv);
				*((__m64 *)&y[c][f][yp])=yv;
				yv=_mm_srai_pi16(yv,2);
				v=_mm_adds_pi16(v,yv);
			}

			// Allpass

			if (allpass_delay)
			{
				*((__m64 *)&ax[c][yp])=v;

				int ypa=(yp-allpass_delay)&(max_delay-1);
				__m64 ayv=*(__m64 *)&ay[c][ypa],
								xv=*(__m64 *)&x[c][yp],
								axv=*(__m64 *)&ax[c][ypa];

				ayv=_mm_subs_pi16(ayv,xv);
				ayv=_mm_mulhi_pi16(ayv,ag);
				ayv=_mm_adds_pi16(ayv,ayv);
				v=_mm_adds_pi16(ayv,axv);
				*((__m64 *)&ay[c][yp])=v;
			}

			// Output

			dv[c]=_mm_mulhi_pi16(v,rv[c]);
			dv[c]=_mm_adds_pi16(dv[c],dv[c]);
		}

		__m64 dv1=_mm_unpacklo_pi16(dv[0],dv[1]),
					dv2=_mm_unpackhi_pi16(dv[0],dv[1]),
					d1=*(__m64 *)&dp[0],
					d2=*(__m64 *)&dp[4],
					s1=*(__m64 *)&sp[0],
					s2=*(__m64 *)&sp[4];
		d1=_mm_adds_pi16(d1,s1);
		d2=_mm_adds_pi16(d2,s2);
		d1=_mm_adds_pi16(d1,dv1);
		d2=_mm_adds_pi16(d2,dv2);
		*(__m64 *)&dp[0]=d1;
		*(__m64 *)&dp[4]=d2;

		yp=(yp+4)&(max_delay-1);
	}

	_mm_empty();
#endif
}
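
_mm_mulhi_pi16 keeps the high 16 bits of each 32-bit product, i.e. (a*b) >> 16; the saturating self-add that follows doubles that, giving roughly the Q15 product (a*b) >> 15 (the bit lost to the high-half extraction is dropped). A scalar model of the pair (hypothetical helper):

// Scalar model of the mulhi + adds pair used above: multiply a sample
// by a Q15 gain. Hypothetical, for illustration only.
static signed short mul_q15(signed short sample, signed short gain)
{
	signed short hi = (signed short) (((int) sample * gain) >> 16);	// _mm_mulhi_pi16
	int dbl = (int) hi + hi;										// _mm_adds_pi16(yv, yv)
	if (dbl > 32767) dbl = 32767;
	if (dbl < -32768) dbl = -32768;
	return (signed short) dbl;
}
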
Example 9
__m64 test_mm_adds_pi16(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_adds_pi16
  // CHECK: call x86_mmx @llvm.x86.mmx.padds.w
  return _mm_adds_pi16(a, b);
}