Example #1
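// Alpha-blends a solid color over a w-by-h rectangle of a 32-bit ARGB canvas,
// processing two pixels per MMX iteration with a scalar pass for any odd remainder.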
void r_dimpatchD_MMX(const DCanvas *const cvs, argb_t color, int alpha, int x1, int y1, int w, int h)
{
	int x, y, i;
	argb_t *line;
	int invAlpha = 256 - alpha;

	int dpitch = cvs->pitch / sizeof(DWORD);
	line = (argb_t *)cvs->buffer + y1 * dpitch;

	int batches = w / 2;
	int remainder = w & 1;

	// MMX temporaries:
	const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff);
	const __m64 blendAlpha = _mm_set_pi16(0, alpha, alpha, alpha);
	const __m64 blendInvAlpha = _mm_set_pi16(0, invAlpha, invAlpha, invAlpha);
	const __m64 blendColor = _mm_set_pi16(0, RPART(color), GPART(color), BPART(color));
	const __m64 blendMult = _mm_mullo_pi16(blendColor, blendAlpha);

	for (y = y1; y < y1 + h; y++)
	{
		// MMX optimize the bulk in batches of 2 colors:
		for (i = 0, x = x1; i < batches; ++i, x += 2)
		{
#if 1
			const __m64 input = _mm_setr_pi32(line[x + 0], line[x + 1]);
#else
			// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
			const __m64 input = *((__m64 *)&line[x]);
#endif
			const __m64 output = blend2vs1_mmx(input, blendMult, blendInvAlpha, upper8mask);
#if 1
			line[x+0] = _mm_cvtsi64_si32(_mm_srli_si64(output, 32*0));
			line[x+1] = _mm_cvtsi64_si32(_mm_srli_si64(output, 32*1));
#else
			// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
			*((__m64 *)&line[x]) = output;
#endif
		}

		if (remainder)
		{
			// Pick up the remainder:
			for (; x < x1 + w; x++)
			{
				line[x] = alphablend1a(line[x], color, alpha);
			}
		}

		line += dpitch;
	}

	// Required to reset FP:
	_mm_empty();
}
Example #2
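/* Element-wise 64-bit addition of a[] and b[] with carry propagated across
   elements: MMX has no add-with-carry, so the carry out of each 64-bit sum is
   reconstructed from the operands' bits; the final carry is returned. */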
__m64
unsigned_add3 (const __m64 * a, const __m64 * b,
	       __m64 * result, unsigned int count)
{
  __m64 _a, _b, one, sum, carry, onesCarry;

  unsigned int i;

  carry = _mm_setzero_si64 ();

  one = _mm_cmpeq_pi8 (carry, carry);
  one = _mm_sub_si64 (carry, one);

  for (i = 0; i < count; i++)
    {
      _a = a[i];
      _b = b[i];

      sum = _mm_add_si64 (_a, _b);
      sum = _mm_add_si64 (sum, carry);

      result[i] = sum;

      onesCarry = _mm_and_si64 (_mm_xor_si64 (_a, _b), carry);
      onesCarry = _mm_or_si64 (_mm_and_si64 (_a, _b), onesCarry);
      onesCarry = _mm_and_si64 (onesCarry, one);

      _a = _mm_srli_si64 (_a, 1);
      _b = _mm_srli_si64 (_b, 1);

      carry = _mm_add_si64 (_mm_add_si64 (_a, _b), onesCarry);
      carry = _mm_srli_si64 (carry, 63);
    }

  return carry;
}
Example #3
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YV12 to RGB24.
VOID Yv12ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride,
					 PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV, INT iSrcYStride, INT iSrcUvStride,
					 UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcYDif;
	INT iSrcUvDif;
	INT yy, bu, guv, rv;

	M64 y0, y1, u0, v0, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 rgb0, rgb1, rgb2, rgb3;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbDstX += (iHeight - 1) * iDstXStride;
		iDstXStride = -iDstXStride;
	}

	iDstXDif = iDstXStride - (uWidth * 3);
	iSrcYDif = iSrcYStride - uWidth;
	iSrcUvDif = iSrcUvStride - (uWidth / 2);

	mz = _mm_setzero_si64();
	for (y = iHeight / 2; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			// Calculate coefficient.
			u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU));	// [	| u3 u2 u1 u0]
			v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV));	// [	| v3 v2 v1 v0]

			u0 = _mm_unpacklo_pi8(u0, mz);				// u3 u2 u1 u0
			v0 = _mm_unpacklo_pi8(v0, mz);				// v3 v2 v1 v0
			u0 = _mm_subs_pi16(u0, g_mSub80);
			v0 = _mm_subs_pi16(v0, g_mSub80);

			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);

			guv0 = _mm_adds_pi16(gu0, gv0);

			guv1 = _mm_unpackhi_pi16(guv0, guv0);		// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);		// guv1 guv1 guv0 guv0

			bu1 = _mm_unpackhi_pi16(bu0, bu0);			// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);			// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);			// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);			// rv1 rv1 rv0 rv0

			// Process for row 0.
			y0 = *((PM64) pbSrcY);						// [YYYY | YYYY]; row 0
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);				// g7 g6 ...g1 g0

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);

			r1 = _mm_unpackhi_pi8(b0, r0);				// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);				// r3 b3 r2 b2 r1 b1 r0 b0

			g1 = _mm_unpackhi_pi8(g0, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);				// 0 g3 0 g2 0 g1 0 g0

			rgb0 = _mm_unpacklo_pi8(r0, g0);			// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);			// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);			// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);			// 0 r7 g7 b7 0 r6 g6 b6

			// Write out row 0.
			*((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3);

			// Process for row 1.
			y0 = *((PM64) (pbSrcY + iSrcYStride));		// [YYYY | YYYY]; row 1
			y1 = _mm_unpackhi_pi8(y0, mz);				// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);				// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);				// g7 g6 ...g1 g0

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);

			r1 = _mm_unpackhi_pi8(b0, r0);				// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);				// r3 b3 r2 b2 r1 b1 r0 b0

			g1 = _mm_unpackhi_pi8(g0, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);				// 0 g3 0 g2 0 g1 0 g0

			rgb0 = _mm_unpacklo_pi8(r0, g0);			// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);			// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);			// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);			// 0 r7 g7 b7 0 r6 g6 b6

			// Write out row 1.
			*((PDWORD) (pbDstX + iDstXStride + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + iDstXStride + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + iDstXStride + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + iDstXStride + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + iDstXStride + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + iDstXStride + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + iDstXStride + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + iDstXStride + 21)) = _mm_cvtsi64_si32(rgb3);

			pbDstX += 24;
			pbSrcY += 8;
			pbSrcU += 4;
			pbSrcV += 4;
		}

		for (x = (uWidth & 7) / 2; x; x--)
		{
			bu = g_iBUTab[pbSrcU[0]];
			guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]];
			rv = g_iRVTab[pbSrcV[0]];

			yy = g_iYYTab[pbSrcY[0]];
			pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcY[1]];
			pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcY[iSrcYStride]];
			pbDstX[iDstXStride + 0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 2] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcY[iSrcYStride + 1]];
			pbDstX[iDstXStride + 3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[iDstXStride + 5] = _Clip((yy + rv) >> SCALEBITS_OUT);

			pbDstX += 6;
			pbSrcY += 2;
			pbSrcU++;
			pbSrcV++;
		}

		pbDstX += iDstXDif + iDstXStride;
		pbSrcY += iSrcYDif + iSrcYStride;
		pbSrcU += iSrcUvDif;
		pbSrcV += iSrcUvDif;
	}

	_mm_empty();
}
Example #4
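// Blends four background pixels in dest with four colormapped source pixels:
// each output channel is (bg * bga + fg * fga) >> 8, repacked with saturation.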
void rtv_lucent4cols_MMX(byte *source, argb_t *dest, int bga, int fga)
{
	// MMX temporaries:
	const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff);
	const __m64 fgAlpha = _mm_set_pi16(0, fga, fga, fga);
	const __m64 bgAlpha = _mm_set_pi16(0, bga, bga, bga);

#if 1
	const __m64 bgColors01 = _mm_setr_pi32(dest[0], dest[1]);
#else
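	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.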
	const __m64 bgColors01 = *((__m64 *)&dest[0]);
#endif
	const __m64 fgColors01 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[0]),
		rt_mapcolor<argb_t>(dcol.colormap, source[1])
	);

	const __m64 finalColors01 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		)
	);

#if 1
	const __m64 bgColors23 = _mm_setr_pi32(dest[2], dest[3]);
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	const __m64 bgColors23 = *((__m64 *)&dest[2]);
#endif
	const __m64 fgColors23 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[2]),
		rt_mapcolor<argb_t>(dcol.colormap, source[3])
	);

	const __m64 finalColors23 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		)
	);
	
#if 1
	dest[0] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*0));
	dest[1] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*1));
	dest[2] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*0));
	dest[3] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*1));
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	*((__m64 *)&dest[0]) = finalColors01;
	*((__m64 *)&dest[2]) = finalColors23;
#endif

	// Required to reset FP:
	_mm_empty();
}
Example #5
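// Compiler codegen test: _mm_srli_si64 with an immediate count should emit psrlq.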
__m64 test72(__m64 a) {
  // CHECK: psrlq
  return _mm_srli_si64(a, 3);
}
Example #6
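/* Reverses each row of a single-channel U8 image: two 8-byte loads per
 * iteration are byte-reversed with three unpack/shift rounds (16 pixels at
 * a time), and any leftover pixels are copied one byte at a time. */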
void
mlib_m_ImageFlipY_U8_1(
    mlib_image *dst,
    const mlib_image *src)
{
	PREPAREVARS(mlib_u8, 1);

	if (sw < 16) {
		STRIP_1(ddata, sdata, sw, sh, mlib_u8);

		return;
	}

	for (i = 0; i < sh; i++) {
		mlib_u8 *psrc_row = sdata + i * sstride + sw - 1;
		mlib_u8 *pdst_row = ddata + i * dstride;
		__m64 s0, s1, d0, d1;

		s0 = *(__m64 *) (psrc_row - 7);
		s1 = *(__m64 *) (psrc_row - 15);

		for (j = 0; j <= (sw - 32); j += 16) {
			d0 = _mm_unpacklo_pi8(_mm_srli_si64(s0, 32), s0);
			d1 = _mm_unpacklo_pi8(_mm_srli_si64(s1, 32), s1);
			d0 = _mm_unpacklo_pi8(_mm_srli_si64(d0, 32), d0);
			d1 = _mm_unpacklo_pi8(_mm_srli_si64(d1, 32), d1);
			d0 = _mm_unpacklo_pi8(_mm_srli_si64(d0, 32), d0);
			d1 = _mm_unpacklo_pi8(_mm_srli_si64(d1, 32), d1);

			s0 = *(__m64 *) (psrc_row - 23);
			s1 = *(__m64 *) (psrc_row - 31);

			*(__m64 *) (pdst_row) = d0;
			*(__m64 *) (pdst_row + 8) = d1;

			psrc_row -= 16;
			pdst_row += 16;
		}

		if (j <= (sw - 16)) {
			d0 = _mm_unpacklo_pi8(_mm_srli_si64(s0, 32), s0);
			d1 = _mm_unpacklo_pi8(_mm_srli_si64(s1, 32), s1);
			d0 = _mm_unpacklo_pi8(_mm_srli_si64(d0, 32), d0);
			d1 = _mm_unpacklo_pi8(_mm_srli_si64(d1, 32), d1);
			d0 = _mm_unpacklo_pi8(_mm_srli_si64(d0, 32), d0);
			d1 = _mm_unpacklo_pi8(_mm_srli_si64(d1, 32), d1);

			*(__m64 *) (pdst_row) = d0;
			*(__m64 *) (pdst_row + 8) = d1;

			psrc_row -= 16;
			pdst_row += 16;
			j += 16;
		}

		for (; j < sw; j++) {
			(*pdst_row++) = *psrc_row--;
		}
	}

	_mm_empty();
}
Example #7
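/* Reverses each row of a single-channel 16-bit image: two 4-element loads
 * per iteration are word-reversed with two unpack/shift rounds (8 pixels at
 * a time), with a scalar tail for leftovers. */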
void
mlib_m_ImageFlipY_S16_1(
    mlib_image *dst,
    const mlib_image *src)
{
	PREPAREVARS(mlib_u16, 1);

	if (sw < 8) {
		STRIP_1(ddata, sdata, sw, sh, mlib_u16);

		return;
	}

	sdata += sw - 1;

	for (i = 0; i < sh; i++) {
		mlib_u16 *psrc_row = sdata;
		mlib_u16 *pdst_row = ddata;
		__m64 s0, s1, d0, d1;

		s0 = *(__m64 *) (psrc_row - 3);
		s1 = *(__m64 *) (psrc_row - 7);

		for (j = 0; j <= (sw - 16); j += 8) {
			d0 = _mm_unpacklo_pi16(_mm_srli_si64(s0, 32), s0);
			d1 = _mm_unpacklo_pi16(_mm_srli_si64(s1, 32), s1);
			d0 = _mm_unpacklo_pi16(_mm_srli_si64(d0, 32), d0);
			d1 = _mm_unpacklo_pi16(_mm_srli_si64(d1, 32), d1);

			s0 = *(__m64 *) (psrc_row - 11);
			s1 = *(__m64 *) (psrc_row - 15);

			*(__m64 *) (pdst_row) = d0;
			*(__m64 *) (pdst_row + 4) = d1;

			psrc_row -= 8;
			pdst_row += 8;
		}

		if (j <= (sw - 8)) {
			d0 = _mm_unpacklo_pi16(_mm_srli_si64(s0, 32), s0);
			d1 = _mm_unpacklo_pi16(_mm_srli_si64(s1, 32), s1);
			d0 = _mm_unpacklo_pi16(_mm_srli_si64(d0, 32), d0);
			d1 = _mm_unpacklo_pi16(_mm_srli_si64(d1, 32), d1);

			*(__m64 *) (pdst_row) = d0;
			*(__m64 *) (pdst_row + 4) = d1;

			psrc_row -= 8;
			pdst_row += 8;
			j += 8;
		}

		for (; j < sw; j++) {
			(*pdst_row++) = *psrc_row--;
		}

		sdata = (void *)((mlib_u8 *)sdata + sstride);
		ddata = (void *)((mlib_u8 *)ddata + dstride);
	}

	_mm_empty();
}
Example #8
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YUY2 to RGB24.
VOID Yuy2ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcX, INT iSrcXStride, UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcXDif;
	INT yy, bu, guv, rv;

	M64 y0, y1, u0, v0, uv_temp1, uv_temp2, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 rgb0, rgb1, rgb2, rgb3;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbSrcX += (iHeight - 1) * iSrcXStride;
		iSrcXStride = -iSrcXStride;
	}

	iDstXDif = iDstXStride - (uWidth * 3);
	iSrcXDif = iSrcXStride - (uWidth * 2);

	mz = _mm_setzero_si64();
	for (y = iHeight; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			y0 = ((PM64) pbSrcX)[0];
			y1 = ((PM64) pbSrcX)[1];

			u0 = y0;
			v0 = y1;

			y0 = _mm_and_si64(y0, g_mWord00FF);
			y1 = _mm_and_si64(y1, g_mWord00FF);

			u0 = _mm_srli_pi16(u0, 8);
			v0 = _mm_srli_pi16(v0, 8);

			uv_temp1 = _mm_srli_pi32(u0, 16);
			u0 = _mm_slli_pi32(u0, 16);
			u0 = _mm_srli_pi32(u0, 16);

			uv_temp2 = _mm_srli_pi32(v0, 16);
			v0 = _mm_slli_pi32(v0, 16);
			v0 = _mm_srli_pi32(v0, 16);

			u0 = _mm_packs_pi32(u0, v0);
			v0 = _mm_packs_pi32(uv_temp1, uv_temp2);
			// Calculate coefficient.
			u0 = _mm_subs_pi16(u0, g_mSub80);
			v0 = _mm_subs_pi16(v0, g_mSub80);

			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);

			guv0 = _mm_adds_pi16(gu0, gv0);

			guv1 = _mm_unpackhi_pi16(guv0, guv0);		// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);		// guv1 guv1 guv0 guv0

			bu1 = _mm_unpackhi_pi16(bu0, bu0);			// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);			// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);			// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);			// rv1 rv1 rv0 rv0

			// Process for row 0.
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			g1 = _mm_subs_pi16(y1, guv1);				// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);				// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g0 = _mm_packs_pu16(g0, g1);				// g7 g6 ...g1 g0

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b0 = _mm_packs_pu16(b0, b1);

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r0 = _mm_packs_pu16(r0, r1);

			r1 = _mm_unpackhi_pi8(b0, r0);				// r7 b7 r6 b6 r5 b5 r4 b4
			r0 = _mm_unpacklo_pi8(b0, r0);				// r3 b3 r2 b2 r1 b1 r0 b0

			g1 = _mm_unpackhi_pi8(g0, mz);				// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);				// 0 g3 0 g2 0 g1 0 g0

			rgb0 = _mm_unpacklo_pi8(r0, g0);			// 0 r1 g1 b1 0 r0 g0 b0
			rgb1 = _mm_unpackhi_pi8(r0, g0);			// 0 r3 g3 b3 0 r2 g2 b2
			rgb2 = _mm_unpacklo_pi8(r1, g1);			// 0 r5 g5 b5 0 r4 g4 b4
			rgb3 = _mm_unpackhi_pi8(r1, g1);			// 0 r7 g7 b7 0 r6 g6 b6

			// Write out row 0.
			*((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32);
			*((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0);
			*((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32);
			*((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1);
			*((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32);
			*((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2);
			*((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32);
			*((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3);

			pbDstX += 24;
			pbSrcX += 16;
		}

		for (x = (uWidth & 7) / 2; x; x--)
		{
			bu = g_iBUTab[pbSrcX[1]];
			guv = g_iGUTab[pbSrcX[1]] + g_iGVTab[pbSrcX[3]];
			rv = g_iRVTab[pbSrcX[3]];

			yy = g_iYYTab[pbSrcX[0]];
			pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT);

			yy = g_iYYTab[pbSrcX[2]];
			pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT);
			pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT);
			pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT);

			pbDstX += 6;
			pbSrcX += 4;
		}

		pbDstX += iDstXDif;
		pbSrcX += iSrcXDif;
	}

	_mm_empty();
}
Example #9
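// Converts packed UYVY 4:2:2 into planar YUV 4:2:2 (a full-width Y plane
// followed by half-width U and V planes), optionally shifting the picture
// down one line to get bottom-field-first order for PAL DV50.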
void uyvy_to_yuv422(int width, int height, int shift_picture_down, const uint8_t *input, uint8_t *output)
{
	__m64 chroma_mask = _mm_set_pi8(255, 0, 255, 0, 255, 0, 255, 0);
	__m64 luma_mask = _mm_set_pi8(0, 255, 0, 255, 0, 255, 0, 255);
	const uint8_t *orig_input = input;
	uint8_t *y_comp = output;
	uint8_t *u_comp = output + width * height;
	uint8_t *v_comp = u_comp + (int)((width * height)/2);	// 4:2:2
	int i, j;

	// When preparing video for PAL DV50 encoding, the video must be shifted
	// down by one line to change the field order to be bottom-field-first
	int start_line = 0;
	if (shift_picture_down) {
		memset(y_comp, 0x10, width);		// write one line of black Y
		y_comp += width;
		memset(u_comp, 0x80, width/2);		// write one line of black U,V
		u_comp += width/2;
		memset(v_comp, 0x80, width/2);		// write one line of black U,V
		v_comp += width/2;
		start_line = 1;
	}

	/* Do the y component */
	for (j = start_line; j < height; j++)
	{
		// Consume 16 bytes of UYVY data per iteration (8 pixels worth)
		for (i = 0; i < width*2; i += 16)
		{
			//__m64 m1 = _mm_and_si64 (*(__m64 *)input, luma_mask);
			//__m64 m2 = _mm_and_si64 (*(__m64 *)(input+8), luma_mask);
			//__m64 m2 = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 0, 0);
			//*(__m64 *)y_comp = _mm_packs_pu16 (m2, m1);
			__m64 m0 = *(__m64 *)input;
			__m64 m2 = _mm_srli_si64(m0, 8);
			__m64 m3 = _mm_slli_si64(m0, 8);
			m3 = _mm_and_si64 (m3, chroma_mask);
			m2 = _mm_and_si64 (m2, luma_mask);
			m2 = _mm_or_si64 (m2, m3);
			m2 = _mm_and_si64 (m2, luma_mask);
			m0 = m2;
			__m64 m1 = *(__m64 *)(input+8);
			m2 = _mm_srli_si64(m1, 8);
			m3 = _mm_slli_si64(m1, 8);
			m3 = _mm_and_si64 (m3, chroma_mask);
			m2 = _mm_and_si64 (m2, luma_mask);
			m2 = _mm_or_si64 (m2, m3);
			m2 = _mm_and_si64 (m2, luma_mask);
			m1 = m2;
			*(__m64 *)y_comp = _mm_packs_pu16 (m0, m1);

			y_comp += 8;
			input += 16;
		}
	}
	/* Do the chroma components */
	input = orig_input;
	for (j = start_line; j < height; j++)
	{
		/* Process every line for yuv 4:2:2 */
		for (i = 0; i < width*2; i += 16)
		{
			__m64 m1 = _mm_unpacklo_pi8 (*(__m64 *)input, *(__m64 *)(input+8));
			__m64 m2 = _mm_unpackhi_pi8 (*(__m64 *)input, *(__m64 *)(input+8));

			__m64 m3 = _mm_unpacklo_pi8 (m1, m2);
			__m64 m4 = _mm_unpackhi_pi8 (m1, m2);
			//*(__m64 *)u_comp = _mm_unpacklo_pi8 (m1, m2);
			//*(__m64 *)v_comp = _mm_unpackhi_pi8 (m1, m2);
			memcpy (u_comp, &m3, 4);
			memcpy (v_comp, &m4, 4);
			u_comp += 4;
			v_comp += 4;
			input += 16;
		}
	}
	_mm_empty();        // Clear aliased fp register state
}
Example #10
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YUY2 to YV12.
VOID Yuy2ToYv12_mmx(PBYTE pbDstY, PBYTE pbDstU, PBYTE pbDstV, INT iDstYStride, INT iDstUvStride,
					PBYTE pbSrcX, INT iSrcXStride, UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iSrcXDif;
	INT iDstYDif;
	INT iDstUvDif;
	M64 m0, m1, m2, m3, m4, m5, m6, m7;

	if (iHeight < 0)
	{
		iHeight = -iHeight;
		pbSrcX += (iHeight - 1) * iSrcXStride;
		iSrcXStride = -iSrcXStride;
	}

	iSrcXDif = iSrcXStride - (uWidth * 2);
	iDstYDif = iDstYStride - uWidth;
	iDstUvDif = iDstUvStride - (uWidth / 2);

	m7 = g_mWord00FF;
	for (y = iHeight / 2; y; y--)
	{
		for (x = uWidth / 8; x; x--)
		{
			m0 = ((PM64) pbSrcX)[0];
			m1 = ((PM64) pbSrcX)[1];
			m2 = ((PM64) (pbSrcX + iSrcXStride))[0];
			m3 = ((PM64) (pbSrcX + iSrcXStride))[1];

			m4 = m0;
			m5 = m2;

			m4 = _mm_srli_pi16(m4, 8);
			m5 = _mm_srli_pi16(m5, 8);

			m4 = _mm_and_si64(m4, m7);
			m5 = _mm_and_si64(m5, m7);

			m4 = _mm_add_pi16(m4, m5);

			m5 = m1;
			m6 = m3;

			m5 = _mm_srli_pi16(m5, 8);
			m6 = _mm_srli_pi16(m6, 8);
			m5 = _mm_and_si64(m5, m7);
			m6 = _mm_and_si64(m6, m7);

			m5 = _mm_add_pi16(m5, m6);

			m4 = _mm_add_pi16(m4, g_mWord0001);
			m5 = _mm_add_pi16(m5, g_mWord0001);

			m4 = _mm_srli_pi16(m4, 1);
			m5 = _mm_srli_pi16(m5, 1);

			m0 = _mm_and_si64(m0, m7);
			m1 = _mm_and_si64(m1, m7);
			m2 = _mm_and_si64(m2, m7);
			m3 = _mm_and_si64(m3, m7);

			m0 = _mm_packs_pu16(m0, m1);
			m2 = _mm_packs_pu16(m2, m3);

			((PM64) pbDstY)[0] = m0;
			((PM64) (pbDstY + iDstYStride))[0] = m2;

			m4 = _mm_packs_pu16(m4, m5);
			m5 = m4;

			m4 = _mm_srli_si64(m4, 8);

			m5 = _mm_and_si64(m5, m7);
			m4 = _mm_and_si64(m4, m7);

			m5 = _mm_packs_pu16(m5, m5);
			m4 = _mm_packs_pu16(m4, m4);

			((PDWORD) pbDstU)[0] = _mm_cvtsi64_si32(m5);
			((PDWORD) pbDstV)[0] = _mm_cvtsi64_si32(m4);

			pbSrcX += 16;
			pbDstY += 8;
			pbDstU += 4;
			pbDstV += 4;
		}

		for (x = (uWidth & 7) / 2; x; x--)
		{
			pbDstY[0] = pbSrcX[0];
			pbDstU[0] = (pbSrcX[1] + pbSrcX[iSrcXStride + 1] + 1) / 2;
			pbDstY[1] = pbSrcX[2];
			pbDstV[0] = (pbSrcX[3] + pbSrcX[iSrcXStride + 3] + 1) / 2;

			pbDstY[iDstYStride + 0] = pbSrcX[iSrcXStride + 0];
			pbDstY[iDstYStride + 1] = pbSrcX[iSrcXStride + 2];

			pbSrcX += 4;
			pbDstY += 2;
			pbDstU++;
			pbDstV++;
		}

		pbSrcX += iSrcXDif + iSrcXStride;
		pbDstY += iDstYDif + iDstYStride;
		pbDstU += iDstUvDif;
		pbDstV += iDstUvDif;
	}

	_mm_empty();
}
Example #11
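/* SHA-384/SHA-512 compression function: byte-swaps the sixteen 64-bit message
 * words to big-endian, expands the schedule to 80 words, runs the 80 ROUND
 * steps, and adds the working variables back into the hash state. Note that
 * the OPTIMIZE_SSE2 path is actually written with MMX (__m64) intrinsics. */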
void sha384Process(register sha384Param* sp)
{
	#ifdef OPTIMIZE_SSE2 
	
	# if defined(_MSC_VER) || defined (__INTEL_COMPILER)
	static const __m64 MASK = { 0x00FF00FF00FF00FF };
	# elif defined(__GNUC__)
	static const __m64 MASK = { 0x00FF00FF, 0x00FF00FF };
	# else
	#  error
	# endif

	__m64 a, b, c, d, e, f, g, h, temp;
	register       __m64 *w;
	register const __m64 *k;
	register byte t;

	w = (__m64*) sp->data;
	t = 16;
	while (t--)
	{
		temp = *w;
		*(w++) = _m_pxor(
				_mm_slli_si64(_m_pshufw(_m_pand(temp, MASK), 27), 8),
				_m_pshufw(_m_pand(_mm_srli_si64(temp, 8), MASK), 27)
			);
	}

	t = 64;
	while (t--)
	{
		temp = _mm_add_si64(_mm_add_si64(sig1(w[-2]), w[-7]), _mm_add_si64(sig0(w[-15]), w[-16]));
		*(w++) = temp;
	}

	w = (__m64*) sp->h;

	a = w[0]; b = w[1]; c = w[2]; d = w[3];
	e = w[4]; f = w[5]; g = w[6]; h = w[7];

	w = (__m64*) sp->data;
	k = (__m64*) SHA2_64BIT_K;

	#else

	register uint64_t a, b, c, d, e, f, g, h, temp;
	register       uint64_t *w;
	register const uint64_t *k;
	register byte t;

	# if WORDS_BIGENDIAN
	w = sp->data + 16;
	# else
	w = sp->data;
	t = 16;
	while (t--)
	{
		temp = swapu64(*w);
		*(w++) = temp;
	}
	# endif

	t = 64;
	while (t--)
	{
		temp = sig1(w[-2]) + w[-7] + sig0(w[-15]) + w[-16];
		*(w++) = temp;
	}

	w = sp->data;

	a = sp->h[0]; b = sp->h[1]; c = sp->h[2]; d = sp->h[3];
	e = sp->h[4]; f = sp->h[5]; g = sp->h[6]; h = sp->h[7];

	k = SHA2_64BIT_K;
	#endif

	ROUND(a,b,c,d,e,f,g,h,w[ 0],k[ 0]);
	ROUND(h,a,b,c,d,e,f,g,w[ 1],k[ 1]);
	ROUND(g,h,a,b,c,d,e,f,w[ 2],k[ 2]);
	ROUND(f,g,h,a,b,c,d,e,w[ 3],k[ 3]);
	ROUND(e,f,g,h,a,b,c,d,w[ 4],k[ 4]);
	ROUND(d,e,f,g,h,a,b,c,w[ 5],k[ 5]);
	ROUND(c,d,e,f,g,h,a,b,w[ 6],k[ 6]);
	ROUND(b,c,d,e,f,g,h,a,w[ 7],k[ 7]);
	ROUND(a,b,c,d,e,f,g,h,w[ 8],k[ 8]);
	ROUND(h,a,b,c,d,e,f,g,w[ 9],k[ 9]);
	ROUND(g,h,a,b,c,d,e,f,w[10],k[10]);
	ROUND(f,g,h,a,b,c,d,e,w[11],k[11]);
	ROUND(e,f,g,h,a,b,c,d,w[12],k[12]);
	ROUND(d,e,f,g,h,a,b,c,w[13],k[13]);
	ROUND(c,d,e,f,g,h,a,b,w[14],k[14]);
	ROUND(b,c,d,e,f,g,h,a,w[15],k[15]);
	ROUND(a,b,c,d,e,f,g,h,w[16],k[16]);
	ROUND(h,a,b,c,d,e,f,g,w[17],k[17]);
	ROUND(g,h,a,b,c,d,e,f,w[18],k[18]);
	ROUND(f,g,h,a,b,c,d,e,w[19],k[19]);
	ROUND(e,f,g,h,a,b,c,d,w[20],k[20]);
	ROUND(d,e,f,g,h,a,b,c,w[21],k[21]);
	ROUND(c,d,e,f,g,h,a,b,w[22],k[22]);
	ROUND(b,c,d,e,f,g,h,a,w[23],k[23]);
	ROUND(a,b,c,d,e,f,g,h,w[24],k[24]);
	ROUND(h,a,b,c,d,e,f,g,w[25],k[25]);
	ROUND(g,h,a,b,c,d,e,f,w[26],k[26]);
	ROUND(f,g,h,a,b,c,d,e,w[27],k[27]);
	ROUND(e,f,g,h,a,b,c,d,w[28],k[28]);
	ROUND(d,e,f,g,h,a,b,c,w[29],k[29]);
	ROUND(c,d,e,f,g,h,a,b,w[30],k[30]);
	ROUND(b,c,d,e,f,g,h,a,w[31],k[31]);
	ROUND(a,b,c,d,e,f,g,h,w[32],k[32]);
	ROUND(h,a,b,c,d,e,f,g,w[33],k[33]);
	ROUND(g,h,a,b,c,d,e,f,w[34],k[34]);
	ROUND(f,g,h,a,b,c,d,e,w[35],k[35]);
	ROUND(e,f,g,h,a,b,c,d,w[36],k[36]);
	ROUND(d,e,f,g,h,a,b,c,w[37],k[37]);
	ROUND(c,d,e,f,g,h,a,b,w[38],k[38]);
	ROUND(b,c,d,e,f,g,h,a,w[39],k[39]);
	ROUND(a,b,c,d,e,f,g,h,w[40],k[40]);
	ROUND(h,a,b,c,d,e,f,g,w[41],k[41]);
	ROUND(g,h,a,b,c,d,e,f,w[42],k[42]);
	ROUND(f,g,h,a,b,c,d,e,w[43],k[43]);
	ROUND(e,f,g,h,a,b,c,d,w[44],k[44]);
	ROUND(d,e,f,g,h,a,b,c,w[45],k[45]);
	ROUND(c,d,e,f,g,h,a,b,w[46],k[46]);
	ROUND(b,c,d,e,f,g,h,a,w[47],k[47]);
	ROUND(a,b,c,d,e,f,g,h,w[48],k[48]);
	ROUND(h,a,b,c,d,e,f,g,w[49],k[49]);
	ROUND(g,h,a,b,c,d,e,f,w[50],k[50]);
	ROUND(f,g,h,a,b,c,d,e,w[51],k[51]);
	ROUND(e,f,g,h,a,b,c,d,w[52],k[52]);
	ROUND(d,e,f,g,h,a,b,c,w[53],k[53]);
	ROUND(c,d,e,f,g,h,a,b,w[54],k[54]);
	ROUND(b,c,d,e,f,g,h,a,w[55],k[55]);
	ROUND(a,b,c,d,e,f,g,h,w[56],k[56]);
	ROUND(h,a,b,c,d,e,f,g,w[57],k[57]);
	ROUND(g,h,a,b,c,d,e,f,w[58],k[58]);
	ROUND(f,g,h,a,b,c,d,e,w[59],k[59]);
	ROUND(e,f,g,h,a,b,c,d,w[60],k[60]);
	ROUND(d,e,f,g,h,a,b,c,w[61],k[61]);
	ROUND(c,d,e,f,g,h,a,b,w[62],k[62]);
	ROUND(b,c,d,e,f,g,h,a,w[63],k[63]);
	ROUND(a,b,c,d,e,f,g,h,w[64],k[64]);
	ROUND(h,a,b,c,d,e,f,g,w[65],k[65]);
	ROUND(g,h,a,b,c,d,e,f,w[66],k[66]);
	ROUND(f,g,h,a,b,c,d,e,w[67],k[67]);
	ROUND(e,f,g,h,a,b,c,d,w[68],k[68]);
	ROUND(d,e,f,g,h,a,b,c,w[69],k[69]);
	ROUND(c,d,e,f,g,h,a,b,w[70],k[70]);
	ROUND(b,c,d,e,f,g,h,a,w[71],k[71]);
	ROUND(a,b,c,d,e,f,g,h,w[72],k[72]);
	ROUND(h,a,b,c,d,e,f,g,w[73],k[73]);
	ROUND(g,h,a,b,c,d,e,f,w[74],k[74]);
	ROUND(f,g,h,a,b,c,d,e,w[75],k[75]);
	ROUND(e,f,g,h,a,b,c,d,w[76],k[76]);
	ROUND(d,e,f,g,h,a,b,c,w[77],k[77]);
	ROUND(c,d,e,f,g,h,a,b,w[78],k[78]);
	ROUND(b,c,d,e,f,g,h,a,w[79],k[79]);

	#ifdef OPTIMIZE_SSE2
	w = (__m64*) sp->h;
	w[0] = _mm_add_si64(w[0], a);
	w[1] = _mm_add_si64(w[1], b);
	w[2] = _mm_add_si64(w[2], c);
	w[3] = _mm_add_si64(w[3], d);
	w[4] = _mm_add_si64(w[4], e);
	w[5] = _mm_add_si64(w[5], f);
	w[6] = _mm_add_si64(w[6], g);
	w[7] = _mm_add_si64(w[7], h);
	_mm_empty();
	#else
	sp->h[0] += a;
	sp->h[1] += b;
	sp->h[2] += c;
	sp->h[3] += d;
	sp->h[4] += e;
	sp->h[5] += f;
	sp->h[6] += g;
	sp->h[7] += h;
	#endif
}
Example #12
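/* Finds the per-channel maximum of a 3-channel U8 image: three running __m64
 * maxima cover the 24-byte period of the channel interleave, edge masks
 * handle the row tail, and the final shift/mask merges collapse the three
 * vectors into one byte per channel. */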
void
mlib_m_ImageMaximum_U8_3(
    mlib_s32 *res32,
    const mlib_image *img)
{
/* src address */
	__m64 *sp, *sl;

/* src data */
	__m64 sd;

/* max values */
	__m64 max0, max1, max2, max3;

/* edge mask */
	mlib_s32 emask;

/* loop variables */
	mlib_s32 n1;

/* height of image */
	mlib_s32 height = mlib_ImageGetHeight(img);

/* elements to next row */
	mlib_s32 slb = mlib_ImageGetStride(img);
	mlib_s32 width = mlib_ImageGetWidth(img) * 3;

	mlib_u8 *dend;

	if (slb == width) {
		width *= height;
		height = 1;
	}

	sp = sl = (__m64 *) mlib_ImageGetData(img);

	max1 = _mm_set1_pi8(MLIB_U8_MIN);
	max2 = _mm_set1_pi8(MLIB_U8_MIN);
	max3 = _mm_set1_pi8(MLIB_U8_MIN);

	for (; height > 0; height--) {

		n1 = width;
		dend = (mlib_u8 *)sp + width;

		for (; n1 > 23; n1 -= 24) {
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U8(max1, max1, sd);
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U8(max2, max2, sd);
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U8(max3, max3, sd);
		}

		if (n1 > 0) {
			emask = (n1 > 7) ? 0xFF : (0xFF << (8 - n1));
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U8_M32(max1, max1, sd, emask);

			n1 = ((mlib_u8 *)dend - (mlib_u8 *)sp);
			if (n1 > 0) {
				emask = (n1 > 7) ? 0xFF : (0xFF << (8 - n1));
				sd = (*sp++);
				MLIB_M_IMAGE_MAXIMUM_U8_M32(max2, max2, sd,
				    emask);

				n1 = ((mlib_u8 *)dend - (mlib_u8 *)sp);
				if (n1 > 0) {
					emask = (0xFF << (8 - n1));
					sd = *sp;
					MLIB_M_IMAGE_MAXIMUM_U8_M32(max3, max3,
					    sd, emask);
				}
			}
		}

		sp = sl = (__m64 *) ((mlib_u8 *)sl + slb);
	}

	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max1, _mm_srli_si64(max2, 8),
	    mmx_write_64(0x00ffffffffffffffll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_slli_si64(max2, 16),
	    mmx_write_64(0x0000000000ff0000ll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max3, 16),
	    mmx_write_64(0x0000ffffffffffffll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_slli_si64(max3, 8),
	    mmx_write_64(0x0000000000ffff00ll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max0, 24),
	    mmx_write_64(0x000000ffff000000ll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max0, 24),
	    mmx_write_64(0x0000000000ffffffll));

	res32[0] = _mm_cvtsi64_si32(_mm_and_si64(max0,
				mmx_write_64(0x00000000000000ffll)));
	res32[1] =
	    _mm_cvtsi64_si32(_mm_and_si64(_mm_srli_si64(max0, 8),
	    mmx_write_64(0x00000000000000ffll)));
	res32[2] =
	    _mm_cvtsi64_si32(_mm_and_si64(_mm_srli_si64(max0, 16),
	    mmx_write_64(0x00000000000000ffll)));

	_mm_empty();
}
Example #13
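// Compiler IR test: _mm_srli_si64 should lower to the llvm.x86.mmx.psrli.q intrinsic.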
__m64 test_mm_srli_si64(__m64 a) {
  // CHECK-LABEL: test_mm_srli_si64
  // CHECK: call x86_mmx @llvm.x86.mmx.psrli.q
  return _mm_srli_si64(a, 3);
}
Example #14
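// Scale2x-style line doubler: writes two output lines per source line. Where
// the pixels above and below differ, each doubled pixel is replaced by its
// up/down neighbor when the adjacent left/right pixel matches it; otherwise
// the middle pixel is simply duplicated.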
void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
   const unsigned char
      *u = src + ((y-1) & 7)*sc2lines_width,
      *m = src + ((y+0) & 7)*sc2lines_width,
      *l = src + ((y+1) & 7)*sc2lines_width;

   for (unsigned i = 0; i < nPix; i += 4) {

      if (*(unsigned*)(u+i) ^ *(unsigned*)(l+i)) {

         __m64 mm = *(__m64*)(m+i-2);
         __m64 uu = *(__m64*)(u+i-2);
         __m64 ll = *(__m64*)(l+i-2);
         __m64 md = _mm_slli_si64(mm,8);
         __m64 mf = _mm_srli_si64(mm,8);
         __m64 maskall = _mm_or_si64(_mm_cmpeq_pi8(md,mf), _mm_cmpeq_pi8(uu,ll));

         __m64 e0, e1, v1, v2;

         e0 = _mm_cmpeq_pi8(md,uu);
         e0 = _mm_andnot_si64(maskall, e0);
         e0 = _mm_srli_si64(e0,16);
         e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

         e1 = _mm_cmpeq_pi8(mf,uu);
         e1 = _mm_andnot_si64(maskall, e1);
         e1 = _mm_srli_si64(e1,16);
         e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

         e0 = _mm_or_si64(e0, e1);

         v1 = _m_from_int(*(unsigned*)(m+i));
         v2 = _m_from_int(*(unsigned*)(u+i));
         v1 = _mm_unpacklo_pi8(v1,v1);
         v2 = _mm_unpacklo_pi8(v2,v2);

         *(__m64*)(dst1 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );

         e0 = _mm_cmpeq_pi8(md,ll);
         e0 = _mm_andnot_si64(maskall, e0);
         e0 = _mm_srli_si64(e0,16);
         e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

         e1 = _mm_cmpeq_pi8(mf,ll);
         e1 = _mm_andnot_si64(maskall, e1);
         e1 = _mm_srli_si64(e1,16);
         e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

         e0 = _mm_or_si64(e0, e1);

         v1 = _m_from_int(*(unsigned*)(m+i));
         v2 = _m_from_int(*(unsigned*)(l+i));
         v1 = _mm_unpacklo_pi8(v1,v1);
         v2 = _mm_unpacklo_pi8(v2,v2);

         *(__m64*)(dst2 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );

      } else {

         __m64 v1 = _m_from_int(*(unsigned*)(m+i));
         v1 = _mm_unpacklo_pi8(v1,v1);
         *(__m64*)(dst1 + 2*i) = v1;
         *(__m64*)(dst2 + 2*i) = v1;

      }

   }
}