コード例 #1
0
ファイル: mmx-builtins.c プロジェクト: nightwishud/accmut
__m64 test69(__m64 a) {
  // CHECK: psllq
  return _mm_slli_si64(a, 3);
}
コード例 #2
0
ファイル: video_conversion.c プロジェクト: UIKit0/bbc-ingex
void uyvy_to_yuv422(int width, int height, int shift_picture_down, const uint8_t *input, uint8_t *output)
{
	__m64 chroma_mask = _mm_set_pi8(255, 0, 255, 0, 255, 0, 255, 0);
	__m64 luma_mask = _mm_set_pi8(0, 255, 0, 255, 0, 255, 0, 255);
	const uint8_t *orig_input = input;
	uint8_t *y_comp = output;
	uint8_t *u_comp = output + width * height;
	uint8_t *v_comp = u_comp + (int)((width * height)/2);	// 4:2:2
	int i, j;

	// When preparing video for PAL DV50 encoding, the video must be shifted
	// down by one line to change the field order to be bottom-field-first
	int start_line = 0;
	if (shift_picture_down) {
		memset(y_comp, 0x10, width);		// write one line of black Y
		y_comp += width;
		memset(u_comp, 0x80, width/2);		// write one line of black U,V
		u_comp += width/2;
		memset(v_comp, 0x80, width/2);		// write one line of black U,V
		v_comp += width/2;
		start_line = 1;
	}

	/* Do the y component */
	for (j = start_line; j < height; j++)
	{
		// Consume 16 bytes of UYVY data per iteration (8 pixels worth)
		for (i = 0; i < width*2; i += 16)
		{
			//__m64 m1 = _mm_and_si64 (*(__m64 *)input, luma_mask);
			//__m64 m2 = _mm_and_si64 (*(__m64 *)(input+8), luma_mask);
			//__m64 m2 = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 0, 0);
			//*(__m64 *)y_comp = _mm_packs_pu16 (m2, m1);
			__m64 m0 = *(__m64 *)input;
			__m64 m2 = _mm_srli_si64(m0, 8);
			__m64 m3 = _mm_slli_si64(m0, 8);
			m3 = _mm_and_si64 (m3, chroma_mask);
			m2 = _mm_and_si64 (m2, luma_mask);
			m2 = _mm_or_si64 (m2, m3);
			m2= _mm_and_si64 (m2, luma_mask);
			m0 = m2;
			__m64 m1 = *(__m64 *)(input+8);
			m2 = _mm_srli_si64(m1, 8);
			m3 = _mm_slli_si64(m1, 8);
			m3 = _mm_and_si64 (m3, chroma_mask);
			m2 = _mm_and_si64 (m2, luma_mask);
			m2 = _mm_or_si64 (m2, m3);
			m2= _mm_and_si64 (m2, luma_mask);
			m1 = m2;
			*(__m64 *)y_comp = _mm_packs_pu16 (m0, m1);

			y_comp += 8;
			input += 16;
		}
	}
	/* Do the chroma components */
	input = orig_input;
	for (j = start_line; j < height; j++)
	{
		/* Process every line for yuv 4:2:2 */
		for (i = 0; i < width*2; i += 16)
		{
			__m64 m1 = _mm_unpacklo_pi8 (*(__m64 *)input, *(__m64 *)(input+8));
			__m64 m2 = _mm_unpackhi_pi8 (*(__m64 *)input, *(__m64 *)(input+8));

			__m64 m3 = _mm_unpacklo_pi8 (m1, m2);
			__m64 m4 = _mm_unpackhi_pi8 (m1, m2);
			//*(__m64 *)u_comp = _mm_unpacklo_pi8 (m1, m2);
			//*(__m64 *)v_comp = _mm_unpackhi_pi8 (m1, m2);
			memcpy (u_comp, &m3, 4);
			memcpy (v_comp, &m4, 4);
			u_comp += 4;
			v_comp += 4;
			input += 16;
		}
	}
	_mm_empty();        // Clear aliased fp register state
}
コード例 #3
0
ファイル: sha384.c プロジェクト: 0culus/beecrypt-4.2.1
void sha384Process(register sha384Param* sp)
{
	#ifdef OPTIMIZE_SSE2 
	
	# if defined(_MSC_VER) || defined (__INTEL_COMPILER)
	static const __m64 MASK = { 0x00FF00FF00FF00FF00 };
	# elif defined(__GNUC__)
	static const __m64 MASK = { 0x00FF00FF, 0x00FF00FF };
	# else
	#  error
	# endif

	__m64 a, b, c, d, e, f, g, h, temp;
	register       __m64 *w;
	register const __m64 *k;
	register byte t;

	w = (__m64*) sp->data;
	t = 16;
	while (t--)
	{
		temp = *w;
		*(w++) = _m_pxor(
				_mm_slli_si64(_m_pshufw(_m_pand(temp, MASK), 27), 8),
				_m_pshufw(_m_pand(_mm_srli_si64(temp, 8), MASK), 27)
			);
	}

	t = 64;
	while (t--)
	{
		temp = _mm_add_si64(_mm_add_si64(sig1(w[-2]), w[-7]), _mm_add_si64(sig0(w[-15]), w[-16]));
		*(w++) = temp;
	}

	w = (__m64*) sp->h;

	a = w[0]; b = w[1]; c = w[2]; d = w[3];
	e = w[4]; f = w[5]; g = w[6]; h = w[7];

	w = (__m64*) sp->data;
	k = (__m64*) SHA2_64BIT_K;

	#else

	register uint64_t a, b, c, d, e, f, g, h, temp;
	register       uint64_t *w;
	register const uint64_t *k;
	register byte t;

	# if WORDS_BIGENDIAN
	w = sp->data + 16;
	# else
	w = sp->data;
	t = 16;
	while (t--)
	{
		temp = swapu64(*w);
		*(w++) = temp;
	}
	# endif

	t = 64;
	while (t--)
	{
		temp = sig1(w[-2]) + w[-7] + sig0(w[-15]) + w[-16];
		*(w++) = temp;
	}

	w = sp->data;

	a = sp->h[0]; b = sp->h[1]; c = sp->h[2]; d = sp->h[3];
	e = sp->h[4]; f = sp->h[5]; g = sp->h[6]; h = sp->h[7];

	k = SHA2_64BIT_K;
	#endif

	ROUND(a,b,c,d,e,f,g,h,w[ 0],k[ 0]);
	ROUND(h,a,b,c,d,e,f,g,w[ 1],k[ 1]);
	ROUND(g,h,a,b,c,d,e,f,w[ 2],k[ 2]);
	ROUND(f,g,h,a,b,c,d,e,w[ 3],k[ 3]);
	ROUND(e,f,g,h,a,b,c,d,w[ 4],k[ 4]);
	ROUND(d,e,f,g,h,a,b,c,w[ 5],k[ 5]);
	ROUND(c,d,e,f,g,h,a,b,w[ 6],k[ 6]);
	ROUND(b,c,d,e,f,g,h,a,w[ 7],k[ 7]);
	ROUND(a,b,c,d,e,f,g,h,w[ 8],k[ 8]);
	ROUND(h,a,b,c,d,e,f,g,w[ 9],k[ 9]);
	ROUND(g,h,a,b,c,d,e,f,w[10],k[10]);
	ROUND(f,g,h,a,b,c,d,e,w[11],k[11]);
	ROUND(e,f,g,h,a,b,c,d,w[12],k[12]);
	ROUND(d,e,f,g,h,a,b,c,w[13],k[13]);
	ROUND(c,d,e,f,g,h,a,b,w[14],k[14]);
	ROUND(b,c,d,e,f,g,h,a,w[15],k[15]);
	ROUND(a,b,c,d,e,f,g,h,w[16],k[16]);
	ROUND(h,a,b,c,d,e,f,g,w[17],k[17]);
	ROUND(g,h,a,b,c,d,e,f,w[18],k[18]);
	ROUND(f,g,h,a,b,c,d,e,w[19],k[19]);
	ROUND(e,f,g,h,a,b,c,d,w[20],k[20]);
	ROUND(d,e,f,g,h,a,b,c,w[21],k[21]);
	ROUND(c,d,e,f,g,h,a,b,w[22],k[22]);
	ROUND(b,c,d,e,f,g,h,a,w[23],k[23]);
	ROUND(a,b,c,d,e,f,g,h,w[24],k[24]);
	ROUND(h,a,b,c,d,e,f,g,w[25],k[25]);
	ROUND(g,h,a,b,c,d,e,f,w[26],k[26]);
	ROUND(f,g,h,a,b,c,d,e,w[27],k[27]);
	ROUND(e,f,g,h,a,b,c,d,w[28],k[28]);
	ROUND(d,e,f,g,h,a,b,c,w[29],k[29]);
	ROUND(c,d,e,f,g,h,a,b,w[30],k[30]);
	ROUND(b,c,d,e,f,g,h,a,w[31],k[31]);
	ROUND(a,b,c,d,e,f,g,h,w[32],k[32]);
	ROUND(h,a,b,c,d,e,f,g,w[33],k[33]);
	ROUND(g,h,a,b,c,d,e,f,w[34],k[34]);
	ROUND(f,g,h,a,b,c,d,e,w[35],k[35]);
	ROUND(e,f,g,h,a,b,c,d,w[36],k[36]);
	ROUND(d,e,f,g,h,a,b,c,w[37],k[37]);
	ROUND(c,d,e,f,g,h,a,b,w[38],k[38]);
	ROUND(b,c,d,e,f,g,h,a,w[39],k[39]);
	ROUND(a,b,c,d,e,f,g,h,w[40],k[40]);
	ROUND(h,a,b,c,d,e,f,g,w[41],k[41]);
	ROUND(g,h,a,b,c,d,e,f,w[42],k[42]);
	ROUND(f,g,h,a,b,c,d,e,w[43],k[43]);
	ROUND(e,f,g,h,a,b,c,d,w[44],k[44]);
	ROUND(d,e,f,g,h,a,b,c,w[45],k[45]);
	ROUND(c,d,e,f,g,h,a,b,w[46],k[46]);
	ROUND(b,c,d,e,f,g,h,a,w[47],k[47]);
	ROUND(a,b,c,d,e,f,g,h,w[48],k[48]);
	ROUND(h,a,b,c,d,e,f,g,w[49],k[49]);
	ROUND(g,h,a,b,c,d,e,f,w[50],k[50]);
	ROUND(f,g,h,a,b,c,d,e,w[51],k[51]);
	ROUND(e,f,g,h,a,b,c,d,w[52],k[52]);
	ROUND(d,e,f,g,h,a,b,c,w[53],k[53]);
	ROUND(c,d,e,f,g,h,a,b,w[54],k[54]);
	ROUND(b,c,d,e,f,g,h,a,w[55],k[55]);
	ROUND(a,b,c,d,e,f,g,h,w[56],k[56]);
	ROUND(h,a,b,c,d,e,f,g,w[57],k[57]);
	ROUND(g,h,a,b,c,d,e,f,w[58],k[58]);
	ROUND(f,g,h,a,b,c,d,e,w[59],k[59]);
	ROUND(e,f,g,h,a,b,c,d,w[60],k[60]);
	ROUND(d,e,f,g,h,a,b,c,w[61],k[61]);
	ROUND(c,d,e,f,g,h,a,b,w[62],k[62]);
	ROUND(b,c,d,e,f,g,h,a,w[63],k[63]);
	ROUND(a,b,c,d,e,f,g,h,w[64],k[64]);
	ROUND(h,a,b,c,d,e,f,g,w[65],k[65]);
	ROUND(g,h,a,b,c,d,e,f,w[66],k[66]);
	ROUND(f,g,h,a,b,c,d,e,w[67],k[67]);
	ROUND(e,f,g,h,a,b,c,d,w[68],k[68]);
	ROUND(d,e,f,g,h,a,b,c,w[69],k[69]);
	ROUND(c,d,e,f,g,h,a,b,w[70],k[70]);
	ROUND(b,c,d,e,f,g,h,a,w[71],k[71]);
	ROUND(a,b,c,d,e,f,g,h,w[72],k[72]);
	ROUND(h,a,b,c,d,e,f,g,w[73],k[73]);
	ROUND(g,h,a,b,c,d,e,f,w[74],k[74]);
	ROUND(f,g,h,a,b,c,d,e,w[75],k[75]);
	ROUND(e,f,g,h,a,b,c,d,w[76],k[76]);
	ROUND(d,e,f,g,h,a,b,c,w[77],k[77]);
	ROUND(c,d,e,f,g,h,a,b,w[78],k[78]);
	ROUND(b,c,d,e,f,g,h,a,w[79],k[79]);

	#ifdef OPTIMIZE_SSE2
	w = (__m64*) sp->h;
	w[0] = _mm_add_si64(w[0], a);
	w[1] = _mm_add_si64(w[1], b);
	w[2] = _mm_add_si64(w[2], c);
	w[3] = _mm_add_si64(w[3], d);
	w[4] = _mm_add_si64(w[4], e);
	w[5] = _mm_add_si64(w[5], f);
	w[6] = _mm_add_si64(w[6], g);
	w[7] = _mm_add_si64(w[7], h);
	_mm_empty();
	#else
	sp->h[0] += a;
	sp->h[1] += b;
	sp->h[2] += c;
	sp->h[3] += d;
	sp->h[4] += e;
	sp->h[5] += f;
	sp->h[6] += g;
	sp->h[7] += h;
	#endif
}
コード例 #4
0
void
mlib_m_ImageMaximum_U8_3(
    mlib_s32 *res32,
    const mlib_image *img)
{
/* src address */
	__m64 *sp, *sl;

/* src data */
	__m64 sd;

/* max values */
	__m64 max0, max1, max2, max3;

/* edge mask */
	mlib_s32 emask;

/* loop variables */
	mlib_s32 n1;

/* height of image */
	mlib_s32 height = mlib_ImageGetHeight(img);

/* elements to next row */
	mlib_s32 slb = mlib_ImageGetStride(img);
	mlib_s32 width = mlib_ImageGetWidth(img) * 3;

	mlib_u8 *dend;

	if (slb == width) {
		width *= height;
		height = 1;
	}

	sp = sl = (__m64 *) mlib_ImageGetData(img);

	max1 = _mm_set1_pi8(MLIB_U8_MIN);
	max2 = _mm_set1_pi8(MLIB_U8_MIN);
	max3 = _mm_set1_pi8(MLIB_U8_MIN);

	for (; height > 0; height--) {

		n1 = width;
		dend = (mlib_u8 *)sp + width;

		for (; n1 > 23; n1 -= 24) {
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U8(max1, max1, sd);
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U8(max2, max2, sd);
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U8(max3, max3, sd);
		}

		if (n1 > 0) {
			emask = (n1 > 7) ? 0xFF : (0xFF << (8 - n1));
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U8_M32(max1, max1, sd, emask);

			n1 = ((mlib_u8 *)dend - (mlib_u8 *)sp);
			if (n1 > 0) {
				emask = (n1 > 7) ? 0xFF : (0xFF << (8 - n1));
				sd = (*sp++);
				MLIB_M_IMAGE_MAXIMUM_U8_M32(max2, max2, sd,
				    emask);

				n1 = ((mlib_u8 *)dend - (mlib_u8 *)sp);
				if (n1 > 0) {
					emask = (0xFF << (8 - n1));
					sd = *sp;
					MLIB_M_IMAGE_MAXIMUM_U8_M32(max3, max3,
					    sd, emask);
				}
			}
		}

		sp = sl = (__m64 *) ((mlib_u8 *)sl + slb);
	}

	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max1, _mm_srli_si64(max2, 8),
	    mmx_write_64(0x00ffffffffffffffll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_slli_si64(max2, 16),
	    mmx_write_64(0x0000000000ff0000ll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max3, 16),
	    mmx_write_64(0x0000ffffffffffffll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_slli_si64(max3, 8),
	    mmx_write_64(0x0000000000ffff00ll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max0, 24),
	    mmx_write_64(0x000000ffff000000ll));
	MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max0, 24),
	    mmx_write_64(0x0000000000ffffffll));

	res32[0] = _mm_cvtsi64_si32(_mm_and_si64(max0,
				mmx_write_64(0x00000000000000ffll)));
	res32[1] =
	    _mm_cvtsi64_si32(_mm_and_si64(_mm_srli_si64(max0, 8),
	    mmx_write_64(0x00000000000000ffll)));
	res32[2] =
	    _mm_cvtsi64_si32(_mm_and_si64(_mm_srli_si64(max0, 16),
	    mmx_write_64(0x00000000000000ffll)));

	_mm_empty();
}
コード例 #5
0
__m64 test_mm_slli_si64(__m64 a) {
  // CHECK-LABEL: test_mm_slli_si64
  // CHECK: call x86_mmx @llvm.x86.mmx.pslli.q
  return _mm_slli_si64(a, 3);
}
コード例 #6
0
ファイル: dxr_advm.cpp プロジェクト: VWarlock/pentevo
void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
   const unsigned char
      *u = src + ((y-1) & 7)*sc2lines_width,
      *m = src + ((y+0) & 7)*sc2lines_width,
      *l = src + ((y+1) & 7)*sc2lines_width;

   for (unsigned i = 0; i < nPix; i += 4) {

      if (*(unsigned*)(u+i) ^ *(unsigned*)(l+i)) {

         __m64 mm = *(__m64*)(m+i-2);
         __m64 uu = *(__m64*)(u+i-2);
         __m64 ll = *(__m64*)(l+i-2);
         __m64 md = _mm_slli_si64(mm,8);
         __m64 mf = _mm_srli_si64(mm,8);
         __m64 maskall = _mm_or_si64(_mm_cmpeq_pi8(md,mf), _mm_cmpeq_pi8(uu,ll));

         __m64 e0, e1, v1, v2;

         e0 = _mm_cmpeq_pi8(md,uu);
         e0 = _mm_andnot_si64(maskall, e0);
         e0 = _mm_srli_si64(e0,16);
         e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

         e1 = _mm_cmpeq_pi8(mf,uu);
         e1 = _mm_andnot_si64(maskall, e1);
         e1 = _mm_srli_si64(e1,16);
         e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

         e0 = _mm_or_si64(e0, e1);

         v1 = _m_from_int(*(unsigned*)(m+i));
         v2 = _m_from_int(*(unsigned*)(u+i));
         v1 = _mm_unpacklo_pi8(v1,v1);
         v2 = _mm_unpacklo_pi8(v2,v2);

         *(__m64*)(dst1 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );

         e0 = _mm_cmpeq_pi8(md,ll);
         e0 = _mm_andnot_si64(maskall, e0);
         e0 = _mm_srli_si64(e0,16);
         e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

         e1 = _mm_cmpeq_pi8(mf,ll);
         e1 = _mm_andnot_si64(maskall, e1);
         e1 = _mm_srli_si64(e1,16);
         e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

         e0 = _mm_or_si64(e0, e1);

         v1 = _m_from_int(*(unsigned*)(m+i));
         v2 = _m_from_int(*(unsigned*)(l+i));
         v1 = _mm_unpacklo_pi8(v1,v1);
         v2 = _mm_unpacklo_pi8(v2,v2);

         *(__m64*)(dst2 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );

      } else {

         __m64 v1 = _m_from_int(*(unsigned*)(m+i));
         v1 = _mm_unpacklo_pi8(v1,v1);
         *(__m64*)(dst1 + 2*i) = v1;
         *(__m64*)(dst2 + 2*i) = v1;

      }

   }
}