Example #1
int oneThread(int threadId)
{
	int *aa;
	int *bb;
	int k;
	int itr;

	aa = (int *)_mm_malloc(sizeof(int)*ARRAY_SIZE, 16);
	bb = (int *)_mm_malloc(sizeof(int)*ARRAY_SIZE, 16);

	memset(&aa[0], 1, ARRAY_SIZE*4);	/* note: memset fills bytes, so each int becomes 0x01010101, not 1 */
	memset(&bb[0], 2, ARRAY_SIZE*4);	/* likewise, each int becomes 0x02020202 */

  __m128i a0,a1,a2,a3,b0,b1,b2,b3;
  __m128i a4,a5,a6,a7,b4,b5,b6,b7;
	__m128i c0,c1,c2,c3;
	__m128i c4,c5,c6,c7;
	__m128i cc;
	cc = _mm_set_epi32 (0, 0, 0, 0);

	for (k = 0; k < REPS; k++) 
	{
		for (itr = 0; itr<ARRAY_SIZE; itr+=32)
		{
			a0 = _mm_load_si128((__m128i*)&aa[itr]);
			a1 = _mm_load_si128((__m128i*)&aa[itr+4]);	
			a2 = _mm_load_si128((__m128i*)&aa[itr+8]);	
			a3 = _mm_load_si128((__m128i*)&aa[itr+12]);	
			a4 = _mm_load_si128((__m128i*)&aa[itr+16]);
			a5 = _mm_load_si128((__m128i*)&aa[itr+20]);	
			a6 = _mm_load_si128((__m128i*)&aa[itr+24]);	
			a7 = _mm_load_si128((__m128i*)&aa[itr+28]);	
			b0 = _mm_load_si128((__m128i*)&bb[itr]);
			b1 = _mm_load_si128((__m128i*)&bb[itr+4]);	
			b2 = _mm_load_si128((__m128i*)&bb[itr+8]);	
			b3 = _mm_load_si128((__m128i*)&bb[itr+12]);	
			b4 = _mm_load_si128((__m128i*)&bb[itr+16]);
			b5 = _mm_load_si128((__m128i*)&bb[itr+20]);	
			b6 = _mm_load_si128((__m128i*)&bb[itr+24]);	
			b7 = _mm_load_si128((__m128i*)&bb[itr+28]);	

			c0 = _mm_mul_epi32(a0, b0);
			c1 = _mm_mul_epi32(a1, b1);
			c2 = _mm_mul_epi32(a2, b2);
			c3 = _mm_mul_epi32(a3, b3);
			c4 = _mm_mul_epi32(a4, b4);
			c5 = _mm_mul_epi32(a5, b5);
			c6 = _mm_mul_epi32(a6, b6);
			c7 = _mm_mul_epi32(a7, b7);

			c0 = _mm_add_epi32(c0,c1);
			c1 = _mm_add_epi32(c2,c3);
			c2 = _mm_add_epi32(c4,c5);
			c3 = _mm_add_epi32(c6,c7);
			c0 = _mm_add_epi32(c0,c1);
			c1 = _mm_add_epi32(c2,c3);
			c0 = _mm_add_epi32(c0,c1);
			cc = _mm_add_epi32(cc,c0);

		}	
	}
	cc = _mm_hadd_epi32(cc,cc);
	cc = _mm_hadd_epi32(cc,cc);

	int count =0;
	count = _mm_cvtsi128_si32(cc) ;	

	_mm_free(aa);	/* memory from _mm_malloc must be released with _mm_free */
	_mm_free(bb);

	return count;
}
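The example above assumes ARRAY_SIZE and REPS are defined elsewhere. A minimal, hypothetical driver for building it standalone could look like this (the macro values and main() are illustrative, not part of the original):

#include <stdio.h>
#include <string.h>
#include <smmintrin.h>		/* SSE4.1: _mm_mul_epi32; also pulls in SSSE3 _mm_hadd_epi32 */

#define ARRAY_SIZE 4096		/* assumed: a multiple of 32 so the unrolled loop covers the array */
#define REPS 1000		/* assumed benchmark repetition count */

/* ... oneThread() as above ... */

int main(void)
{
	printf("count = %d\n", oneThread(0));	/* build with e.g. -msse4.1 */
	return 0;
}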
Example #2
PRBool
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                   const gfxImageSurface* whiteSurf)
{
    gfxIntSize size = blackSurf->GetSize();

    if (size != whiteSurf->GetSize() ||
            (blackSurf->Format() != gfxASurface::ImageFormatARGB32 &&
             blackSurf->Format() != gfxASurface::ImageFormatRGB24) ||
            (whiteSurf->Format() != gfxASurface::ImageFormatARGB32 &&
             whiteSurf->Format() != gfxASurface::ImageFormatRGB24))
        return PR_FALSE;

    blackSurf->Flush();
    whiteSurf->Flush();

    unsigned char* blackData = blackSurf->Data();
    unsigned char* whiteData = whiteSurf->Data();

    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
            (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
        // Cannot keep these in alignment.
        return PR_FALSE;
    }

    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);

    for (PRInt32 i = 0; i < size.height; ++i) {
        PRInt32 j = 0;
        // Loop single pixels until at 4 byte alignment.
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
            *((PRUint32*)blackData) =
                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
                             *reinterpret_cast<PRUint32*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // This extra loop allows the compiler to do some more clever register
        // management and makes it about 5% faster than with only the 4-pixels-
        // at-a-time loop.
        for (; j < size.width - 8; j += 8) {
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));

            // Execute the same instructions as described in RecoverPixel, only
            // using an SSE2 packed saturated subtract.
            white1 = _mm_subs_epu8(white1, black1);
            white2 = _mm_subs_epu8(white2, black2);
            white1 = _mm_subs_epu8(greenMask, white1);
            white2 = _mm_subs_epu8(greenMask, white2);
            // Producing the final black pixel in an XMM register and storing
            // that is actually faster than doing a masked store since that
            // does an unaligned storage. We have the black pixel in a register
            // anyway.
            black1 = _mm_andnot_si128(alphaMask, black1);
            black2 = _mm_andnot_si128(alphaMask, black2);
            white1 = _mm_slli_si128(white1, 2);
            white2 = _mm_slli_si128(white2, 2);
            white1 = _mm_and_si128(alphaMask, white1);
            white2 = _mm_and_si128(alphaMask, white2);
            black1 = _mm_or_si128(white1, black1);
            black2 = _mm_or_si128(white2, black2);

            _mm_store_si128((__m128i*)blackData, black1);
            _mm_store_si128((__m128i*)(blackData + 16), black2);
            blackData += 32;
            whiteData += 32;
        }
        for (; j < size.width - 4; j += 4) {
            __m128i black = _mm_load_si128((__m128i*)blackData);
            __m128i white = _mm_load_si128((__m128i*)whiteData);

            white = _mm_subs_epu8(white, black);
            white = _mm_subs_epu8(greenMask, white);
            black = _mm_andnot_si128(alphaMask, black);
            white = _mm_slli_si128(white, 2);
            white = _mm_and_si128(alphaMask, white);
            black = _mm_or_si128(white, black);
            _mm_store_si128((__m128i*)blackData, black);
            blackData += 16;
            whiteData += 16;
        }
        // Loop single pixels until we're done.
        while (j < size.width) {
            *((PRUint32*)blackData) =
                RecoverPixel(*reinterpret_cast<PRUint32*>(blackData),
                             *reinterpret_cast<PRUint32*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        blackData += blackSurf->Stride() - j * 4;
        whiteData += whiteSurf->Stride() - j * 4;
    }

    blackSurf->MarkDirty();

    return PR_TRUE;
}
Example #3
static void
rfx_dwt_2d_decode_block_horiz_sse2(INT16* l, INT16* h, INT16* dst, int subband_width)
{
	int y, n;
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;
	int first;
	int last;
	__m128i l_n;
	__m128i h_n;
	__m128i h_n_m;
	__m128i tmp_n;
	__m128i dst_n;
	__m128i dst_n_p;
	__m128i dst1;
	__m128i dst2;

	for (y = 0; y < subband_width; y++)
	{
		/* Even coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
			
			l_n = _mm_load_si128((__m128i*) l_ptr);

			h_n = _mm_load_si128((__m128i*) h_ptr);
			h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));

			if (n == 0)
			{
				first = _mm_extract_epi16(h_n_m, 1);
				h_n_m = _mm_insert_epi16(h_n_m, first, 0);
			}
			
			tmp_n = _mm_add_epi16(h_n, h_n_m);
			tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			
			dst_n = _mm_sub_epi16(l_n, tmp_n);
			
			_mm_store_si128((__m128i*) l_ptr, dst_n);
			
			l_ptr += 8;
			h_ptr += 8;
		}

		l_ptr -= subband_width;
		h_ptr -= subband_width;
		
		/* Odd coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
			
			h_n = _mm_load_si128((__m128i*) h_ptr);
			
			h_n = _mm_slli_epi16(h_n, 1);
			
			dst_n = _mm_load_si128((__m128i*) (l_ptr));
			dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));

			if (n == subband_width - 8)
			{
				last = _mm_extract_epi16(dst_n_p, 6);
				dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
			}
			
			tmp_n = _mm_add_epi16(dst_n_p, dst_n);
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			
			tmp_n = _mm_add_epi16(tmp_n, h_n);
			
			dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
			dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
			
			_mm_store_si128((__m128i*) dst_ptr, dst1);
			_mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
			
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
	}
}
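Per row, the even/odd passes above are equivalent to the following scalar form (a sketch derived from the formulas in the comments; boundary samples are mirrored the same way the extract/insert fix-ups do):

/* even coefficients: dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1), with h[-1] mirrored to h[0] */
for (n = 0; n < subband_width; n++)
{
	int h_prev = (n == 0) ? h[0] : h[n - 1];
	dst[2 * n] = l[n] - ((h_prev + h[n] + 1) >> 1);
}
/* odd coefficients: dst[2n+1] = (h[n] << 1) + ((dst[2n] + dst[2n+2]) >> 1), last neighbour mirrored */
for (n = 0; n < subband_width; n++)
{
	int d_next = (n == subband_width - 1) ? dst[2 * n] : dst[2 * n + 2];
	dst[2 * n + 1] = (h[n] << 1) + ((dst[2 * n] + d_next) >> 1);
}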
Example #4
mlib_status
mlib_VideoColorBGR2JFIFYCC444_S16_aligned(
	mlib_s16 *y,
	mlib_s16 *cb,
	mlib_s16 *cr,
	const mlib_s16 *bgr,
	mlib_s32 n)
{
	/* 0.299*32768 */
	const __m128i x_c11 = _mm_set1_epi16(9798);

	/* 0.587*32768 */
	const __m128i x_c12 = _mm_set1_epi16(19235);

	/* 0.114*32768 */
	const __m128i x_c13 = _mm_set1_epi16(3735);

	/* -0.16874*32768 */
	const __m128i x_c21 = _mm_set1_epi16(-5529);

	/* -0.33126*32768 */
	const __m128i x_c22 = _mm_set1_epi16(-10855);

	/* 0.5*32768 */
	const __m128i x_c23 = _mm_set1_epi16(16384);

	/* 0.5*32768 */
	const __m128i x_c31 = x_c23;

	/* -0.41869*32768 */
	const __m128i x_c32 = _mm_set1_epi16(-13720);

	/* -0.08131*32768 */
	const __m128i x_c33 = _mm_set1_epi16(-2664);

	/* 2048, stored pre-scaled as 2048 << 2 */
	const __m128i x_coff = _mm_set1_epi16(2048 << 2);

	const __m128i x_zero = _mm_setzero_si128();

	__m128i x_bgr0, x_bgr1, x_bgr2, x_r, x_g, x_b;
	__m128i x_y, x_cb, x_cr;
	__m128i x_t0, x_t1, x_t2, x_t3, x_t4, x_t5;
	__m128i *px_y, *px_cb, *px_cr, *px_bgr;
	mlib_d64 fr, fg, fb, fy, fcb, fcr;
	mlib_s32 i;

	px_y = (__m128i *)y;
	px_cb = (__m128i *)cb;
	px_cr = (__m128i *)cr;
	px_bgr = (__m128i *)bgr;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (i = 0; i <= (n - 8); i += 8) {
		x_bgr0 = _mm_load_si128(px_bgr++);
		x_bgr0 = _mm_slli_epi16(x_bgr0, 3);
		x_bgr1 = _mm_load_si128(px_bgr++);
		x_bgr1 = _mm_slli_epi16(x_bgr1, 3);
		x_bgr2 = _mm_load_si128(px_bgr++);
		x_bgr2 = _mm_slli_epi16(x_bgr2, 3);
		SeparateBGR48_S16;

		x_t0 = _mm_mulhi_epi16(x_r, x_c11);
		x_t1 = _mm_mulhi_epi16(x_g, x_c12);
		x_t2 = _mm_mulhi_epi16(x_b, x_c13);
		x_y = _mm_add_epi16(x_t0, x_t1);
		x_y = _mm_add_epi16(x_y, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c21);
		x_t1 = _mm_mulhi_epi16(x_g, x_c22);
		x_t2 = _mm_mulhi_epi16(x_b, x_c23);
		x_cb = _mm_add_epi16(x_t0, x_t1);
		x_cb = _mm_add_epi16(x_cb, x_coff);
		x_cb = _mm_add_epi16(x_cb, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c31);
		x_t1 = _mm_mulhi_epi16(x_g, x_c32);
		x_t2 = _mm_mulhi_epi16(x_b, x_c33);
		x_cr = _mm_add_epi16(x_t0, x_t1);
		x_cr = _mm_add_epi16(x_cr, x_coff);
		x_cr = _mm_add_epi16(x_cr, x_t2);

		/* save */
		x_y = _mm_srli_epi16(x_y, 2);
		x_cb = _mm_srli_epi16(x_cb, 2);
		x_cr = _mm_srli_epi16(x_cr, 2);
		_mm_store_si128(px_y++, x_y);
		_mm_store_si128(px_cb++, x_cb);
		_mm_store_si128(px_cr++, x_cr);
	}

	if (i <= (n - 4)) {
		x_bgr0 = _mm_load_si128(px_bgr++);
		x_bgr0 = _mm_slli_epi16(x_bgr0, 3);
		x_bgr1 = _mm_loadl_epi64(px_bgr);
		x_bgr1 = _mm_slli_epi16(x_bgr1, 3);
		px_bgr = (__m128i *)((__m64 *)px_bgr + 1);
		SeparateBGR24_S16;

		x_t0 = _mm_mulhi_epi16(x_r, x_c11);
		x_t1 = _mm_mulhi_epi16(x_g, x_c12);
		x_t2 = _mm_mulhi_epi16(x_b, x_c13);
		x_y = _mm_add_epi16(x_t0, x_t1);
		x_y = _mm_add_epi16(x_y, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c21);
		x_t1 = _mm_mulhi_epi16(x_g, x_c22);
		x_t2 = _mm_mulhi_epi16(x_b, x_c23);
		x_cb = _mm_add_epi16(x_t0, x_t1);
		x_cb = _mm_add_epi16(x_cb, x_coff);
		x_cb = _mm_add_epi16(x_cb, x_t2);

		x_t0 = _mm_mulhi_epi16(x_r, x_c31);
		x_t1 = _mm_mulhi_epi16(x_g, x_c32);
		x_t2 = _mm_mulhi_epi16(x_b, x_c33);
		x_cr = _mm_add_epi16(x_t0, x_t1);
		x_cr = _mm_add_epi16(x_cr, x_coff);
		x_cr = _mm_add_epi16(x_cr, x_t2);

		/* save */
		x_y = _mm_srli_epi16(x_y, 2);
		x_cb = _mm_srli_epi16(x_cb, 2);
		x_cr = _mm_srli_epi16(x_cr, 2);
		_mm_storel_epi64(px_y, x_y);
		px_y = (__m128i *)((__m64 *)px_y + 1);
		_mm_storel_epi64(px_cb, x_cb);
		px_cb = (__m128i *)((__m64 *)px_cb + 1);
		_mm_storel_epi64(px_cr, x_cr);
		px_cr = (__m128i *)((__m64 *)px_cr + 1);

		i += 4;
	}

	for (; i <= (n - 1); i++) {
		fb = bgr[3 * i];
		fg = bgr[3 * i + 1];
		fr = bgr[3 * i + 2];

		fy = 0.29900f * fr + 0.58700f * fg + 0.11400f * fb;
		fcb = -0.16874f * fr - 0.33126f * fg + 0.50000f * fb + 2048;
		fcr = 0.50000f * fr - 0.41869f * fg - 0.08131f * fb + 2048;

		y[i] = (mlib_s16)fy;
		cb[i] = (mlib_s16)fcb;
		cr[i] = (mlib_s16)fcr;
	}

	return (MLIB_SUCCESS);
}
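The fixed-point scaling in the vector loops works out as follows (a worked sketch; _mm_mulhi_epi16 returns the high 16 bits of the signed 16x16 product, i.e. (a*b) >> 16):

/*
 * The input is pre-shifted left by 3, so for the Y term:
 *   ((r << 3) * 9798) >> 16  ~=  r * 0.299 * 4
 * and x_y therefore accumulates 4 * (0.299 r + 0.587 g + 0.114 b).
 * x_coff = 2048 << 2 adds the chroma offset in the same 4x domain;
 * the final _mm_srli_epi16(..., 2) removes the factor of 4, matching the
 * scalar tail loop (e.g. fcb = ... + 2048).
 */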
Example #5
int crypto_hash(unsigned char *out,const unsigned char *in,unsigned long long inlen)
{
	hashState state;
	u_int32_t *data32, *data32_end;
	u_int64_t *data64;
	unsigned char *lastPartP, *data8_end;

	#ifdef __x86_64__
	u_int64_t i, iterations, counter, databyteLength;
	#else
	int i, iterations, counter, databyteLength;
	#endif
	// This could be a compile-time (static) check
	if (crypto_hash_BYTES != 32)
		return -1;
	
	databyteLength = inlen; // Want it to be the native data size, and not bigger.

	#ifdef __SSE__
	// Use SSE here, if it is available
	_mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[0], _mm_load_si128((__m128i *) &i256p2[0]));
	_mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[4], _mm_load_si128((__m128i *) &i256p2[4]));
	_mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[8], _mm_load_si128((__m128i *) &i256p2[8]));
	_mm_store_si128((__m128i *) &hashState256_(state).DoublePipe[12], _mm_load_si128((__m128i *) &i256p2[12]));
	#elif defined ( __x86_64__ )
	// Or 64-bit writes on a 64-bit system (not possible on 32-bit x86)
	hashState256_(state).DoublePipe[0] = i256p2[0];
	hashState256_(state).DoublePipe[2] = i256p2[2];
	hashState256_(state).DoublePipe[4] = i256p2[4];
	hashState256_(state).DoublePipe[6] = i256p2[6];
	hashState256_(state).DoublePipe[8] = i256p2[8];
	hashState256_(state).DoublePipe[10] = i256p2[10];
	hashState256_(state).DoublePipe[12] = i256p2[12];
	hashState256_(state).DoublePipe[14] = i256p2[14];
	#else
	// Fallback
	memcpy(hashState256_(state).DoublePipe, i256p2,  16 * sizeof(u_int32_t));
	#endif
	
	data32 = (u_int32_t *) in;
	iterations = databyteLength / BlueMidnightWish256_BLOCK_SIZE;
	data32_end = data32 + iterations*16;
	if(iterations > 0)
		Compress256(data32, data32_end, &state);
	
	databyteLength -= BlueMidnightWish256_BLOCK_SIZE * iterations;
	data64 = (u_int64_t *)hashState256_(state).LastPart;
	
	if (databyteLength < 56) {
		#ifdef __SSE__
		// Use SSE here, if it is available
		__m128i zero = _mm_setzero_si128();
		_mm_store_si128((__m128i *) &data64[0], zero);
		_mm_store_si128((__m128i *) &data64[2], zero);
		_mm_store_si128((__m128i *) &data64[4], zero);
		_mm_store_si128((__m128i *) &data64[6], zero);
		#elif defined ( __x86_64__ )
		// Or 64-bit writes on a 64-bit system (not possible on 32-bit x86)
		data64[0] = 0;
		data64[1] = 0;
		data64[2] = 0;
		data64[3] = 0;
		data64[4] = 0;
		data64[5] = 0;
		data64[6] = 0;
		data64[7] = 0;
		#else
		// Fallback
		memset( data64 + (databyteLength >> 4), 0x00, BlueMidnightWish256_BLOCK_SIZE - ((databyteLength >> 4) << 3));
		#endif
		
		
	}
Example #6
/* Deinterleaves the 3 streams from the input (systematic and 2 parity bits) into 
 * 3 buffers ready to be used by compute_gamma() 
 */
void deinterleave_input(srslte_tdec_sse_t *h, int16_t *input, uint32_t long_cb) {
  uint32_t i;
 
  __m128i *inputPtr = (__m128i*) input; 
  __m128i in0, in1, in2;
  __m128i s0, s1, s2, s;
  __m128i p00, p01, p02, p0;
  __m128i p10, p11, p12, p1;
  
  __m128i *sysPtr = (__m128i*) h->syst; 
  __m128i *pa0Ptr = (__m128i*) h->parity0; 
  __m128i *pa1Ptr = (__m128i*) h->parity1; 
  
  // pick bits 0, 3, 6 from 1st word
  __m128i s0_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0);
  // pick bits 1, 4, 7 from 2nd word
  __m128i s1_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff);
  // pick bits 2, 5 from 3rd word
  __m128i s2_mask = _mm_set_epi8(11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);

  // pick bits 1, 4, 7 from 1st word
  __m128i p00_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,15,14,9,8,3,2);
  // pick bits 2, 5 from 2nd word
  __m128i p01_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4,0xff,0xff,0xff,0xff,0xff,0xff);
  // pick bits 0, 3, 6 from 3rd word
  __m128i p02_mask = _mm_set_epi8(13,12,7,6,1,0,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
  
  // pick bits 2, 5 from 1st word
  __m128i p10_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,11,10,5,4);
  // pick bits 0, 3, 6 from 2nd word
  __m128i p11_mask = _mm_set_epi8(0xff,0xff,0xff,0xff,0xff,0xff,13,12,7,6,1,0,0xff,0xff,0xff,0xff);
  // pick bits 1, 4, 7 from 3rd word
  __m128i p12_mask = _mm_set_epi8(15,14,9,8,3,2,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
      
  // Split systematic and parity bits
  for (i = 0; i < long_cb/8; i++) {
        
    in0 = _mm_load_si128(inputPtr); inputPtr++; 
    in1 = _mm_load_si128(inputPtr); inputPtr++;   
    in2 = _mm_load_si128(inputPtr); inputPtr++;
    
    /* Deinterleave Systematic bits */
    s0 = _mm_shuffle_epi8(in0, s0_mask);
    s1 = _mm_shuffle_epi8(in1, s1_mask);
    s2 = _mm_shuffle_epi8(in2, s2_mask);    
    s = _mm_or_si128(s0, s1);
    s = _mm_or_si128(s, s2);

    _mm_store_si128(sysPtr, s);
    sysPtr++;

    /* Deinterleave parity 0 bits */
    p00 = _mm_shuffle_epi8(in0, p00_mask);
    p01 = _mm_shuffle_epi8(in1, p01_mask);
    p02 = _mm_shuffle_epi8(in2, p02_mask);    
    p0 = _mm_or_si128(p00, p01);
    p0 = _mm_or_si128(p0, p02);
    
    _mm_store_si128(pa0Ptr, p0);
    pa0Ptr++;

    /* Deinterleave parity 1 bits */
    p10 = _mm_shuffle_epi8(in0, p10_mask);
    p11 = _mm_shuffle_epi8(in1, p11_mask);
    p12 = _mm_shuffle_epi8(in2, p12_mask);    
    p1 = _mm_or_si128(p10, p11);
    p1 = _mm_or_si128(p1, p12);

    _mm_store_si128(pa1Ptr, p1);
    pa1Ptr++;    

  }
  
  for (i = 0; i < 3; i++) {
    h->syst[i+long_cb]    = input[3*long_cb + 2*i];
    h->parity0[i+long_cb] = input[3*long_cb + 2*i + 1];
  }
  for (i = 0; i < 3; i++) {
    h->app2[i+long_cb]    = input[3*long_cb + 6 + 2*i];
    h->parity1[i+long_cb] = input[3*long_cb + 6 + 2*i + 1];
  }

}
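For the vectorized body, the shuffle masks implement the same de-interleaving as this scalar loop (a sketch using the field names from the function; the tail words after the loop are handled separately, as in the original):

for (i = 0; i < long_cb; i++) {
  h->syst[i]    = input[3 * i];       /* systematic bit */
  h->parity0[i] = input[3 * i + 1];   /* first parity stream */
  h->parity1[i] = input[3 * i + 2];   /* second parity stream */
}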
Example #7
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;
   
   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;
   
   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
Example #8
void GetMinMaxColors_Intrinsics( const byte *colorBlock, byte *minColor, byte *maxColor )
{
    __m128i t0, t1, t3, t4, t6, t7;

    // get bounding box
    // ----------------

    // load the first row
    t0 = _mm_load_si128 ( (__m128i*) colorBlock );
    t1 = _mm_load_si128 ( (__m128i*) colorBlock );

    __m128i t16 = _mm_load_si128 ( (__m128i*) (colorBlock+16) );
    // Minimum of Packed Unsigned Byte Integers
    t0 = _mm_min_epu8 ( t0, t16);
    // Maximum of Packed Unsigned Byte Integers
    t1 = _mm_max_epu8 ( t1, t16);

    __m128i t32 = _mm_load_si128 ( (__m128i*) (colorBlock+32) );
    t0 = _mm_min_epu8 ( t0, t32);
    t1 = _mm_max_epu8 ( t1, t32);

    __m128i t48 = _mm_load_si128 ( (__m128i*) (colorBlock+48) );
    t0 = _mm_min_epu8 ( t0, t48);
    t1 = _mm_max_epu8 ( t1, t48);

    // Shuffle Packed Doublewords
    t3 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t4 = _mm_shuffle_epi32( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );

    t0 = _mm_min_epu8 ( t0, t3);
    t1 = _mm_max_epu8 ( t1, t4);

    // Shuffle Packed Low Words
    t6 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t7 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );

    t0 = _mm_min_epu8 ( t0, t6);
    t1 = _mm_max_epu8 ( t1, t7);

    // inset the bounding box
    // ----------------------

    // Unpack Low Data
    //__m128i t66 = _mm_set1_epi8( 0 );
    __m128i t66 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_0 );
    t0 = _mm_unpacklo_epi8(t0, t66);
    t1 = _mm_unpacklo_epi8(t1, t66);

    // copy (movdqa)
    //__m128i t2 = _mm_load_si128 ( &t1 );
    __m128i t2 = t1;

    // Subtract Packed Integers
    t2 = _mm_sub_epi16(t2, t0);

    // Shift Packed Data Right Logical
    t2 = _mm_srli_epi16(t2, INSET_SHIFT);

    // Add Packed Integers
    t0 = _mm_add_epi16(t0, t2);

    t1 = _mm_sub_epi16(t1, t2);

    // Pack with Unsigned Saturation
    t0 = _mm_packus_epi16(t0, t0);
    t1 = _mm_packus_epi16(t1, t1);

    // store bounding box extents
    // --------------------------
    _mm_store_si128 ( (__m128i*) minColor, t0 );
    _mm_store_si128 ( (__m128i*) maxColor, t1 );
}
Example #9
int haraka512256(unsigned char *hash, const unsigned char *msg) {
	// stuff we need
	int i, j;
	__m128i s[4], tmp, rcon;
	__m128i MSB64 = _mm_set_epi32(0xFFFFFFFF,0xFFFFFFFF,0,0);

	// set initial round constant
	rcon = _mm_set_epi32(1,1,1,1);

	// initialize state to msg
	s[0] = _mm_load_si128(&((__m128i*)msg)[0]);
	s[1] = _mm_load_si128(&((__m128i*)msg)[1]);
	s[2] = _mm_load_si128(&((__m128i*)msg)[2]);
	s[3] = _mm_load_si128(&((__m128i*)msg)[3]);

	//printf("= input state =\n");
	//printstate512(s[0], s[1], s[2], s[3]);

	for (i = 0; i < ROUNDS; ++i) {
		// aes round(s)
		for (j = 0; j < AES_PER_ROUND; ++j) {
			s[0] = _mm_aesenc_si128(s[0], rcon);
			s[1] = _mm_aesenc_si128(s[1], rcon);
			s[2] = _mm_aesenc_si128(s[2], rcon);
			s[3] = _mm_aesenc_si128(s[3], rcon);
			rcon = _mm_slli_epi32(rcon, 1);
		}

		//printf("= round %d : after aes layer =\n", i);
		//printstate512(s[0], s[1], s[2], s[3]);
		
		// mixing
		tmp  = _mm_unpacklo_epi32(s[0], s[1]);
		s[0] = _mm_unpackhi_epi32(s[0], s[1]);
		s[1] = _mm_unpacklo_epi32(s[2], s[3]);
		s[2] = _mm_unpackhi_epi32(s[2], s[3]);
		s[3] = _mm_unpacklo_epi32(s[0], s[2]);
		s[0] = _mm_unpackhi_epi32(s[0], s[2]);
		s[2] = _mm_unpackhi_epi32(s[1],  tmp);
		s[1] = _mm_unpacklo_epi32(s[1],  tmp);

		//printf("= round %d : after mix layer =\n", i);
		//printstate512(s[0], s[1], s[2], s[3]);

		// little-endian mixing (not used)
		// tmp  = _mm_unpackhi_epi32(s[1], s[0]);
		// s[0] = _mm_unpacklo_epi32(s[1], s[0]);
		// s[1] = _mm_unpackhi_epi32(s[3], s[2]);
		// s[2] = _mm_unpacklo_epi32(s[3], s[2]);
		// s[3] = _mm_unpackhi_epi32(s[2], s[0]);
		// s[0] = _mm_unpacklo_epi32(s[2], s[0]);
		// s[2] = _mm_unpacklo_epi32(tmp,  s[1]);
		// s[1] = _mm_unpackhi_epi32(tmp,  s[1]);
	}

	//printf("= output from permutation =\n");
	//printstate512(s[0], s[1], s[2], s[3]);

	// xor message to get DM effect
	s[0] = _mm_xor_si128(s[0], _mm_load_si128(&((__m128i*)msg)[0]));
	s[1] = _mm_xor_si128(s[1], _mm_load_si128(&((__m128i*)msg)[1]));
	s[2] = _mm_xor_si128(s[2], _mm_load_si128(&((__m128i*)msg)[2]));
	s[3] = _mm_xor_si128(s[3], _mm_load_si128(&((__m128i*)msg)[3]));

	//printf("= after feed-forward =\n");
	//printstate512(s[0], s[1], s[2], s[3]);

	// truncate and store result
	_mm_maskmoveu_si128(s[0], MSB64, (hash-8));	// mask selects the upper 8 bytes, so they land at hash+0
	_mm_maskmoveu_si128(s[1], MSB64, (hash+0));	// upper 8 bytes of s[1] land at hash+8
	_mm_storel_epi64((__m128i*)(hash + 16), s[2]);
	_mm_storel_epi64((__m128i*)(hash + 24), s[3]);
	return 0;
}
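A minimal usage sketch (hypothetical caller; ROUNDS and AES_PER_ROUND are assumed to be defined by the surrounding Haraka code, and the message must be 16-byte aligned because it is read with _mm_load_si128):

_Alignas(16) unsigned char msg[64] = {0};	/* 64-byte input block, 16-byte aligned (C11 _Alignas) */
unsigned char digest[32];			/* 256-bit output */
haraka512256(digest, msg);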
Example #10
size_t
__strspn_sse42 (const char *s, const char *a)
{
  if (*a == 0)
    return 0;

  const char *aligned;
  __m128i mask;
  int offset = (int) ((size_t) a & 15);
  if (offset != 0)
    {
      /* Load masks.  */
      aligned = (const char *) ((size_t) a & -16L);
      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);

      mask = __m128i_shift_right (mask0, offset);

      /* Find where the NULL terminator is.  */
      int length = _mm_cmpistri (mask, mask, 0x3a);
      if (length == 16 - offset)
	{
	  /* There is no NULL terminator.  */
	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
	  length += index;

	  /* Don't use SSE4.2 if the length of A > 16.  */
	  if (length > 16)
	    return __strspn_sse2 (s, a);

	  if (index != 0)
	    {
	      /* Combine mask0 and mask1.  We could play games with
		 palignr, but frankly this data should be in L1 now
		 so do the merge via an unaligned load.  */
	      mask = _mm_loadu_si128 ((__m128i *) a);
	    }
	}
    }
  else
    {
      /* A is aligned.  */
      mask = _mm_load_si128 ((__m128i *) a);

      /* Find where the NULL terminator is.  */
      int length = _mm_cmpistri (mask, mask, 0x3a);
      if (length == 16)
	{
	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
	     of A > 16.  */
	  if (a[16] != 0)
	    return __strspn_sse2 (s, a);
	}
    }

  offset = (int) ((size_t) s & 15);
  if (offset != 0)
    {
      /* Check partial string.  */
      aligned = (const char *) ((size_t) s & -16L);
      __m128i value = _mm_load_si128 ((__m128i *) aligned);

      value = __m128i_shift_right (value, offset);

      int length = _mm_cmpistri (mask, value, 0x12);
      /* No need to check CFlag since it is always 1.  */
      if (length < 16 - offset)
	return length;
      /* Find where the NULL terminator is.  */
      int index = _mm_cmpistri (value, value, 0x3a);
      if (index < 16 - offset)
	return length;
      aligned += 16;
    }
  else
    aligned = s;

  while (1)
    {
      __m128i value = _mm_load_si128 ((__m128i *) aligned);
      int index = _mm_cmpistri (mask, value, 0x12);
      int cflag = _mm_cmpistrc (mask, value, 0x12);
      if (cflag)
	return (size_t) (aligned + index - s);
      aligned += 16;
    }
}
Example #11
void EmitColorIndices_Intrinsics( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte *&outData )
{
	ALIGN16( byte color0[16] );
	ALIGN16( byte color1[16] );
	ALIGN16( byte color2[16] );
	ALIGN16( byte color3[16] );
	ALIGN16( byte result[16] );

	// mov esi, maxColor
	// mov edi, minColor

	__m128i t0, t1, t2, t3, t4, t5, t6, t7;

	t7 = _mm_setzero_si128();
	//t7 = _mm_xor_si128(t7, t7);
	_mm_store_si128 ( (__m128i*) &result, t7 );


	//t0 = _mm_load_si128 ( (__m128i*)  maxColor );
	t0 = _mm_cvtsi32_si128( *(int*)maxColor);

	// Bitwise AND
	__m128i tt = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_colorMask );
	t0 = _mm_and_si128(t0, tt);

	t0 = _mm_unpacklo_epi8(t0, t7);

	t4 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 0, 3, 2, 3 ));
	t5 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 3, 1, 3, 3 ));

	t4 = _mm_srli_epi16(t4, 5);
	t5 = _mm_srli_epi16(t5, 6);

	// Bitwise Logical OR
	t0 = _mm_or_si128(t0, t4);
	t0 = _mm_or_si128(t0, t5);   // t0 contains color0 in 565




	//t1 = _mm_load_si128 ( (__m128i*)  minColor );
	t1 = _mm_cvtsi32_si128( *(int*)minColor);

	t1 = _mm_and_si128(t1, tt);

	t1 = _mm_unpacklo_epi8(t1, t7);

	t4 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 0, 3, 2, 3 ));
	t5 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 3, 1, 3, 3 ));

	t4 = _mm_srli_epi16(t4, 5);
	t5 = _mm_srli_epi16(t5, 6);

	t1 = _mm_or_si128(t1, t4);
	t1 = _mm_or_si128(t1, t5);  // t1 contains color1 in 565



	t2 = t0;

	t2 = _mm_packus_epi16(t2, t7);

	t2 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color0, t2 );

	t6 = t0;
	t6 = _mm_add_epi16(t6, t0);
	t6 = _mm_add_epi16(t6, t1);

	// Multiply Packed Signed Integers and Store High Result
	__m128i tw3 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_div_by_3 );
	t6 = _mm_mulhi_epi16(t6, tw3);
	t6 = _mm_packus_epi16(t6, t7);

	t6 = _mm_shuffle_epi32( t6, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color2, t6 );

	t3 = t1;
	t3 = _mm_packus_epi16(t3, t7);
	t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color1, t3 );

	t1 = _mm_add_epi16(t1, t1);
	t0 = _mm_add_epi16(t0, t1);

	t0 = _mm_mulhi_epi16(t0, tw3);
	t0 = _mm_packus_epi16(t0, t7);

	t0 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 0, 1, 0, 1 ));
	_mm_store_si128 ( (__m128i*) &color3, t0 );

	__m128i w0 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_0);
	__m128i w1 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_1);
	__m128i w2 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_2);

	    // mov eax, 32
	    // mov esi, colorBlock
	int x = 32;
	//const byte *c = colorBlock;
	while (x >= 0)
	  {
	    t3 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+0));
	    t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+8));
	    t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    t0 = t3;
	    t6 = t5;
	    // Compute Sum of Absolute Difference
	    __m128i c0 = _mm_load_si128 ( (__m128i*)  color0 );
	    t0 = _mm_sad_epu8(t0, c0);
	    t6 = _mm_sad_epu8(t6, c0);
	    // Pack with Signed Saturation
	    t0 = _mm_packs_epi32 (t0, t6);

	    t1 = t3;
	    t6 = t5;
	    __m128i c1 = _mm_load_si128 ( (__m128i*)  color1 );
	    t1 = _mm_sad_epu8(t1, c1);
	    t6 = _mm_sad_epu8(t6, c1);
	    t1 = _mm_packs_epi32 (t1, t6);

	    t2 = t3;
	    t6 = t5;
	    __m128i c2 = _mm_load_si128 ( (__m128i*)  color2 );
	    t2 = _mm_sad_epu8(t2, c2);
	    t6 = _mm_sad_epu8(t6, c2);
	    t2 = _mm_packs_epi32 (t2, t6);

	    __m128i c3 = _mm_load_si128 ( (__m128i*)  color3 );
	    t3 = _mm_sad_epu8(t3, c3);
	    t5 = _mm_sad_epu8(t5, c3);
	    t3 = _mm_packs_epi32 (t3, t5);


	    t4 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+16));
	    t4 = _mm_shuffle_epi32( t4, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+24));
	    t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    t6 = t4;
	    t7 = t5;
	    t6 = _mm_sad_epu8(t6, c0);
	    t7 = _mm_sad_epu8(t7, c0);
	    t6 = _mm_packs_epi32 (t6, t7);
	    t0 = _mm_packs_epi32 (t0, t6);  // d0

	    t6 = t4;
	    t7 = t5;
	    t6 = _mm_sad_epu8(t6, c1);
	    t7 = _mm_sad_epu8(t7, c1);
	    t6 = _mm_packs_epi32 (t6, t7);
	    t1 = _mm_packs_epi32 (t1, t6);  // d1

	    t6 = t4;
	    t7 = t5;
	    t6 = _mm_sad_epu8(t6, c2);
	    t7 = _mm_sad_epu8(t7, c2);
	    t6 = _mm_packs_epi32 (t6, t7);
	    t2 = _mm_packs_epi32 (t2, t6);  // d2

	    t4 = _mm_sad_epu8(t4, c3);
	    t5 = _mm_sad_epu8(t5, c3);
	    t4 = _mm_packs_epi32 (t4, t5);
	    t3 = _mm_packs_epi32 (t3, t4);  // d3

	    t7 = _mm_load_si128 ( (__m128i*) result );

	    t7 = _mm_slli_epi32( t7, 16);

	    t4 = t0;
	    t5 = t1;
	    // Compare Packed Signed Integers for Greater Than
	    t0 = _mm_cmpgt_epi16(t0, t3); // b0
	    t1 = _mm_cmpgt_epi16(t1, t2); // b1
	    t4 = _mm_cmpgt_epi16(t4, t2); // b2
	    t5 = _mm_cmpgt_epi16(t5, t3); // b3
	    t2 = _mm_cmpgt_epi16(t2, t3); // b4

	    t4 = _mm_and_si128(t4, t1); // x0
	    t5 = _mm_and_si128(t5, t0); // x1
	    t2 = _mm_and_si128(t2, t0); // x2

	    t4 = _mm_or_si128(t4, t5);
	    t2 = _mm_and_si128(t2, w1);
	    t4 = _mm_and_si128(t4, w2);
	    t2 = _mm_or_si128(t2, t4);

	    t5 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 2, 3, 0, 1 ));

	    // Unpack Low Data
	    t2 = _mm_unpacklo_epi16 ( t2, w0);
	    t5 = _mm_unpacklo_epi16 ( t5, w0);

	    //t5 = _mm_slli_si128 ( t5, 8);
	    t5 = _mm_slli_epi32( t5, 8);

	    t7 = _mm_or_si128(t7, t5);
	    t7 = _mm_or_si128(t7, t2);

	    _mm_store_si128 ( (__m128i*) &result, t7 );

	    x -=32;
	  }

	t4 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 1, 2, 3, 0 ));
	t5 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 2, 3, 0, 1 ));
	t6 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 3, 0, 1, 2 ));

	t4 = _mm_slli_epi32 ( t4, 2);
	t5 = _mm_slli_epi32 ( t5, 4);
	t6 = _mm_slli_epi32 ( t6, 6);

	t7 = _mm_or_si128(t7, t4);
	t7 = _mm_or_si128(t7, t5);
	t7 = _mm_or_si128(t7, t6);

	//_mm_store_si128 ( (__m128i*) outData, t7 );

	int r = _mm_cvtsi128_si32 (t7);
	memcpy(outData, &r, 4);   // Anything better ?

	outData += 4;
}
Example #12
mlib_status
__mlib_VectorSumAbsDiff_S32_Sat(
    mlib_d64 *z,
    const mlib_s32 *x,
    const mlib_s32 *y,
    mlib_s32 n)
{
    if (n <= 0)
        return (MLIB_FAILURE);

    mlib_s32 i, nstep, ax, ay, n1, n2, n3;
    mlib_s32 *px = (mlib_s32 *)x, *py = (mlib_s32 *)y;
    __m128i zero, xbuf, ybuf, zbuf, xlo, xhi, mext;
    mlib_d64 dsum = 0.0;
    zero = _mm_setzero_si128();
    zbuf = zero;

    nstep = 16 / sizeof (mlib_s32);
    ax = (mlib_addr)x & 15;
    ay = (mlib_addr)y & 15;
    n1 = ((16 - ax) & 15) / sizeof (mlib_s32);
    n2 = (n - n1) / nstep;
    n3 = n - n1 - n2 * nstep;

    if (n2 < 1) {
        for (i = 0; i < n; i++) {
            dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
        }
        *z = dsum;
    } else {
        for (i = 0; i < n1; i++) {
            dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
        }
        if (ax == ay) {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_load_si128((__m128i *)py);
                mext = _mm_cmpgt_epi32(ybuf, xbuf);
                xbuf = _mm_sub_epi32(xbuf, ybuf);
                xbuf = _mm_xor_si128(xbuf, mext);
                xbuf = _mm_sub_epi32(xbuf, mext);
                xlo = _mm_unpacklo_epi32(xbuf, zero);
                xhi = _mm_unpackhi_epi32(xbuf, zero);
                zbuf = _mm_add_epi64(zbuf, xlo);
                zbuf = _mm_add_epi64(zbuf, xhi);
                px += nstep;
                py += nstep;
            }
        } else {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_loadu_si128((__m128i *)py);
                mext = _mm_cmpgt_epi32(ybuf, xbuf);
                xbuf = _mm_sub_epi32(xbuf, ybuf);
                xbuf = _mm_xor_si128(xbuf, mext);
                xbuf = _mm_sub_epi32(xbuf, mext);
                xlo = _mm_unpacklo_epi32(xbuf, zero);
                xhi = _mm_unpackhi_epi32(xbuf, zero);
                zbuf = _mm_add_epi64(zbuf, xlo);
                zbuf = _mm_add_epi64(zbuf, xhi);
                px += nstep;
                py += nstep;
            }
        }
        for (i = 0; i < n3; i++) {
            dsum += mlib_fabs((mlib_d64)(*px++) - (*py++));
        }

        long long pz[2];
        _mm_storeu_si128((__m128i *)pz, zbuf);
        dsum += pz[0];
        dsum += pz[1];
        *z = dsum;
    }
    return (MLIB_SUCCESS);
}
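The cmpgt/xor/sub sequence in the inner loops is a branch-free absolute difference; the same pattern is reused in the S16 and S8 variants below. In scalar form it amounts to (a sketch):

mlib_s32 d = x - y;
mlib_s32 m = (y > x) ? -1 : 0;		/* mext: all ones when y > x */
mlib_s32 absdiff = (d ^ m) - m;		/* (d ^ -1) - (-1) == -d, so this is |x - y| */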
Example #13
mlib_status
__mlib_VectorSumAbsDiff_S16_Sat(
    mlib_d64 *z,
    const mlib_s16 *x,
    const mlib_s16 *y,
    mlib_s32 n)
{
    if (n <= 0)
        return (MLIB_FAILURE);

    mlib_s32 i, nstep, ax, ay, n1, n2, n3, xval, sum = 0;
    mlib_s16 *px = (mlib_s16 *)x, *py = (mlib_s16 *)y;
    __m128i zero, xbuf, ybuf, zbuf32, zbuf64, xlo, xhi, mext;
    zero = _mm_setzero_si128();
    zbuf64 = zero;

    nstep = 16 / sizeof (mlib_s16);
    ax = (mlib_addr)x & 15;
    ay = (mlib_addr)y & 15;
    n1 = ((16 - ax) & 15) / sizeof (mlib_s16);
    n2 = (n - n1) / nstep;
    n3 = n - n1 - n2 * nstep;

    if (n2 < 1) {
        for (i = 0; i < n; i++) {
            xval = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(xval);
        }
        *z = sum;
    } else {
        for (i = 0; i < n1; i++) {
            xval = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(xval);
        }
        mlib_s32 nblock = n2 >> 12;
        mlib_s32 tail = n2 & 4095;
        mlib_s32 k;
        if (ax == ay) {
            for (k = 0; k < nblock; k++) {
                zbuf32 = zero;
                for (i = 0; i < 4096; i++) {
                    xbuf = _mm_load_si128((__m128i *)px);
                    ybuf = _mm_load_si128((__m128i *)py);
                    mext = _mm_cmpgt_epi16(ybuf, xbuf);
                    xbuf = _mm_sub_epi16(xbuf, ybuf);
                    xbuf = _mm_xor_si128(xbuf, mext);
                    xbuf = _mm_sub_epi16(xbuf, mext);
                    xlo = _mm_unpacklo_epi16(xbuf, zero);
                    xhi = _mm_unpackhi_epi16(xbuf, zero);
                    zbuf32 = _mm_add_epi32(zbuf32, xlo);
                    zbuf32 = _mm_add_epi32(zbuf32, xhi);
                    px += nstep;
                    py += nstep;
                }
                xlo = _mm_unpacklo_epi32(zbuf32, zero);
                xhi = _mm_unpackhi_epi32(zbuf32, zero);
                zbuf64 = _mm_add_epi64(zbuf64, xlo);
                zbuf64 = _mm_add_epi64(zbuf64, xhi);
            }
            zbuf32 = zero;
            for (i = 0; i < tail; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_load_si128((__m128i *)py);
                mext = _mm_cmpgt_epi16(ybuf, xbuf);
                xbuf = _mm_sub_epi16(xbuf, ybuf);
                xbuf = _mm_xor_si128(xbuf, mext);
                xbuf = _mm_sub_epi16(xbuf, mext);
                xlo = _mm_unpacklo_epi16(xbuf, zero);
                xhi = _mm_unpackhi_epi16(xbuf, zero);
                zbuf32 = _mm_add_epi32(zbuf32, xlo);
                zbuf32 = _mm_add_epi32(zbuf32, xhi);
                px += nstep;
                py += nstep;
            }
            xlo = _mm_unpacklo_epi32(zbuf32, zero);
            xhi = _mm_unpackhi_epi32(zbuf32, zero);
            zbuf64 = _mm_add_epi64(zbuf64, xlo);
            zbuf64 = _mm_add_epi64(zbuf64, xhi);
        } else { /* not aligned */
            for (k = 0; k < nblock; k++) {
                zbuf32 = zero;
                for (i = 0; i < 4096; i++) {
                    xbuf = _mm_load_si128((__m128i *)px);
                    ybuf = _mm_loadu_si128((__m128i *)py);
                    mext = _mm_cmpgt_epi16(ybuf, xbuf);
                    xbuf = _mm_sub_epi16(xbuf, ybuf);
                    xbuf = _mm_xor_si128(xbuf, mext);
                    xbuf = _mm_sub_epi16(xbuf, mext);
                    xlo = _mm_unpacklo_epi16(xbuf, zero);
                    xhi = _mm_unpackhi_epi16(xbuf, zero);
                    zbuf32 = _mm_add_epi32(zbuf32, xlo);
                    zbuf32 = _mm_add_epi32(zbuf32, xhi);
                    px += nstep;
                    py += nstep;
                }
                xlo = _mm_unpacklo_epi32(zbuf32, zero);
                xhi = _mm_unpackhi_epi32(zbuf32, zero);
                zbuf64 = _mm_add_epi64(zbuf64, xlo);
                zbuf64 = _mm_add_epi64(zbuf64, xhi);
            }
            zbuf32 = zero;
            for (i = 0; i < tail; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_loadu_si128((__m128i *)py);
                mext = _mm_cmpgt_epi16(ybuf, xbuf);
                xbuf = _mm_sub_epi16(xbuf, ybuf);
                xbuf = _mm_xor_si128(xbuf, mext);
                xbuf = _mm_sub_epi16(xbuf, mext);
                xlo = _mm_unpacklo_epi16(xbuf, zero);
                xhi = _mm_unpackhi_epi16(xbuf, zero);
                zbuf32 = _mm_add_epi32(zbuf32, xlo);
                zbuf32 = _mm_add_epi32(zbuf32, xhi);
                px += nstep;
                py += nstep;
            }
            xlo = _mm_unpacklo_epi32(zbuf32, zero);
            xhi = _mm_unpackhi_epi32(zbuf32, zero);
            zbuf64 = _mm_add_epi64(zbuf64, xlo);
            zbuf64 = _mm_add_epi64(zbuf64, xhi);
        }
        for (i = 0; i < n3; i++) {
            xval = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(xval);
        }

        mlib_d64 dsum = sum;
        long long pz[2];
        _mm_storeu_si128((__m128i *)pz, zbuf64);
        dsum += pz[0];
        dsum += pz[1];
        *z = dsum;
    }
    return (MLIB_SUCCESS);
}
Example #14
mlib_status
__mlib_VectorSumAbsDiff_S8_Sat(
    mlib_d64 *z,
    const mlib_s8 *x,
    const mlib_s8 *y,
    mlib_s32 n)
{
    if (n <= 0)
        return (MLIB_FAILURE);

    mlib_s32 i, nstep, ax, ay, n1, n2, n3, diff, sum = 0;
    mlib_s8 *px = (mlib_s8 *)x, *py = (mlib_s8 *)y;
    __m128i zero, xbuf, ybuf, zbuf, mext, mbuf;
    zero = _mm_setzero_si128();
    zbuf = zero;

    nstep = 16 / sizeof (mlib_s8);
    ax = (mlib_addr)x & 15;
    ay = (mlib_addr)y & 15;
    n1 = ((16 - ax) & 15) / sizeof (mlib_s8);
    n2 = (n - n1) / nstep;
    n3 = n - n1 - n2 * nstep;

    if (n2 < 1) {
        for (i = 0; i < n; i++) {
            diff = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(diff);
        }
        *z = sum;
    } else {
        for (i = 0; i < n1; i++) {
            diff = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(diff);
        }
        if (ax == ay) {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_load_si128((__m128i *)py);
                mext = _mm_cmpgt_epi8(ybuf, xbuf);
                mbuf = _mm_sub_epi8(xbuf, ybuf);
                mbuf = _mm_xor_si128(mbuf, mext);
                mbuf = _mm_sub_epi8(mbuf, mext);
                mbuf = _mm_sad_epu8(mbuf, zero);
                zbuf = _mm_add_epi64(zbuf, mbuf);
                px += nstep;
                py += nstep;
            }
        } else {
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_loadu_si128((__m128i *)py);
                mext = _mm_cmpgt_epi8(ybuf, xbuf);
                mbuf = _mm_sub_epi8(xbuf, ybuf);
                mbuf = _mm_xor_si128(mbuf, mext);
                mbuf = _mm_sub_epi8(mbuf, mext);
                mbuf = _mm_sad_epu8(mbuf, zero);
                zbuf = _mm_add_epi64(zbuf, mbuf);
                px += nstep;
                py += nstep;
            }
        }
        for (i = 0; i < n3; i++) {
            diff = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(diff);
        }

        mlib_d64 dsum = sum;
        long long pz[2];
        _mm_storeu_si128((__m128i *)pz, zbuf);
        dsum += pz[0];
        dsum += pz[1];
        *z = dsum;
    }
    return (MLIB_SUCCESS);
}
Example #15
void av1_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 2; ++i) {
    input += 8 * i;
    in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));

    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
  }

  u0 = _mm_setzero_si128();
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  in1 = _mm_srai_epi32(in1, 1);
  store_output(&in1, output);
}
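In effect the routine stores half the sum of all 16x16 input samples as the DC term; a scalar sketch (ignoring possible 16-bit wrap-around in the intermediate accumulation) would be:

int32_t s = 0;
for (int r = 0; r < 16; ++r)
  for (int c = 0; c < 16; ++c)
    s += input[r * stride + c];
output[0] = (tran_low_t)(s >> 1);	/* matches the final _mm_srai_epi32(in1, 1) */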
Example #16
static  int blake64_compress( state * state, const u8 * datablock ) {

  __m128i row1a,row1b;
  __m128i row2a,row2b;
  __m128i row3a,row3b;
  __m128i row4a,row4b;
  __m128i buf1a,buf2a;
  static const u8 rot16[16] = {2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9};
  __m128i r16 = _mm_load_si128((__m128i*)rot16);


  u64 m[16]; 
  u64 y[16]; 

  /* constants and permutation */
  static const int sig[][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 }, 
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
  };

  static const u64 z[16] = {
    0x243F6A8885A308D3ULL,0x13198A2E03707344ULL,
    0xA4093822299F31D0ULL,0x082EFA98EC4E6C89ULL,
    0x452821E638D01377ULL,0xBE5466CF34E90C6CULL,
    0xC0AC29B7C97C50DDULL,0x3F84D5B5B5470917ULL,
    0x9216D5D98979FB1BULL,0xD1310BA698DFB5ACULL,
    0x2FFD72DBD01ADFB7ULL,0xB8E1AFED6A267E96ULL,
    0xBA7C9045F12C7F99ULL,0x24A19947B3916CF7ULL,
    0x0801F2E2858EFC16ULL,0x636920D871574E69ULL
  };

  /* get message */
  m[ 0] = U8TO64(datablock +  0);
  m[ 1] = U8TO64(datablock +  8);
  m[ 2] = U8TO64(datablock + 16);
  m[ 3] = U8TO64(datablock + 24);
  m[ 4] = U8TO64(datablock + 32);
  m[ 5] = U8TO64(datablock + 40);
  m[ 6] = U8TO64(datablock + 48);
  m[ 7] = U8TO64(datablock + 56);
  m[ 8] = U8TO64(datablock + 64);
  m[ 9] = U8TO64(datablock + 72);
  m[10] = U8TO64(datablock + 80);
  m[11] = U8TO64(datablock + 88);
  m[12] = U8TO64(datablock + 96);
  m[13] = U8TO64(datablock +104);
  m[14] = U8TO64(datablock +112);
  m[15] = U8TO64(datablock +120);

  row1b = _mm_set_epi64((__m64)state->h[3],(__m64)state->h[2]);
  row1a = _mm_set_epi64((__m64)state->h[1],(__m64)state->h[0]);
  row2b = _mm_set_epi64((__m64)state->h[7],(__m64)state->h[6]);
  row2a = _mm_set_epi64((__m64)state->h[5],(__m64)state->h[4]);
  row3b = _mm_set_epi64((__m64)0x082EFA98EC4E6C89ULL,
			(__m64)0xA4093822299F31D0ULL);
  row3a = _mm_set_epi64((__m64)0x13198A2E03707344ULL,
			(__m64)0x243F6A8885A308D3ULL);
  
  if (state->nullt) {
    row4b = _mm_set_epi64((__m64)0x3F84D5B5B5470917ULL,
			  (__m64)0xC0AC29B7C97C50DDULL);
    row4a = _mm_set_epi64((__m64)0xBE5466CF34E90C6CULL,
			  (__m64)0x452821E638D01377ULL);
  }
  else {
    row4b = _mm_set_epi64((__m64)(0x3F84D5B5B5470917ULL^state->t[1]),
			  (__m64)(0xC0AC29B7C97C50DDULL^state->t[1]));
    row4a = _mm_set_epi64((__m64)(0xBE5466CF34E90C6CULL^state->t[0]),
			  (__m64)(0x452821E638D01377ULL^state->t[0]));
  }
  /* initialization ok (beware of bug on Celeron and P4!) */

  

#define round(r)\
    /* column step */\
    /***************************************************/\
    /* high-order side: words 0, 1, 4, 5, 8, 9, 12, 13  */		\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][ 2]], (__m64)m[sig[r][ 0]] );	\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][ 3]], (__m64)z[sig[r][ 1]] );	\
    buf1a = _mm_xor_si128( buf1a, buf2a );					\
    row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );		\
    row4a = _mm_xor_si128( row4a, row1a );				\
    row4a = _mm_shuffle_epi32(row4a, 0xB1); \
    row3a = _mm_add_epi64( row3a, row4a );				\
    row2a = _mm_xor_si128( row2a, row3a );				\
    row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 25 ),_mm_slli_epi64( row2a, 39 )); \
  									\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][ 3]], (__m64)m[sig[r][ 1]] );	\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][ 2]], (__m64)z[sig[r][ 0]] );	\
    buf1a = _mm_xor_si128( buf1a, buf2a );					\
    row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );		\
    row4a = _mm_xor_si128( row4a, row1a );				\
    row4a = _mm_shuffle_epi8(row4a, r16); \
    row3a = _mm_add_epi64( row3a, row4a );				\
    row2a = _mm_xor_si128( row2a, row3a );				\
    row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 11 ),_mm_slli_epi64( row2a, 53 )); \
  									\
    /* same stuff for low-order side */\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][ 6]], (__m64)m[sig[r][ 4]] );\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][ 7]], (__m64)z[sig[r][ 5]] );\
    buf1a = _mm_xor_si128( buf1a, buf2a );				\
    row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );		\
    row4b = _mm_xor_si128( row4b, row1b );				\
    row4b = _mm_shuffle_epi32(row4b, 0xB1); \
    row3b = _mm_add_epi64( row3b, row4b );				\
    row2b = _mm_xor_si128( row2b, row3b );				\
    row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 25 ),_mm_slli_epi64( row2b, 39 )); \
\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][ 7]], (__m64)m[sig[r][ 5]] );	\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][ 6]], (__m64)z[sig[r][ 4]] );	\
    buf1a = _mm_xor_si128( buf1a, buf2a );					\
    row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );		\
    row4b = _mm_xor_si128( row4b, row1b );				\
    row4b = _mm_shuffle_epi8(row4b, r16); \
    row3b = _mm_add_epi64( row3b, row4b );				\
    row2b = _mm_xor_si128( row2b, row3b );				\
    row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 11 ),_mm_slli_epi64( row2b, 53 )); \
\
    /* shuffle */\
    _mm_store_si128( 0+ (__m128i *)y, row4a); \
    _mm_store_si128( 1+ (__m128i *)y, row4b); \
    row4a = row3a;\
    row3a = row3b;\
    row3b = row4a;\
    row4a  = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );\
    row4b  = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );\
    _mm_store_si128( 0+ (__m128i *)y, row2a);  \
    _mm_store_si128( 1+ (__m128i *)y, row2b);  \
    row2a  = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );  \
    row2b  = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );  \
    /* diagonal step */\
    /***************************************************/\
    /* high-order side: words 0, 1, 4, 5, 8, 9, 12, 13  */\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][10]], (__m64)m[sig[r][ 8]] );\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][11]], (__m64)z[sig[r][ 9]] );\
    buf1a = _mm_xor_si128( buf1a, buf2a );\
    row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );\
    row4a = _mm_xor_si128( row4a, row1a );		      \
    row4a = _mm_shuffle_epi32(row4a, 0xB1); \
    row3a = _mm_add_epi64( row3a, row4a );					\
    row2a = _mm_xor_si128( row2a, row3a );					\
    row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 25 ),_mm_slli_epi64( row2a, 39 )); \
\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][11]], (__m64)m[sig[r][ 9]] );\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][10]], (__m64)z[sig[r][ 8]] );\
    buf1a = _mm_xor_si128( buf1a, buf2a );\
    row1a = _mm_add_epi64( _mm_add_epi64(row1a, buf1a), row2a );\
    row4a = _mm_xor_si128( row4a, row1a );			\
    row4a = _mm_shuffle_epi8(row4a, r16); \
    row3a = _mm_add_epi64( row3a, row4a );					\
    row2a = _mm_xor_si128( row2a, row3a );					\
    row2a = _mm_xor_si128(_mm_srli_epi64( row2a, 11 ),_mm_slli_epi64( row2a, 53 )); \
\
    /* same stuff for low-order side */\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][14]], (__m64)m[sig[r][12]] );\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][15]], (__m64)z[sig[r][13]] );\
    buf1a = _mm_xor_si128( buf1a, buf2a );\
    row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );\
    row4b = _mm_xor_si128( row4b, row1b );			\
    buf2a = _mm_set_epi64( (__m64)m[sig[r][15]], (__m64)m[sig[r][13]] );\
    row4b = _mm_shuffle_epi32(row4b, 0xB1); \
    row3b = _mm_add_epi64( row3b, row4b );					\
    row2b = _mm_xor_si128( row2b, row3b );					\
    buf1a = _mm_set_epi64( (__m64)z[sig[r][14]], (__m64)z[sig[r][12]] );\
    row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 25 ),_mm_slli_epi64( row2b, 39 )); \
\
    buf1a = _mm_xor_si128( buf1a, buf2a );\
    row1b = _mm_add_epi64( _mm_add_epi64(row1b, buf1a), row2b );\
    row4b = _mm_xor_si128( row4b, row1b );			\
    row4b = _mm_shuffle_epi8(row4b, r16); \
    row3b = _mm_add_epi64( row3b, row4b );					\
    row2b = _mm_xor_si128( row2b, row3b );					\
    row2b = _mm_xor_si128(_mm_srli_epi64( row2b, 11 ),_mm_slli_epi64( row2b, 53 )); \
\
    /* shuffle back */\
    buf1a = row3a;\
    row3a = row3b;\
    row3b = buf1a;\
    _mm_store_si128( 0+ (__m128i *)y, row2a);	\
    _mm_store_si128( 1+ (__m128i *)y, row2b);  \
    row2a  = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );  \
    row2b  = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );  \
    _mm_store_si128( 0+ (__m128i *)y, row4a);  \
    _mm_store_si128( 1+ (__m128i *)y, row4b);  \
    row4a  = _mm_set_epi64( (__m64)y[2], (__m64)y[1] );  \
    row4b  = _mm_set_epi64( (__m64)y[0], (__m64)y[3] );  \
    							 \

  round(0);
  round(1);
  round(2);
  round(3);
  round(4);
  round(5);
  round(6);
  round(7);
  round(8);
  round(9);
  round(10);
  round(11);
  round(12);
  round(13);

  row1a = _mm_xor_si128(row3a,row1a);
  row1b = _mm_xor_si128(row3b,row1b);
  _mm_store_si128(  (__m128i *)m, row1a);
  state->h[0] ^= m[ 0]; 
  state->h[1] ^= m[ 1];    
  _mm_store_si128(  (__m128i *)m, row1b);
  state->h[2] ^= m[ 0]; 
  state->h[3] ^= m[ 1];    

  row2a = _mm_xor_si128(row4a,row2a);
  row2b = _mm_xor_si128(row4b,row2b);
  _mm_store_si128(  (__m128i *)m, row2a);
  state->h[4] ^= m[ 0];    
  state->h[5] ^= m[ 1];    
  _mm_store_si128(  (__m128i *)m, row2b);
  state->h[6] ^= m[ 0];    
  state->h[7] ^= m[ 1];    

  return 0;
}
Example #17
/* Computes alpha metrics */
void map_gen_alpha(map_gen_t * s, uint32_t long_cb)
{
  uint32_t k;
  int16_t *alpha = s->alpha;
  uint32_t i;

  alpha[0] = 0; 
  for (i = 1; i < 8; i++) {
    alpha[i] = -INF;
  }
  
  /* Define the shuffle constant for the positive alpha */
  __m128i shuf_ap = _mm_set_epi8(
    15, 14, // 7
    9,  8,  // 4
    7,  6,  // 3
    1,  0,  // 0
    13, 12, // 6
    11, 10, // 5
    5,  4,  // 2
    3,  2   // 1
  );

  /* Define the shuffle constant for the negative alpha */
  __m128i shuf_an = _mm_set_epi8(
    13, 12, // 6
    11, 10, // 5
    5,  4,  // 2
    3,  2,  // 1
    15, 14, // 7
    9,  8,  // 4
    7,  6,  // 3
    1,  0   // 0
  );
  
  /* Define shuffle for branch costs */
  __m128i shuf_g[4];
  shuf_g[0] = _mm_set_epi8(3,2,3,2,1,0,1,0,1,0,1,0,3,2,3,2);
  shuf_g[1] = _mm_set_epi8(7,6,7,6,5,4,5,4,5,4,5,4,7,6,7,6);
  shuf_g[2] = _mm_set_epi8(11,10,11,10,9,8,9,8,9,8,9,8,11,10,11,10);
  shuf_g[3] = _mm_set_epi8(15,14,15,14,13,12,13,12,13,12,13,12,15,14,15,14);

  __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
  
  __m128i* alphaPtr = (__m128i*) alpha;
  alphaPtr++;

  __m128i gv; 
  __m128i *gPtr = (__m128i*) s->branch;
  __m128i g, ap, an; 
    
  __m128i alpha_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
  
  /* This defines an alpha computation step: 
   * adds and subtracts the branch metrics to/from the previous alpha step, 
   * shuffles the states according to the trellis path and selects the maximum state 
   */
#define ALPHA_STEP(c)  g = _mm_shuffle_epi8(gv, shuf_g[c]); \
  ap = _mm_add_epi16(alpha_k, g);\
  an = _mm_sub_epi16(alpha_k, g);\
  ap = _mm_shuffle_epi8(ap, shuf_ap);\
  an = _mm_shuffle_epi8(an, shuf_an);\
  alpha_k = _mm_max_epi16(ap, an);\
  _mm_store_si128(alphaPtr, alpha_k);\
  alphaPtr++;    \

  /* In this loop, we compute 8 steps per iteration, normalizing once for every branch-metrics memory load (i.e. after every 4 steps) */
  __m128i norm;
  for (k = 0; k < long_cb/8; k++) {
    gv = _mm_load_si128(gPtr);
    gPtr++;
    ALPHA_STEP(0);
    ALPHA_STEP(1);
    ALPHA_STEP(2);
    ALPHA_STEP(3);
    norm = _mm_shuffle_epi8(alpha_k, shuf_norm); 
    alpha_k = _mm_sub_epi16(alpha_k, norm);
    gv = _mm_load_si128(gPtr);
    gPtr++;
    ALPHA_STEP(0);
    ALPHA_STEP(1);
    ALPHA_STEP(2);
    ALPHA_STEP(3);
    norm = _mm_shuffle_epi8(alpha_k, shuf_norm); 
    alpha_k = _mm_sub_epi16(alpha_k, norm);
  }  
}
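
To see what the shuffle constants actually encode, here is a scalar sketch of a single ALPHA_STEP. The permutation tables below are read directly off shuf_ap and shuf_an above (byte pairs mapped to 16-bit lanes); the helper name is illustrative, not taken from the original source.

/* Scalar sketch of one ALPHA_STEP: g[] is the branch-metric vector after the
 * shuf_g[] shuffle, alpha[] holds the eight state metrics in 16-bit lanes. */
static void alpha_step_scalar(int16_t alpha[8], const int16_t g[8])
{
	static const int perm_p[8] = { 1, 2, 5, 6, 0, 3, 4, 7 }; /* lane sources of shuf_ap */
	static const int perm_n[8] = { 0, 3, 4, 7, 1, 2, 5, 6 }; /* lane sources of shuf_an */
	int16_t ap[8], an[8], next[8];
	int s;

	for (s = 0; s < 8; s++) {
		ap[s] = alpha[s] + g[s];
		an[s] = alpha[s] - g[s];
	}
	for (s = 0; s < 8; s++) {
		int16_t p = ap[perm_p[s]];
		int16_t n = an[perm_n[s]];
		next[s] = (p > n) ? p : n;   /* max-log-MAP state selection */
	}
	for (s = 0; s < 8; s++)
		alpha[s] = next[s];
}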
Example #18
0
void FileIconDrawGlass::Text(HDC hdc, PCTCHAR pcszText, const RECT &rc, eTextColor eColor, UINT uFlags)
{
	if (!pcszText || !*pcszText) return;

	// Find out actual size of text
	int nChars = _tcslen(pcszText);
	uFlags |= DT_NOCLIP;

	int iX = rc.left;
	int iY = rc.top;
	int iXW = (rc.right - iX);
	int iYH = (rc.bottom - iY);

	RECT rcMin = rc;
	if (DrawText(hdcTextDIB, pcszText, nChars, &rcMin, uFlags | DT_CALCRECT)) {
		int iMinXW = rcMin.right - rcMin.left;
		int iMinYH = rcMin.bottom - rcMin.top;
		if (iMinXW < iXW) {
			if (uFlags & DT_CENTER) {
				iX += (iXW - iMinXW)/2;
				uFlags &= ~DT_CENTER;
			} else if (uFlags & DT_RIGHT) {
				iX += (iXW - iMinXW);
				uFlags &= ~DT_RIGHT;
			}
			iXW = iMinXW;
		}
		if (iMinYH < iYH) {
			if (uFlags & DT_SINGLELINE) {
				if (uFlags & DT_VCENTER) {
					iY += (iYH - iMinYH)/2;
					uFlags &= ~DT_VCENTER;
				} else if (uFlags & DT_BOTTOM) {
					iY += (iYH - iMinYH);
					uFlags &= ~DT_BOTTOM;
				}
			}
			iYH = iMinYH;
		}
	}

	iXW += 2;	// NB: +2 'cause we want an extra pixel at the border so that the font smoothing will look better!
	iYH += 2;

	// Ensure we have a big enough DIB to draw the text to
	if ((iXW > iTextDIBXW) || (iYH > iTextDIBYH)) CreateTextDIB(iXW, iYH);
	if (!hbmpTextDIB) return;

	// Select color
	ieBGRA clr;
	switch (eColor) {
	case eFileName:	clr = clrFileName;		break;
	case eComment:	clr = clrComment;		break;
	case eFileInfo:	clr = clrFileInfo;		break;
	default:		clr = ieBGRA(0,0,0);	break;
	}
	clr.A = 0xFF - clrBkg.A;

	// Draw the text to in-memory DIB
	RECT rcTextDIB = { 0, 0, iXW, iYH };
	FillRect(hdcTextDIB, &rcTextDIB, hbrBkg);

	rcTextDIB.left++;
	rcTextDIB.top++;

	DrawText(hdcTextDIB, pcszText, nChars, &rcTextDIB, uFlags);

	// Modify DIB:
#ifndef __X64__
	if (g_bSSE2) 
#endif
	{
		__m128i r0, r1, r2, r3, r4, r5, r6, r7;

		r7 = _mm_setzero_si128();									// 0
		r6 = _mm_set1_epi32(clr.dw);								// CA  CR  CG  CB  CA  CR  CG  CB  CA  CR  CG  CB  CA  CR  CG  CB
		r6 = _mm_unpacklo_epi8(r7, r6);								// CA<<8   CR<<8   CG<<8   CB<<8   CA<<8   CR<<8   CG<<8   CB<<8
		r5 = _mm_set1_epi16(1);										// 1       1       1       1       1       1       1       1
		r4 = _mm_set1_epi32(0xFF);									// FF              FF              FF              FF
		r3 = _mm_set1_epi32(clrBkg.dw);								// DA  0   0   0   DA  0   0   0   DA  0   0   0   DA  0   0   0

		ieBGRA *py = pTextDIB;
		for (int y = iYH; y--; py += iTextDIBXW) {
			ieBGRA *px = py;

			for (int x_4 = (iXW+3)>>2; x_4--; px += 4) {

				r0 = _mm_load_si128((__m128i *)px);
				r1 = r0;
				r2 = r0;											// X3  R3  G3  B3  X2  R2  G2  B2  X1  R1  G1  B1  X0  R0  G0  B0 
				r0 = _mm_srli_epi32(r0, 16);						// 0   0   X3  R3  0   0   X2  R2  0   0   X1  R1  0   0   X0  R0 
				r1 = _mm_srli_epi32(r1, 8);							// 0   X3  R3  G3  0   X2  R2  G2  0   X1  R1  G1  0   X0  R0  G0 
				r0 = _mm_max_epu8(r0, r2);
				r0 = _mm_max_epu8(r0, r1);							// x   x   x   A3  x   x   x   A2  x   x   x   A1  x   x   x   A0
				r0 = _mm_and_si128(r0, r4);							// 0       A3      0       A2      0       A1      0       A0
				r0 = _mm_shufflelo_epi16(r0, _MM_SHUFFLE(2,2,0,0));
				r0 = _mm_shufflehi_epi16(r0, _MM_SHUFFLE(2,2,0,0));	// A3      A3      A2      A2      A1      A1      A0      A0
				r1 = r0;
				r0 = _mm_unpacklo_epi32(r0, r0);					// A1      A1      A1      A1      A0      A0      A0      A0
				r1 = _mm_unpackhi_epi32(r1, r1);					// A3      A3      A3      A3      A2      A2      A2      A2
				r0 = _mm_add_epi16(r0, r5);							// A1'     A1'     A1'     A1'     A0'     A0'     A0'     A0' 
				r1 = _mm_add_epi16(r1, r5);							// A3'     A3'     A3'     A3'     A2'     A2'     A2'     A2' 
				r0 = _mm_mulhi_epu16(r0, r6);						// xA1"    xR1     xG1     xB1     xA0"    xR0     xG0     xB0
				r1 = _mm_mulhi_epu16(r1, r6);						// xA3"    xR3     xG3     xB3     xA2"    xR2     xG2     xB2
				r0 = _mm_packus_epi16(r0, r1);						// xA3"xR3 xG3 xB3 xA2"xR2 xG2 xB2 xA1"xR1 xG1 xB1 xA0"xR0 xG0 xB0
				r0 = _mm_adds_epu8(r0, r3);							// xA3 xR3 xG3 xB3 xA2 xR2 xG2 xB2 xA1 xR1 xG1 xB1 xA0 xR0 xG0 xB0
				_mm_store_si128((__m128i *)px, r0);
			}
		}
	}
#ifndef __X64__
	else {
Example #19
0
/* Computes beta values */
void map_gen_beta(map_gen_t * s, int16_t * output, uint32_t long_cb)
{
  int k;
  uint32_t end = long_cb + 3;
  const __m128i *alphaPtr = (const __m128i*) s->alpha;
 
  __m128i beta_k = _mm_set_epi16(-INF, -INF, -INF, -INF, -INF, -INF, -INF, 0);
  __m128i g, bp, bn, alpha_k; 
  
  /* Define the shuffle constant for the positive beta */
  __m128i shuf_bp = _mm_set_epi8(
    15, 14, // 7
    7,  6,  // 3
    5,  4,  // 2
    13, 12, // 6
    11, 10, // 5
    3,  2,  // 1
    1,  0,  // 0
    9,  8   // 4
  );

  /* Define the shuffle constant for the negative beta */
  __m128i shuf_bn = _mm_set_epi8(
    7,   6, // 3
    15, 14, // 7
    13, 12, // 6
    5,  4,  // 2
    3,  2,  // 1
    11, 10, // 5
    9,  8,  // 4
    1,  0   // 0
  );
 
  alphaPtr += long_cb-1;

  /* Define shuffle for branch costs */
  __m128i shuf_g[4];  
  shuf_g[3] = _mm_set_epi8(3,2,1,0,1,0,3,2,3,2,1,0,1,0,3,2);
  shuf_g[2] = _mm_set_epi8(7,6,5,4,5,4,7,6,7,6,5,4,5,4,7,6);
  shuf_g[1] = _mm_set_epi8(11,10,9,8,9,8,11,10,11,10,9,8,9,8,11,10);
  shuf_g[0] = _mm_set_epi8(15,14,13,12,13,12,15,14,15,14,13,12,13,12,15,14);
  __m128i gv;
  int16_t *b = &s->branch[2*long_cb-8];
  __m128i *gPtr = (__m128i*) b;
  /* Define shuffle for beta normalization */
  __m128i shuf_norm = _mm_set_epi8(1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0);
  
  /* This defines a beta computation step: 
   * adds and subtracts the branch metrics to/from the previous beta step, 
   * shuffles the states according to the trellis path and selects the maximum state 
   */
#define BETA_STEP(g)     bp = _mm_add_epi16(beta_k, g);\
    bn = _mm_sub_epi16(beta_k, g);\
    bp = _mm_shuffle_epi8(bp, shuf_bp);\
    bn = _mm_shuffle_epi8(bn, shuf_bn);\
    beta_k = _mm_max_epi16(bp, bn);    

    /* Loads the alpha metrics from memory and adds them to the temporary bn and bp 
     * metrics. Then computes the horizontal maximum of both metrics and takes the difference
     */
#define BETA_STEP_CNT(c,d) g = _mm_shuffle_epi8(gv, shuf_g[c]);\
    BETA_STEP(g)\
    alpha_k = _mm_load_si128(alphaPtr);\
    alphaPtr--;\
    bp = _mm_add_epi16(bp, alpha_k);\
    bn = _mm_add_epi16(bn, alpha_k); output[k-d] = hMax(bn) - hMax(bp);

  /* The tail does not require to load alpha or produce outputs. Only update 
   * beta metrics accordingly */
  for (k=end-1; k>=long_cb; k--) {
    int16_t g0 = s->branch[2*k];
    int16_t g1 = s->branch[2*k+1];
    g = _mm_set_epi16(g1, g0, g0, g1, g1, g0, g0, g1);
  
    BETA_STEP(g);
  }  
  
  /* We inline four trellis steps for each normalization (two normalizations per loop iteration) */
  __m128i norm;
  for (; k >= 0; k-=8) {    
    gv = _mm_load_si128(gPtr);
    gPtr--;
    BETA_STEP_CNT(0,0);
    BETA_STEP_CNT(1,1);
    BETA_STEP_CNT(2,2);
    BETA_STEP_CNT(3,3);
    norm = _mm_shuffle_epi8(beta_k, shuf_norm); 
    beta_k = _mm_sub_epi16(beta_k, norm);
    gv = _mm_load_si128(gPtr);
    gPtr--;
    BETA_STEP_CNT(0,4);
    BETA_STEP_CNT(1,5);
    BETA_STEP_CNT(2,6);
    BETA_STEP_CNT(3,7);
    norm = _mm_shuffle_epi8(beta_k, shuf_norm); 
    beta_k = _mm_sub_epi16(beta_k, norm);
  }  
}
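
hMax() is used by BETA_STEP_CNT but is not shown in this listing. A plausible SSE2 definition (an assumption on my part, not the library's actual implementation) returning the horizontal maximum of the eight int16_t lanes:

static inline int16_t hMax(__m128i x)
{
	x = _mm_max_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));   /* fold the two 64-bit halves */
	x = _mm_max_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)));   /* fold the 32-bit halves */
	x = _mm_max_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1))); /* fold the last two lanes */
	return (int16_t)_mm_cvtsi128_si32(x);                                  /* max now sits in lane 0 */
}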
Example #20
0
// @return true iff the two pages differ; false otherwise.
// @note Uses SSE3, so you must compile with -msse3.
bool pagesDifferent (const void * b1, const void * b2) {

  enum { PAGE_SIZE = 4096 };

  // Make a mask, initially all 1's.
  register __m128i mask = _mm_setzero_si128();
  mask = _mm_cmpeq_epi32(mask, mask); 


  __m128i * buf1 = (__m128i *) b1;
  __m128i * buf2 = (__m128i *) b2;

  // Some vectorizing pragmata here; not sure if gcc implements them.

#pragma vector always
  for (int i = 0; i < PAGE_SIZE / sizeof(__m128i); i += 8) {
#pragma ivdep
#pragma vector aligned

    register __m128i xmm1, xmm2;

    // Unrolled loop for speed: we load two 128-bit chunks,
    // and logically AND in their comparison.
    // If the mask gets any zero bits, the bytes differ.
    xmm1 = _mm_load_si128 (&buf1[i]);
    xmm2 = _mm_load_si128 (&buf2[i]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+1]);
    xmm2 = _mm_load_si128 (&buf2[i+1]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+2]);
    xmm2 = _mm_load_si128 (&buf2[i+2]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+3]);
    xmm2 = _mm_load_si128 (&buf2[i+3]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+4]);
    xmm2 = _mm_load_si128 (&buf2[i+4]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+5]);
    xmm2 = _mm_load_si128 (&buf2[i+5]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+6]);
    xmm2 = _mm_load_si128 (&buf2[i+6]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    xmm1 = _mm_load_si128 (&buf1[i+7]);
    xmm2 = _mm_load_si128 (&buf2[i+7]);
    mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));

    // Save the mask to see whether we have found a difference or not.
    unsigned long long buf[128 / sizeof(unsigned long long) / 8]  __attribute__((aligned(16)));
    _mm_store_si128 ((__m128i *) &buf, mask);
    
    // IMPORTANT: make sure long long = 64bits!
    enum { VERIFY_LONGLONG_64 = 1 / (sizeof(long long) == 8) };

    // Now check the result.
    // Both buf[0] and buf[1] should be all ones.
    if ((buf[0] != (unsigned long long) -1) ||
	(buf[1] != (unsigned long long) -1)) {
      return true;
    }
  }

  // No differences found.
  return false;
}
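
A minimal, hypothetical usage sketch (names are mine, not from the source): both pages must be 16-byte aligned, since _mm_load_si128 faults on unaligned operands.

#include <string.h>

static char pageA[4096] __attribute__((aligned(16)));
static char pageB[4096] __attribute__((aligned(16)));

bool pages_differ_demo(void)
{
  memset(pageA, 0xAB, sizeof(pageA));
  memset(pageB, 0xAB, sizeof(pageB));
  pageB[1234] ^= 1;                       /* flip one bit */
  return pagesDifferent(pageA, pageB);    /* returns true */
}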
void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;

   __m128i dcdx2;
   __m128i dcdx3;
   
   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &unused);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);


   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);
      
      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}
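
The rasterizer above relies on helpers that are not part of this listing: transpose4_epi32 (a 4x4 dword transpose), SCALAR_EPI32 and mm_mullo_epi32. Plausible definitions of the latter two, modelled on common Mesa-style utility code (assumptions, not copied from the source): SCALAR_EPI32 broadcasts one 32-bit lane, and mm_mullo_epi32 is a 32-bit low multiply with an SSE2 fallback built from _mm_mul_epu32 (the SSE4.1 path needs <smmintrin.h>).

#define SCALAR_EPI32(m, i)  _mm_shuffle_epi32((m), _MM_SHUFFLE((i), (i), (i), (i)))

static inline __m128i mm_mullo_epi32_sketch(__m128i a, __m128i b)
{
#ifdef __SSE4_1__
   return _mm_mullo_epi32(a, b);
#else
   __m128i lo   = _mm_mul_epu32(a, b);                         /* 64-bit products of lanes 0 and 2 */
   __m128i hi   = _mm_mul_epu32(_mm_srli_epi64(a, 32),
                                _mm_srli_epi64(b, 32));        /* 64-bit products of lanes 1 and 3 */
   __m128i mask = _mm_setr_epi32(-1, 0, -1, 0);                /* keep the low dword of each qword */
   return _mm_or_si128(_mm_and_si128(lo, mask),
                       _mm_slli_epi64(hi, 32));
#endif
}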
Example #22
0
ConnectedComponent::ConnectedComponent(
		std::array<char, 8> value,
		boost::shared_ptr<pixel_list_type> pixelList,
		pixel_list_type::const_iterator begin,
		pixel_list_type::const_iterator end) :

	_pixels(pixelList),
	_value(value),
	_boundingBox(0, 0, 0, 0),
	_center(0, 0),
	_centerDirty(true),
	_pixelRange(begin, end),
	_bitmapDirty(true) {

#ifdef __SSE4_1__

	// if there is at least one pixel
	if (begin != end) {

		unsigned int*__restrict__ pixels    = (unsigned int*)&*begin;
		unsigned int*__restrict__ pixelsEnd = (unsigned int*)&*end;

		// Prepare aligned, packed integer values.
		typedef union {
			__m128i v;
			unsigned int a[4];
		} xmm_uints;

		enum {X1, Y1, X2, Y2};

		__attribute__((aligned(16))) xmm_uints mins1;
		__attribute__((aligned(16))) xmm_uints maxs1;

		mins1.a[X1] = begin->x();
		maxs1.a[X1] = begin->x();
		mins1.a[Y1] = begin->y();
		maxs1.a[Y1] = begin->y();

		// Iterate through pixelList until 16-byte alignment is reached.
		while (((std::uintptr_t) pixels % 16) != 0 && pixels < pixelsEnd) {

			unsigned int x = pixels[X1];
			unsigned int y = pixels[Y1];

			mins1.a[X1] = std::min(mins1.a[X1], x);
			mins1.a[Y1] = std::min(mins1.a[Y1], y);
			maxs1.a[X1] = std::max(maxs1.a[X1], x);
			maxs1.a[Y1] = std::max(maxs1.a[Y1], y);

			pixels += 2;
		}

		// Guaranteed to have at least 8 XMM registers, so use 4 for cumulative
		// values and 2 for vector values. (Using 8+4 of 16 registers on 64-bit
		// arch yields no performance improvement.)
		mins1.a[X2] = mins1.a[X1];
		mins1.a[Y2] = mins1.a[Y1];
		maxs1.a[X2] = maxs1.a[X1];
		maxs1.a[Y2] = maxs1.a[Y1];
		__m128i mins2 = mins1.v;
		__m128i maxs2 = maxs1.v;

		// Vectorized loop. Strides two packed integer vectors, each containing
		// both X and Y for two pixels.
		while (pixels < pixelsEnd - 8) {

			__m128i pixelPair1 = _mm_load_si128((__m128i*)pixels);
			__m128i pixelPair2 = _mm_load_si128((__m128i*)(pixels + 4));
			pixels += 8; // Hint compiler to iterate while loads stall.
			_mm_prefetch(pixels, _MM_HINT_T0);
			mins1.v = _mm_min_epu32(mins1.v, pixelPair1);
			maxs1.v = _mm_max_epu32(maxs1.v, pixelPair1);
			mins2   = _mm_min_epu32(mins2,   pixelPair2);
			maxs2   = _mm_max_epu32(maxs2,   pixelPair2);
		}

		// Combine stride results.
		mins1.v = _mm_min_epu32(mins1.v, mins2);
		maxs1.v = _mm_max_epu32(maxs1.v, maxs2);

		// Iterate through any remaining pixels.
		while (pixels < pixelsEnd) {

			unsigned int x = pixels[X1];
			unsigned int y = pixels[Y1];

			mins1.a[X1] = std::min(mins1.a[X1], x);
			mins1.a[Y1] = std::min(mins1.a[Y1], y);
			maxs1.a[X1] = std::max(maxs1.a[X1], x);
			maxs1.a[Y1] = std::max(maxs1.a[Y1], y);

			pixels += 2;
		}

		// Readout packed vectors, compare with remaining results, and store.
		_boundingBox.min().x() = (int)std::min(mins1.a[X1], mins1.a[X2]);
		_boundingBox.min().y() = (int)std::min(mins1.a[Y1], mins1.a[Y2]);
		_boundingBox.max().x() = (int)std::max(maxs1.a[X1], maxs1.a[X2]) + 1;
		_boundingBox.max().y() = (int)std::max(maxs1.a[Y1], maxs1.a[Y2]) + 1;
	}
Example #23
0
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep,
		BYTE *pDst, int dstStep, const prim_size_t *roi)
{
	int lastRow, lastCol;
	BYTE *UData,*VData,*YData;
	int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV;
	__m128i r0,r1,r2,r3,r4,r5,r6,r7;
	__m128i *buffer;
	
	/* last_line: if the last (U,V doubled) line should be skipped, set to 10B
	 * last_column: if it's the last column in a line, set to 10B (for handling line widths that are not a multiple of four) */

	buffer = _aligned_malloc(4 * 16, 16);
	
	YData = (BYTE*) pSrc[0];
	UData = (BYTE*) pSrc[1];
	VData = (BYTE*) pSrc[2];
	
	nWidth = roi->width;
	nHeight = roi->height;
	
	if ((lastCol = (nWidth & 3)))
	{
		switch (lastCol)
		{
			case 1:
				r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF);
				break;

			case 2:
				r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF);
				break;

			case 3:
				r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF);
				break;
		}

		_mm_store_si128(buffer+3,r7);
		lastCol = 1;
	}
	
	nWidth += 3;
	nWidth = nWidth >> 2;
	
	lastRow = nHeight & 1;
	nHeight++;
	nHeight = nHeight >> 1;
	
	VaddDst = (dstStep << 1) - (nWidth << 4);
	VaddY = (srcStep[0] << 1) - (nWidth << 2);
	VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC);
	VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC);
	
	while (nHeight-- > 0)
	{
		if (nHeight == 0)
			lastRow <<= 1;

		i = 0;
		
		do
		{
			if (!(i & 0x01))
			{
			/* Y-, U- and V-data are stored in different arrays.
			* We start with processing the U-data.
			*
			* At first we fetch four U-values from their array and shuffle them like this:
			*	0d0d 0c0c 0b0b 0a0a
			* We've done two things: converted the values to signed words and duplicated
			* each value, because two pixels always "share" the same U- (and V-) data */
				r0 = _mm_cvtsi32_si128(*(UINT32 *)UData);
				r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000);
				r0 = _mm_shuffle_epi8(r0,r5);
				
				UData += 4;
				
			/* then we subtract 128 from each value, so we get D */
				r3 = _mm_set_epi16(128,128,128,128,128,128,128,128);
				r0 = _mm_subs_epi16(r0,r3);
				
			/* we need to do two things with our D, so let's store it for later use */
				r2 = r0;
				
			/* now we can multiply our D by 48 and unpack it to xmm4:xmm0
			 * this is what we need to get the G data later on */
				r4 = r0;
				r7 = _mm_set_epi16(48,48,48,48,48,48,48,48);
				r0 = _mm_mullo_epi16(r0,r7);
				r4 = _mm_mulhi_epi16(r4,r7);
				r7 = r0;
				r0 = _mm_unpacklo_epi16(r0,r4);
				r4 = _mm_unpackhi_epi16(r7,r4);
				
			/* to get B data, we need to prepare a second value, D*475 */
				r1 = r2;
				r7 = _mm_set_epi16(475,475,475,475,475,475,475,475);
				r1 = _mm_mullo_epi16(r1,r7);
				r2 = _mm_mulhi_epi16(r2,r7);
				r7 = r1;
				r1 = _mm_unpacklo_epi16(r1,r2);
				r7 = _mm_unpackhi_epi16(r7,r2);
				
			/* so we got something like this: xmm7:xmm1
			 * this pair contains values for 16 pixels:
			 * aabbccdd
			 * aabbccdd, but we can only work on four pixels at once, so we need to save the upper values */
				_mm_store_si128(buffer+1,r7);
				
			/* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */
				r2 = _mm_cvtsi32_si128(*(UINT32 *)VData);
				r2 = _mm_shuffle_epi8(r2,r5);
				
				VData += 4;
				
				r2 = _mm_subs_epi16(r2,r3);
				
				r5 = r2;
				
			/* this is also known as E*403, we need it to convert R data */
				r3 = r2;
				r7 = _mm_set_epi16(403,403,403,403,403,403,403,403);
				r2 = _mm_mullo_epi16(r2,r7);
				r3 = _mm_mulhi_epi16(r3,r7);
				r7 = r2;
				r2 = _mm_unpacklo_epi16(r2,r3);
				r7 = _mm_unpackhi_epi16(r7,r3);
				
			/* and preserve upper four values for future ... */
				_mm_store_si128(buffer+2,r7);
				
			/* doing this step: E*120 */
				r3 = r5;
				r7 = _mm_set_epi16(120,120,120,120,120,120,120,120);
				r3 = _mm_mullo_epi16(r3,r7);
				r5 = _mm_mulhi_epi16(r5,r7);
				r7 = r3;
				r3 = _mm_unpacklo_epi16(r3,r5);
				r7 = _mm_unpackhi_epi16(r7,r5);
				
			/* now we complete what we've begun above:
			 * (48*D) + (120*E) = (48*D +120*E) */
				r0 = _mm_add_epi32(r0,r3);
				r4 = _mm_add_epi32(r4,r7);
				
			/* and store to memory ! */
				_mm_store_si128(buffer,r4);
			}
			else
			{
			/* Maybe you've wondered about the conditional above?
			 * Well, we prepared UV data for eight pixels in each line, but can only process four
			 * per loop. So we need to load the upper four pixels' data from memory every second loop! */
				r1 = _mm_load_si128(buffer+1);
				r2 = _mm_load_si128(buffer+2);
				r0 = _mm_load_si128(buffer);
			}
			
			if (++i == nWidth)
				lastCol <<= 1;
			
		/* We didn't produce any output yet, so let's do so!
		 * OK, fetch four pixels from the Y-data array and shuffle them like this:
		 * 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 */
			r4 = _mm_cvtsi32_si128(*(UINT32 *)YData);
			r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
			r4 = _mm_shuffle_epi8(r4,r7);
			
			r5 = r4;
			r6 = r4;
			
		/* now we can perform the "real" conversion itself and produce output! */
			r4 = _mm_add_epi32(r4,r2);
			r5 = _mm_sub_epi32(r5,r0);
			r6 = _mm_add_epi32(r6,r1);
			
		/* In the end, we only need bytes for the RGB values.
		 * So, what do we do? Right! Shifting left makes values bigger, and that's always good.
		 * Before we had dwords of data, and by shifting left and treating the result
		 * as packed words, we not only get signed words, but also divide by 256.
		 * Imagine the data is now ordered this way: ddx0 ccx0 bbx0 aax0, where x is the least
		 * significant byte, which we don't need anymore because we've done some rounding */
			r4 = _mm_slli_epi32(r4,8);
			r5 = _mm_slli_epi32(r5,8);
			r6 = _mm_slli_epi32(r6,8);
			
		/* One thing we still have to face is the clip() function ...
		 * We still have signed words, and there are those min/max instructions in SSE2 ...
		 * The max instruction always takes the bigger of the two operands and stores it in the first one,
		 * and it operates with signs!
		 * If we feed it with our values and zeros, it takes the zeros if our values are smaller than
		 * zero, and our values otherwise */
			r7 = _mm_set_epi32(0,0,0,0);
			r4 = _mm_max_epi16(r4,r7);
			r5 = _mm_max_epi16(r5,r7);
			r6 = _mm_max_epi16(r6,r7);
			
		/* The same trick, just the other way round, can be used to limit our values to 255,
		 * this time using the min instruction and 255s */
			r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
			r4 = _mm_min_epi16(r4,r7);
			r5 = _mm_min_epi16(r5,r7);
			r6 = _mm_min_epi16(r6,r7);
			
		/* Now we've got our bytes.
		 * The moment has come to assemble the three channels R, G and B into the xrgb dwords:
		 * for the red channel we just have to AND each future dword with 00FF0000H */
			//r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
			r4 = _mm_and_si128(r4,r7);
			
		/* For the green channel we have to shuffle somehow, so we get something like this:
		 * 00d0 00c0 00b0 00a0 */
			r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
			r5 = _mm_shuffle_epi8(r5,r7);
			
		/* and for the blue channel this one:
		 * 000d 000c 000b 000a */
			r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
			r6 = _mm_shuffle_epi8(r6,r7);
			
		/* and at last we OR it all together and get this:
		 * xrgb xrgb xrgb xrgb */
			r4 = _mm_or_si128(r4,r5);
			r4 = _mm_or_si128(r4,r6);
			
		/* The only thing to do now is write the data to memory, but this gets a bit more
		 * complicated if the width is not a multiple of four and it is the last column in the line. */
			if (lastCol & 0x02)
			{
			/* Let's say we only need to convert six pixels in width.
			 * OK, the first 4 pixels will be converted just like every other 4 pixels, but
			 * if it's the last loop in the line, last_column is shifted left by one (curious? have a look above),
			 * and we land here. During initialisation a mask was prepared. In this case it looks like
			 * 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH */
				r6 = _mm_load_si128(buffer+3);
			/* we AND our output data with this mask to keep only the valid pixels */
				r4 = _mm_and_si128(r4,r6);
			/* then we fetch memory from the destination array ... */
				r5 = _mm_lddqu_si128((__m128i *)pDst);
			/* ... and AND it with the inverse mask. We get only those pixels which should not be updated */
				r6 = _mm_andnot_si128(r6,r5);
			/* we only have to OR the two values together and write the result back to the destination array,
			 * and only the pixels that should be updated really get changed. */
				r4 = _mm_or_si128(r4,r6);
			}
			_mm_storeu_si128((__m128i *)pDst,r4);
			
			if (!(lastRow & 0x02))
			{
			/* Because the UV data is the same for two lines, we can process the second line right here,
			 * in the same loop. The only thing we need to do is add some offsets to the Y- and destination
			 * pointers. These offsets are iStride[0] and the target scanline.
			 * But if we don't need to process the second line, e.g. if we are in the last line while processing nine lines,
			 * we just skip all this. */
				r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0]));
				r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080);
				r4 = _mm_shuffle_epi8(r4,r7);
				
				r5 = r4;
				r6 = r4;
				
				r4 = _mm_add_epi32(r4,r2);
				r5 = _mm_sub_epi32(r5,r0);
				r6 = _mm_add_epi32(r6,r1);
				
				r4 = _mm_slli_epi32(r4,8);
				r5 = _mm_slli_epi32(r5,8);
				r6 = _mm_slli_epi32(r6,8);
				
				r7 = _mm_set_epi32(0,0,0,0);
				r4 = _mm_max_epi16(r4,r7);
				r5 = _mm_max_epi16(r5,r7);
				r6 = _mm_max_epi16(r6,r7);
				
				r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
				r4 = _mm_min_epi16(r4,r7);
				r5 = _mm_min_epi16(r5,r7);
				r6 = _mm_min_epi16(r6,r7);
				
				r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000);
				r4 = _mm_and_si128(r4,r7);
				
				r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280);
				r5 = _mm_shuffle_epi8(r5,r7);
				
				r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002);
				r6 = _mm_shuffle_epi8(r6,r7);
				
				r4 = _mm_or_si128(r4,r5);
				r4 = _mm_or_si128(r4,r6);
				
				if (lastCol & 0x02)
				{
					r6 = _mm_load_si128(buffer+3);
					r4 = _mm_and_si128(r4,r6);
					r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep));
					r6 = _mm_andnot_si128(r6,r5);
					r4 = _mm_or_si128(r4,r6);
					
				/* only thing is, we should shift lastCol back here, because we have processed the last column,
				 * and this "special condition" can be released */
					lastCol >>= 1;
				}
				_mm_storeu_si128((__m128i *)(pDst+dstStep),r4);
			}
			
		/* finally we have to advance the destination and Y-data pointers by four pixels */
			pDst += 16;
			YData += 4;
		}
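
For reference, the fixed-point arithmetic performed above reduces to the following scalar form; the coefficients 403, 475, 48 and 120 are taken straight from the code, D = U - 128, E = V - 128, and the final >> 8 corresponds to the byte extraction after the 8-bit left shift. This is a sketch for illustration, not part of the original function.

static inline BYTE clip_to_u8(int v)
{
	return (BYTE)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar equivalent of one pixel of the conversion above (a sketch). */
static inline void yuv_to_xrgb_scalar(int Y, int U, int V, BYTE *r, BYTE *g, BYTE *b)
{
	int D = U - 128;
	int E = V - 128;

	*r = clip_to_u8((256 * Y           + 403 * E) >> 8);
	*g = clip_to_u8((256 * Y -  48 * D - 120 * E) >> 8);
	*b = clip_to_u8((256 * Y + 475 * D          ) >> 8);
}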
Example #24
0
FOLLY_DISABLE_SANITIZERS __m128i _mm_load_si128_nosan(__m128i const* const p) {
  return _mm_load_si128(p);
}
Example #25
0
inline void prep_dv(__m128i* idx, __m128i& v, __m128& n)
{
    v = _mm_load_si128(idx);
    n = _mm_cvtepi32_ps(v);
}
Example #26
0
static inline int blake512_compress( state * state, const u8 * datablock ) 
{

  __m128i row1l;
  __m128i row2l;
  __m128i row3l;
  __m128i row4l;
  u64 row1hl, row1hh;
  u64 row2hl, row2hh;
  u64 row3hl, row3hh;
  u64 row4hl, row4hh;

  const __m128i r16 = _mm_setr_epi8(2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9);
  const __m128i u8to64 = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7);

  union
  {
    __m128i u128[8];
    u64     u64[16];
  } m;

  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
  u64     u0, u1, u2, u3;
  __m128i b0;
  u64 b1l, b1h;

  m.u128[0] = _mm_loadu_si128((__m128i*)(datablock +   0));
  m.u128[1] = _mm_loadu_si128((__m128i*)(datablock +  16));
  m.u128[2] = _mm_loadu_si128((__m128i*)(datablock +  32));
  m.u128[3] = _mm_loadu_si128((__m128i*)(datablock +  48));
  m.u128[4] = _mm_loadu_si128((__m128i*)(datablock +  64));
  m.u128[5] = _mm_loadu_si128((__m128i*)(datablock +  80));
  m.u128[6] = _mm_loadu_si128((__m128i*)(datablock +  96));
  m.u128[7] = _mm_loadu_si128((__m128i*)(datablock + 112));

  m.u128[0] = BSWAP64(m.u128[0]);
  m.u128[1] = BSWAP64(m.u128[1]);
  m.u128[2] = BSWAP64(m.u128[2]);
  m.u128[3] = BSWAP64(m.u128[3]);
  m.u128[4] = BSWAP64(m.u128[4]);
  m.u128[5] = BSWAP64(m.u128[5]);
  m.u128[6] = BSWAP64(m.u128[6]);
  m.u128[7] = BSWAP64(m.u128[7]);

  row1l = _mm_load_si128((__m128i*)&state->h[0]);
  row1hl = state->h[2];
  row1hh = state->h[3];

  row2l = _mm_load_si128((__m128i*)&state->h[4]);
  row2hl = state->h[6];
  row2hh = state->h[7];

  row3l = _mm_set_epi64x(0x13198A2E03707344ULL, 0x243F6A8885A308D3ULL);
  row3hl = 0xA4093822299F31D0ULL;
  row3hh = 0x082EFA98EC4E6C89ULL;

  row4l = _mm_set_epi64x(0xBE5466CF34E90C6CULL, 0x452821E638D01377ULL);
  row4hl = 0xC0AC29B7C97C50DDULL;
  row4hh = 0x3F84D5B5B5470917ULL;

  if(!state->nullt)
  {
  	row4l = _mm_xor_si128(row4l, _mm_set1_epi64x(state->t[0]));
    row4hl ^= state->t[1];
    row4hh ^= state->t[1];
  }

  ROUND( 0);
  ROUND( 1);
  ROUND( 2);
  ROUND( 3);
  ROUND( 4);
  ROUND( 5);
  ROUND( 6);
  ROUND( 7);
  ROUND( 8);
  ROUND( 9);
  ROUND(10);
  ROUND(11);
  ROUND(12);
  ROUND(13);
  ROUND(14);
  ROUND(15);

  row1l = _mm_xor_si128(row3l,row1l);
  row1hl ^= row3hl;
  row1hh ^= row3hh;

  _mm_store_si128((__m128i*)&state->h[0], _mm_xor_si128(row1l, _mm_load_si128((__m128i*)&state->h[0])));
  state->h[2] ^= row1hl;
  state->h[3] ^= row1hh;

  row2l = _mm_xor_si128(row4l,row2l);
  row2hl ^= row4hl;
  row2hh ^= row4hh;

  _mm_store_si128((__m128i*)&state->h[4], _mm_xor_si128(row2l, _mm_load_si128((__m128i*)&state->h[4])));
  state->h[6] ^= row2hl;
  state->h[7] ^= row2hh;
  
  return 0;
}
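
BSWAP64() and ROUND() are defined elsewhere in the original source; ROUND() is presumably the BLAKE-512 G-function schedule, analogous to the round() macro shown earlier in this collection. Given the u8to64 shuffle constant declared above, a plausible definition of BSWAP64 (an assumption on my part) is a single SSSE3 byte permutation that reverses the bytes of each 64-bit half:

/* Assumed definition: byte-swap each 64-bit lane using the u8to64 constant
 * that is already in scope inside blake512_compress(). */
#define BSWAP64(x) _mm_shuffle_epi8((x), u8to64)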
Example #27
0
void fb_slvn_low(dig_t *c, const dig_t *a) {
	int i;
	dig_t *p, u0, u1, u2, u3;
	void *tab = fb_poly_get_slv();
	__m128i m0, m1, m2, m3, m4, sqrt0, sqrt1, mask0, mask1, mask2, r0, r1, t0, t1, perm;

	perm = _mm_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200);
	mask2 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
	mask1 = _mm_set_epi32(0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0);
	mask0 = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);
	sqrt0 = _mm_set_epi32(0x03020302, 0x01000100, 0x03020302, 0x01000100);
	sqrt1 = _mm_set_epi32(0x0c080c08, 0x04000400, 0x0c080c08, 0x04000400);

	t0 = _mm_load_si128((__m128i *)a);
	t1 = _mm_load_si128((__m128i *)(a + 2));
	r0 = r1 = _mm_setzero_si128();

	m0 = _mm_shuffle_epi8(t1, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);

	m2 = _mm_slli_si128(m1, 8);
	m1 = _mm_and_si128(m1, mask2);
	m1 = _mm_slli_epi64(m1, 4);
	m1 = _mm_xor_si128(m1, m2);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m0 = _mm_and_si128(t0, mask2);
	m0 = _mm_shuffle_epi8(m0, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);

	m2 = _mm_srli_si128(m1, 8);
	m1 = _mm_andnot_si128(mask2, m1);
	m2 = _mm_slli_epi64(m2, 4);
	m1 = _mm_xor_si128(m1, m2);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_si128(t0, 4);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFFFFFF));
	m0 = _mm_shuffle_epi8(m1, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);
	m2 = _mm_slli_si128(m1, 8);
	m1 = _mm_slli_epi64(m1, 4);
	m1 = _mm_xor_si128(m1, m2);
	m1 = _mm_srli_si128(m1, 6);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_si128(t0, 2);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFF));
	m0 = _mm_shuffle_epi8(m1, perm);
	m1 = _mm_and_si128(m0, mask0);
	m2 = _mm_and_si128(m0, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m1 = _mm_xor_si128(m1, m2);
	m2 = _mm_slli_si128(m1, 8);
	m1 = _mm_slli_epi64(m1, 4);
	m1 = _mm_xor_si128(m1, m2);
	m1 = _mm_srli_si128(m1, 7);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_si128(t0, 1);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x55));
	m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1));
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x33));
	m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 2));
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x0F));
	m1 = _mm_slli_epi64(m1, 4);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_epi64(t0, 4);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x5));
	m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1));
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x3));
	m1 = _mm_slli_epi64(m1, 2);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	m1 = _mm_srli_epi64(t0, 2);
	m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x1));
	m1 = _mm_slli_epi64(m1, 1);
	t0 = _mm_xor_si128(t0, m1);
	r0 = _mm_xor_si128(r0, m1);

	sqrt0 = _mm_set_epi32(0x03030202, 0x03030202, 0x01010000, 0x01010000);
	sqrt1 = _mm_set_epi32(0x0C0C0808, 0x0C0C0808, 0x04040000, 0x04040000);

	m1 = _mm_and_si128(t0, mask0);
	m2 = _mm_and_si128(t0, mask1);
	m3 = _mm_and_si128(t1, mask0);
	m4 = _mm_and_si128(t1, mask1);
	m2 = _mm_srli_epi64(m2, 4);
	m4 = _mm_srli_epi64(m4, 4);
	m2 = _mm_shuffle_epi8(sqrt1, m2);
	m1 = _mm_shuffle_epi8(sqrt0, m1);
	m4 = _mm_shuffle_epi8(sqrt1, m4);
	m3 = _mm_shuffle_epi8(sqrt0, m3);
	m1 = _mm_or_si128(m1, m2);
	m3 = _mm_or_si128(m3, m4);
#ifndef __PCLMUL__
	align dig_t x[2];
	_mm_store_si128((__m128i *)x, m1);
	u0 = x[0];
	u1 = x[1];
	_mm_store_si128((__m128i *)x, m3);
	u2 = x[0];
	u3 = x[1];
#else
	u0 = _mm_extract_epi64(m1, 0);
	u1 = _mm_extract_epi64(m1, 1);
	u2 = _mm_extract_epi64(m3, 0);
	u3 = _mm_extract_epi64(m3, 1);
#endif

	for (i = 0; i < 8; i++) {
		p = (dig_t *)(tab + (16 * i + (u0 & 0x0F)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u0 >>= 8;
		p = (dig_t *)(tab + (16 * (i + 8) + (u1 & 0x0F)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u1 >>= 8;
		p = (dig_t *)(tab + (16 * (i + 16) + (u2 & 0x0F)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u2 >>= 8;
		p = (dig_t *)(tab + (16 * (i + 24) + (u3 & 0xF)) * sizeof(fb_st));
		r0 = _mm_xor_si128(r0, *(__m128i *)(p));
		r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2));
		u3 >>= 8;
	}

	_mm_store_si128((__m128i *)c, r0);
	_mm_store_si128((__m128i *)(c + 2), r1);
}
Example #28
0
void av1_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
                          int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; ++i) {
    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
  }

  u0 = _mm_setzero_si128();
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  in1 = _mm_srai_epi32(in1, 3);
  store_output(&in1, output);
}
Example #29
0
static void rfx_dwt_2d_decode_block_vert_sse2(INT16* l, INT16* h, INT16* dst, int subband_width)
{
	int x, n;
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;
	__m128i l_n;
	__m128i h_n;
	__m128i tmp_n;
	__m128i h_n_m;
	__m128i dst_n;
	__m128i dst_n_m;
	__m128i dst_n_p;
	
	int total_width = subband_width + subband_width;

	/* Even coefficients */
	for (n = 0; n < subband_width; n++)
	{
		for (x = 0; x < total_width; x+=8)
		{
			/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
			
			l_n = _mm_load_si128((__m128i*) l_ptr);
			h_n = _mm_load_si128((__m128i*) h_ptr);
			
			tmp_n = _mm_add_epi16(h_n, _mm_set1_epi16(1));
			if (n == 0)
				tmp_n = _mm_add_epi16(tmp_n, h_n);
			else
			{
				h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - total_width));
				tmp_n = _mm_add_epi16(tmp_n, h_n_m);
			}
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			
			dst_n = _mm_sub_epi16(l_n, tmp_n);
			_mm_store_si128((__m128i*) dst_ptr, dst_n);
			
			l_ptr+=8;
			h_ptr+=8;
			dst_ptr+=8;
		}
		dst_ptr+=total_width;
	}
	
	h_ptr = h;
	dst_ptr = dst + total_width;
	
	/* Odd coefficients */
	for (n = 0; n < subband_width; n++)
	{
		for (x = 0; x < total_width; x+=8)
		{
			/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
			
			h_n = _mm_load_si128((__m128i*) h_ptr);
			dst_n_m = _mm_load_si128((__m128i*) (dst_ptr - total_width));
			h_n = _mm_slli_epi16(h_n, 1);
			
			tmp_n = dst_n_m;
			if (n == subband_width - 1)
				tmp_n = _mm_add_epi16(tmp_n, dst_n_m);
			else
			{
				dst_n_p = _mm_loadu_si128((__m128i*) (dst_ptr + total_width));
				tmp_n = _mm_add_epi16(tmp_n, dst_n_p);
			}
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			
			dst_n = _mm_add_epi16(tmp_n, h_n);
			_mm_store_si128((__m128i*) dst_ptr, dst_n);

			h_ptr+=8;
			dst_ptr+=8;
		}
		dst_ptr+=total_width;
	}
}
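
The lifting formulas quoted in the comments translate directly into scalar code. A sketch for a single column (a hypothetical helper, not part of the original file), useful for checking the boundary mirroring at n = 0 and n = subband_width - 1:

static void idwt_vert_column_scalar(const INT16* l, const INT16* h, INT16* dst, int subband_width)
{
	int n;

	/* Even rows: dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1), mirroring h at n = 0. */
	for (n = 0; n < subband_width; n++)
	{
		INT16 h_prev = (n == 0) ? h[n] : h[n - 1];
		dst[2 * n] = (INT16)(l[n] - ((h_prev + h[n] + 1) >> 1));
	}

	/* Odd rows: dst[2n+1] = (h[n] << 1) + ((dst[2n] + dst[2n+2]) >> 1), mirroring at the end. */
	for (n = 0; n < subband_width; n++)
	{
		INT16 d_next = (n == subband_width - 1) ? dst[2 * n] : dst[2 * n + 2];
		dst[2 * n + 1] = (INT16)((h[n] << 1) + ((dst[2 * n] + d_next) >> 1));
	}
}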
Example #30
0
void
png_read_filter_row_avg3_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_size_t i;
   png_bytep rp = row;
   png_const_bytep prp = prev_row;
   __m128i nrb = _mm_load_si128((__m128i*)(rp));
   __m128i pixel = _mm_setzero_si128();
   const __m128i mask = _mm_set1_epi8(0x01);

   for (i = 0; i < row_info->rowbytes; i += 15, rp += 15, prp += 15)
   {
#ifndef __SSSE3__
      __m128i prb = _mm_loadu_si128((__m128i*)prp);
#else
      __m128i prb = _mm_lddqu_si128((__m128i*)prp);
#endif
      __m128i rb = nrb;

      // First pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 3);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 3);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13));
#else
      rb = _mm_alignr_epi8(pixel, rb, 3);
#endif

      // Second pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 3);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 3);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13));
#else
      rb = _mm_alignr_epi8(pixel, rb, 3);
#endif

      // Third pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 3);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 3);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13));
#else
      rb = _mm_alignr_epi8(pixel, rb, 3);
#endif

      // Fourth pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 3);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 3);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13));
#else
      rb = _mm_alignr_epi8(pixel, rb, 3);
#endif

      // Fifth pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
#ifndef __SSSE3__
      nrb = _mm_loadu_si128((__m128i*)(rp + 15));
      rb = _mm_srli_si128(rb, 3);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13));
#else
      nrb = _mm_lddqu_si128((__m128i*)(rp + 15));
      rb = _mm_alignr_epi8(pixel, rb, 3);
#endif

      rb = _mm_srli_si128(rb, 1);
      _mm_storeu_si128((__m128i*)rp, rb);
   }
}
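
calculate_pixel_avg() is not included in this listing. A plausible definition (an assumption, not the actual implementation): the PNG Average filter needs Raw(x) + floor((left + above) / 2), and since _mm_avg_epu8 rounds up, the (left ^ above) & 1 correction with the 0x01 mask turns it back into a floored average before the raw bytes are added.

static __m128i calculate_pixel_avg(__m128i rb, __m128i prb, __m128i pixel,
   __m128i mask)
{
   __m128i correction = _mm_and_si128(_mm_xor_si128(pixel, prb), mask); /* 1 where the sum is odd */
   pixel = _mm_avg_epu8(pixel, prb);        /* rounded-up average of left and above */
   pixel = _mm_sub_epi8(pixel, correction); /* fix the rounding: floored average */
   pixel = _mm_add_epi8(pixel, rb);         /* add the filtered (raw) bytes, wrapping mod 256 */
   return pixel;
}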