Example #1
void sub_ssememcpy(void* _Dst, const void* _Src, size_t size)
{
	assert(IS_16BYTE_ALIGNMENT(_Dst));
	assert(IS_16BYTE_ALIGNMENT(_Src));

	float* dst = (float*)_Dst;
	float* src = (float*)_Src;
	int loop_num = size >> 6;
	for (int i = 0; i < loop_num; i++) {
		//load 64byte data
		__m128 xmm0 = _mm_load_ps(src + 0);
		__m128 xmm1 = _mm_load_ps(src + 4);
		__m128 xmm2 = _mm_load_ps(src + 8);
		__m128 xmm3 = _mm_load_ps(src + 12);
		//store 64byte data
		//_mm_store_ps(dst + 0, xmm0);
		//_mm_store_ps(dst + 4, xmm1);
		//_mm_store_ps(dst + 8, xmm2);
		//_mm_store_ps(dst + 12, xmm3);
		_mm_stream_si128((__m128i*)(dst + 0), _mm_castps_si128(xmm0));
		_mm_stream_si128((__m128i*)(dst + 4), _mm_castps_si128(xmm1));
		_mm_stream_si128((__m128i*)(dst + 8), _mm_castps_si128(xmm2));
		_mm_stream_si128((__m128i*)(dst + 12), _mm_castps_si128(xmm3));

		dst += 16;
		src += 16;
	}

	memcpy(dst, src, size & 0x3F);
}
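Example #1 issues only non-temporal stores and never fences, so the copied data is not guaranteed to be visible to other cores when the function returns. A minimal caller-side sketch (hypothetical copy_frame wrapper, assuming <emmintrin.h> and the same 16-byte alignment the asserts above require):

#include <emmintrin.h>

// Hypothetical wrapper: fence after the streaming copy so the write-combining
// buffers are flushed before anyone else reads dst.
static void copy_frame(void* dst, const void* src, size_t size)
{
	sub_ssememcpy(dst, src, size);
	_mm_sfence();
}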
Example #2
	void convert_le_f32_to_be_d24(void *dst, void *src, u32 row_length_in_texels, u32 num_rows)
	{
		const u32 num_pixels = row_length_in_texels * num_rows;
		verify(HERE), (num_pixels & 3) == 0;

		const auto num_iterations = (num_pixels >> 2);

		__m128i* dst_ptr = (__m128i*)dst;
		__m128i* src_ptr = (__m128i*)src;

		const __m128 scale_vector = _mm_set1_ps(16777214.f);

#if defined (_MSC_VER) || defined (__SSSE3__)
		if (LIKELY(utils::has_ssse3()))
		{
			const __m128i swap_mask = _mm_set_epi8
			(
				0xF, 0xC, 0xD, 0xE,
				0xB, 0x8, 0x9, 0xA,
				0x7, 0x4, 0x5, 0x6,
				0x3, 0x0, 0x1, 0x2
			);

			for (u32 n = 0; n < num_iterations; ++n)
			{
				const __m128i src_vector = _mm_loadu_si128(src_ptr);
				const __m128i result = _mm_cvtps_epi32(_mm_mul_ps((__m128&)src_vector, scale_vector));
				const __m128i shuffled_vector = _mm_shuffle_epi8(result, swap_mask);
				_mm_stream_si128(dst_ptr, shuffled_vector);
				++dst_ptr;
				++src_ptr;
			}

			return;
		}
#endif

		const __m128i mask1 = _mm_set1_epi32(0xFF00FF00);
		const __m128i mask2 = _mm_set1_epi32(0x00FF0000);
		const __m128i mask3 = _mm_set1_epi32(0x000000FF);

		for (u32 n = 0; n < num_iterations; ++n)
		{
			const __m128i src_vector = _mm_loadu_si128(src_ptr);
			const __m128i result = _mm_cvtps_epi32(_mm_mul_ps((__m128&)src_vector, scale_vector));

			const __m128i v1 = _mm_and_si128(result, mask1);
			const __m128i v2 = _mm_and_si128(_mm_slli_epi32(result, 16), mask2);
			const __m128i v3 = _mm_and_si128(_mm_srli_epi32(result, 16), mask3);
			const __m128i shuffled_vector = _mm_or_si128(_mm_or_si128(v1, v2), v3);

			_mm_stream_si128(dst_ptr, shuffled_vector);
			++dst_ptr;
			++src_ptr;
		}
	}
Example #3
void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
	assert_16_byte_aligned( dst );
	assert_16_byte_aligned( src );

	int i = 0;
	for ( ; i + 128 <= numBytes; i += 128 ) {
		__m128i d0 = _mm_load_si128( (__m128i *)&src[i + 0*16] );
		__m128i d1 = _mm_load_si128( (__m128i *)&src[i + 1*16] );
		__m128i d2 = _mm_load_si128( (__m128i *)&src[i + 2*16] );
		__m128i d3 = _mm_load_si128( (__m128i *)&src[i + 3*16] );
		__m128i d4 = _mm_load_si128( (__m128i *)&src[i + 4*16] );
		__m128i d5 = _mm_load_si128( (__m128i *)&src[i + 5*16] );
		__m128i d6 = _mm_load_si128( (__m128i *)&src[i + 6*16] );
		__m128i d7 = _mm_load_si128( (__m128i *)&src[i + 7*16] );
		_mm_stream_si128( (__m128i *)&dst[i + 0*16], d0 );
		_mm_stream_si128( (__m128i *)&dst[i + 1*16], d1 );
		_mm_stream_si128( (__m128i *)&dst[i + 2*16], d2 );
		_mm_stream_si128( (__m128i *)&dst[i + 3*16], d3 );
		_mm_stream_si128( (__m128i *)&dst[i + 4*16], d4 );
		_mm_stream_si128( (__m128i *)&dst[i + 5*16], d5 );
		_mm_stream_si128( (__m128i *)&dst[i + 6*16], d6 );
		_mm_stream_si128( (__m128i *)&dst[i + 7*16], d7 );
	}
	for ( ; i + 16 <= numBytes; i += 16 ) {
		__m128i d = _mm_load_si128( (__m128i *)&src[i] );
		_mm_stream_si128( (__m128i *)&dst[i], d );
	}
	for ( ; i + 4 <= numBytes; i += 4 ) {
		*(uint32 *)&dst[i] = *(const uint32 *)&src[i];
	}
	for ( ; i < numBytes; i++ ) {
		dst[i] = src[i];
	}
	_mm_sfence();
}
Example #4
void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
{
	size_t i = 0;
	
#ifdef ENABLE_SSE2
	const __m128i attrDepth_vec128				= _mm_set1_epi32(attr.depth);
	const __m128i attrOpaquePolyID_vec128		= _mm_set1_epi8(attr.opaquePolyID);
	const __m128i attrTranslucentPolyID_vec128	= _mm_set1_epi8(attr.translucentPolyID);
	const __m128i attrStencil_vec128			= _mm_set1_epi8(attr.stencil);
	const __m128i attrIsFogged_vec128			= _mm_set1_epi8(attr.isFogged);
	const __m128i attrIsTranslucentPoly_vec128	= _mm_set1_epi8(attr.isTranslucentPoly);
	
	const size_t sseCount = count - (count % 16);
	for (; i < sseCount; i += 16)
	{
		// advance by the loop index so each iteration fills the next 16 fragments
		_mm_stream_si128((__m128i *)(this->depth + i +  0), attrDepth_vec128);
		_mm_stream_si128((__m128i *)(this->depth + i +  4), attrDepth_vec128);
		_mm_stream_si128((__m128i *)(this->depth + i +  8), attrDepth_vec128);
		_mm_stream_si128((__m128i *)(this->depth + i + 12), attrDepth_vec128);
		
		_mm_stream_si128((__m128i *)(this->opaquePolyID + i), attrOpaquePolyID_vec128);
		_mm_stream_si128((__m128i *)(this->translucentPolyID + i), attrTranslucentPolyID_vec128);
		_mm_stream_si128((__m128i *)(this->stencil + i), attrStencil_vec128);
		_mm_stream_si128((__m128i *)(this->isFogged + i), attrIsFogged_vec128);
		_mm_stream_si128((__m128i *)(this->isTranslucentPoly + i), attrIsTranslucentPoly_vec128);
	}
#endif
	
	for (; i < count; i++)
	{
		this->SetAtIndex(i, attr);
	}
}
Example #5
/*
=====================
R_CopyDecalSurface
=====================
*/
static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes,
									const decal_t * decal, const float fadeColor[4] ) {
	assert_16_byte_aligned( &verts[numVerts] );
	assert_16_byte_aligned( &indexes[numIndexes] );
	assert_16_byte_aligned( decal->indexes );
	assert_16_byte_aligned( decal->verts );
	assert( ( ( decal->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
	assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
	assert_16_byte_aligned( fadeColor );


	const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
	const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
	const __m128 vector_fade_color = _mm_load_ps( fadeColor );
	const __m128i vector_color_mask = _mm_set_epi32( 0, -1, 0, 0 );

	// copy vertices and apply depth/time based fading
	assert_offsetof( idDrawVert, color, 6 * 4 );
	for ( int i = 0; i < decal->numVerts; i++ ) {
		const idDrawVert &srcVert = decal->verts[i];
		idDrawVert &dstVert = verts[numVerts + i];

		__m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert +  0 ) );
		__m128i v1 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 16 ) );
		__m128 depthFade = _mm_splat_ps( _mm_load_ss( decal->vertDepthFade + i ), 0 );

		__m128 timeDepthFade = _mm_mul_ps( depthFade, vector_fade_color );
		__m128i colorInt = _mm_cvtps_epi32( timeDepthFade );
		__m128i colorShort = _mm_packs_epi32( colorInt, colorInt );
		__m128i colorByte = _mm_packus_epi16( colorShort, colorShort );
		v1 = _mm_or_si128( v1, _mm_and_si128( colorByte, vector_color_mask ) );

		_mm_stream_si128( (__m128i *)( (byte *)&dstVert +  0 ), v0 );
		_mm_stream_si128( (__m128i *)( (byte *)&dstVert + 16 ), v1 );
	}

	// copy indexes
	assert( ( decal->numIndexes & 7 ) == 0 );
	assert( sizeof( triIndex_t ) == 2 );
	for ( int i = 0; i < decal->numIndexes; i += 8 ) {
		__m128i vi = _mm_load_si128( (const __m128i *)&decal->indexes[i] );

		vi = _mm_add_epi16( vi, vector_short_num_verts );

		_mm_stream_si128( (__m128i *)&indexes[numIndexes + i], vi );
	}

	_mm_sfence();

}
Example #6
void simd_memcpy(void *dst, void *src, size_t nbytes)
{
    size_t i;

    size_t ilen = nbytes/sizeof(int);
    size_t ilen_sm = ilen - ilen%16;

    char *cdst=(char*)dst;
    char *csrc=(char*)src;

    int * idst=(int*)dst;
    int * isrc=(int*)src;

    __m128i l0,l1,l2,l3;

    _mm_prefetch((const char*)&isrc[0], _MM_HINT_NTA);
    _mm_prefetch((const char*)&isrc[4], _MM_HINT_NTA);
    _mm_prefetch((const char*)&isrc[8], _MM_HINT_NTA);
    _mm_prefetch((const char*)&isrc[12], _MM_HINT_NTA);

    for(i=0; i<ilen_sm; i+=16)
    {
        l0 =  _mm_load_si128((__m128i*)&isrc[i+0]);
        l1 =  _mm_load_si128((__m128i*)&isrc[i+4]);
        l2 =  _mm_load_si128((__m128i*)&isrc[i+8]);
        l3 =  _mm_load_si128((__m128i*)&isrc[i+12]);

        _mm_prefetch((const char*)&isrc[i+16], _MM_HINT_NTA);
        _mm_prefetch((const char*)&isrc[i+20], _MM_HINT_NTA);
        _mm_prefetch((const char*)&isrc[i+24], _MM_HINT_NTA);
        _mm_prefetch((const char*)&isrc[i+28], _MM_HINT_NTA);

        _mm_stream_si128((__m128i*)&idst[i+0],  l0);
        _mm_stream_si128((__m128i*)&idst[i+4],  l1);
        _mm_stream_si128((__m128i*)&idst[i+8],  l2);
        _mm_stream_si128((__m128i*)&idst[i+12], l3);

    }

    for(i=ilen_sm; i<ilen; i++)
    {
        idst[i] = isrc[i];
    }

    for(i=(4*ilen); i<nbytes; i++)
    {
        cdst[i] = csrc[i];
    }
}
Example #7
void __stdcall
planar_shader_to_rgb32_3_f16c(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* _buff) noexcept
{
    const uint8_t* sr = srcp[0];
    const uint8_t* sg = srcp[1];
    const uint8_t* sb = srcp[2];
    uint8_t* d = dstp[0] + (height - 1) * dpitch;

    float* bb = reinterpret_cast<float*>(_buff);
    float* bg = bb + ((width + 7) & ~7); // must be aligned 32 bytes
    float* br = bg + ((width + 7) & ~7); // must be aligned 32 bytes

    const __m128 coef = _mm_set1_ps(255.0f);
    const __m128i zero = _mm_setzero_si128();

    for (int y = 0; y < height; ++y) {
        convert_half_to_float(br, sr, width);
        convert_half_to_float(bg, sg, width);
        convert_half_to_float(bb, sb, width);
        for (int x = 0; x < width; x += 4) {
            __m128i b = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(bb + x)));
            __m128i g = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(bg + x)));
            __m128i r = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(br + x)));
            __m128i bgra = _mm_or_si128(b, _mm_slli_si128(g, 1));
            bgra = _mm_or_si128(bgra, _mm_slli_si128(r, 2));
            _mm_stream_si128(reinterpret_cast<__m128i*>(d + x * 4), bgra);
        }
        sr += spitch;
        sg += spitch;
        sb += spitch;
        d -= dpitch;
    }
}
Example #8
void unpack_rgb5a1_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	__m128i t0, t1, t2;
	Uint32 i;

	for (i = 0; i < (size / 8); i++)
	{
		t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);

		t0 = _mm_unpacklo_epi16(t0, t0);
		t1 = _mm_unpacklo_epi16(t0, t0);
		t1 = _mm_and_si128(t1, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00));
		t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002));
		t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260));
		t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));
		t2 = _mm_unpackhi_epi16(t0, t0);
		t2 = _mm_and_si128(t2, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00));
		t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002));
		t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260));
		t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));
		t1 = _mm_packus_epi16(t1, t2);

		_mm_stream_si128((__m128i*)&dest[i * 16], t1);
	}
}
Example #9
void* streamucpy(void* dest, const void* usrc, size_t n)
{
  char* dst       = (char*) dest;
  const char* src = (const char*) usrc;
  
  // copy up to 15 bytes until SSE-aligned
  while (((intptr_t) dst & (SSE_SIZE-1)) && n)
  {
    *dst++ = *src++; n--;
  }
  // copy SSE-aligned
  while (n >= SSE_SIZE)
  {
    __m128i data = _mm_loadu_si128((__m128i*) src);
    _mm_stream_si128((__m128i*) dst, data);
    
    dst  += SSE_SIZE;
    src += SSE_SIZE;
    
    n -= SSE_SIZE;
  }
  // copy remainder
  while (n--)
  {
    *dst++ = *src++;
  }
  return dst;
}
Example #10
bool opt_copy_stream_to_stream( x42memStream_t *dest, x42memStream_t *src,
	size_t elemSize, uint numElems, x42opts_t *opts )
{
	REF_PARAM( opts );
	
	if( (opts->caps & OPT_SSE2) &&
		elemSize <= sizeof( __m128 ) &&
		stream_is_aligned( src ) && stream_pad_ok( src, elemSize ) &&
		stream_is_aligned( dest ) && stream_pad_ok( dest, elemSize ) )
	{ 
		uint i; 

		size_t is = src->stride; 
		size_t os = dest->stride; 

		const __m128i * RESTRICT pi = (__m128i*)src->pStreamZero; 
		__m128i * RESTRICT po = (__m128i*)dest->pStreamZero; 

		for( i = 0; i < numElems; i++ ) 
		{ 
			__m128i v = _mm_load_si128( pi ); 
			_mm_stream_si128( po, v ); 

			pi = (__m128i*)((byte*)pi + is); 
			po = (__m128i*)((byte*)po + os); 
		} 

		_mm_sfence(); 

		return true;
	}
Example #11
void __stdcall
packed_shader_to_rgb32_3_f16c(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* _buff) noexcept
{
    const uint8_t* s = srcp[0] + (height - 1) * spitch;
    uint8_t* d = dstp[0];
    float* buff = reinterpret_cast<float*>(_buff);

    const __m128 coef = _mm_set1_ps(255.0f);
    const __m128i order = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);

    for (int y = 0; y < height; ++y) {
        convert_half_to_float(buff, s, width * 4);
        for (int x = 0; x < width; x += 4) {
            __m128i d0 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 0)));
            __m128i d1 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 4)));
            __m128i d2 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 8)));
            __m128i d3 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 12)));
            d0 = _mm_packus_epi16(_mm_packs_epi32(d0, d1), _mm_packs_epi32(d2, d3));
            _mm_stream_si128(reinterpret_cast<__m128i*>(d + 4 * x), _mm_shuffle_epi8(d0, order));
        }
        d += dpitch;
        s -= spitch;
    }
}
Example #12
void Lerp_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
	static const u32 PSD = 64;
	
	static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
	static const __m128i round = _mm_set1_epi16(128);

	assert(source1 != NULL && source2 != NULL && dest != NULL);
	assert(size % STRIDE == 0);
	assert(alpha >= 0.0 && alpha <= 1.0);

	const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
	const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
	__m128i* dest128 = reinterpret_cast<__m128i*>(dest);

	__m128i s = _mm_setzero_si128();
	__m128i d = _mm_setzero_si128();
	const __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));
	
	__m128i drb, dga, srb, sga;
	
	for (size_t k = 0, length = size/STRIDE; k < length; ++k)
	{		
		_mm_prefetch(reinterpret_cast<const char*>(source128_1 + PSD), _MM_HINT_NTA);	
		_mm_prefetch(reinterpret_cast<const char*>(source128_2 + PSD), _MM_HINT_NTA);
		// TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

		for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
		{
			// r = d + (s-d)*alpha/256
			s = _mm_load_si128(source128_1);	// AABBGGRR
			d = _mm_load_si128(source128_2);	// AABBGGRR

			srb = _mm_and_si128(lomask, s);		// 00BB00RR		// unpack
			sga = _mm_srli_epi16(s, 8);			// AA00GG00		// unpack
			
			drb = _mm_and_si128(lomask, d);		// 00BB00RR		// unpack
			dga = _mm_srli_epi16(d, 8);			// AA00GG00		// unpack

			srb = _mm_sub_epi16(srb, drb);		// BBBBRRRR		// sub
			srb = _mm_mullo_epi16(srb, a);		// BBBBRRRR		// mul
			srb = _mm_add_epi16(srb, round);
			
			sga = _mm_sub_epi16(sga, dga);		// AAAAGGGG		// sub
			sga = _mm_mullo_epi16(sga, a);		// AAAAGGGG		// mul
			sga = _mm_add_epi16(sga, round);

			srb = _mm_srli_epi16(srb, 8);		// 00BB00RR		// prepack and div
			sga = _mm_andnot_si128(lomask, sga);// AA00GG00		// prepack and div

			srb = _mm_or_si128(srb, sga);		// AABBGGRR		// pack

			srb = _mm_add_epi8(srb, d);			// AABBGGRR		// add		there is no overflow(R.N)

			_mm_stream_si128(dest128, srb);
		}
	}
	_mm_mfence();	//ensure last WC buffers get flushed to memory
}
Example #13
void test_mm_stream_si128(__m128i *A, __m128i B) {
  // DAG-LABEL: test_mm_stream_si128
  // DAG: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16, !nontemporal
  //
  // ASM-LABEL: test_mm_stream_si128
  // ASM: movntdq
  _mm_stream_si128(A, B);
}
Example #14
static __forceinline void
convert_float_to_half(uint8_t* dstp, const float* srcp, size_t count)
{
    for (size_t x = 0; x < count; x += 8) {
        __m256 s = _mm256_load_ps(srcp + x);
        __m128i d = _mm256_cvtps_ph(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        _mm_stream_si128(reinterpret_cast<__m128i*>(dstp + 2 * x), d);
    }
}
Example #15
void Shuffle_SSSE3(void* dest, const void* source, size_t size, const u8 red, const u8 green, const u8 blue, const u8 alpha)
{
	static const unsigned int PSD = 64;	

	assert(source != NULL && dest != NULL);
	assert(red > -1 && red < 4 && green > -1 && green < 4 && blue > -1 && blue < 4 && alpha > -1 && alpha < 4 && "Invalid mask");
	assert(size % STRIDE == 0);

	const __m128i* source128 = reinterpret_cast<const __m128i*>(source);
	__m128i*	   dest128 = reinterpret_cast<__m128i*>(dest);	

	__m128i reg0 = _mm_setzero_si128();	
	__m128i reg1 = _mm_setzero_si128();	
	__m128i reg2 = _mm_setzero_si128();	
	__m128i reg3 = _mm_setzero_si128();	

	const __m128i mask128 = _mm_set_epi8(alpha+12, blue+12, green+12, red+12, alpha+8, blue+8, green+8, red+8, alpha+4, blue+4, green+4, red+4, alpha, blue, green, red);

	for(size_t k = 0, length = size/STRIDE; k < length; ++k)	
	{
		// TODO: put prefetch between calculations?(R.N)
		_mm_prefetch(reinterpret_cast<const s8*>(source128 + PSD), _MM_HINT_NTA);

		// work on entire cacheline before next prefetch

		// TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

		reg0 = _mm_load_si128(source128++);	
		reg1 = _mm_load_si128(source128++);	

		_mm_stream_si128(dest128++, _mm_shuffle_epi8(reg0, mask128));

		reg2 = _mm_load_si128(source128++);	

		_mm_stream_si128(dest128++, _mm_shuffle_epi8(reg1, mask128));

		reg3 = _mm_load_si128(source128++);	
		
		_mm_stream_si128(dest128++, _mm_shuffle_epi8(reg2, mask128));	
		_mm_stream_si128(dest128++, _mm_shuffle_epi8(reg3, mask128));		
	}
	_mm_mfence();	//ensure last WC buffers get flushed to memory
}
Example #16
static inline char* stream_fill(char* dst, size_t* n, const __m128i data)
{
  while (*n >= SSE_SIZE)
  {
    _mm_stream_si128((__m128i*) dst, data);
    
    dst += SSE_SIZE;
    *n  -= SSE_SIZE;
  }
  return dst;
}
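Example #16 writes only whole SSE_SIZE chunks and assumes dst is already 16-byte aligned, as _mm_stream_si128 requires. A sketch of how such a helper might be driven as a streaming memset, mirroring the head/bulk/tail structure of Example #9 (hypothetical stream_memset, assuming SSE_SIZE is 16 and <emmintrin.h>):

static void* stream_memset(void* dest, int value, size_t n)
{
  char* dst = (char*) dest;
  // head: plain byte stores until dst reaches 16-byte alignment
  while (((intptr_t) dst & (SSE_SIZE-1)) && n)
  {
    *dst++ = (char) value; n--;
  }
  // bulk: non-temporal 16-byte stores via stream_fill
  const __m128i data = _mm_set1_epi8((char) value);
  dst = stream_fill(dst, &n, data);
  // tail: remaining bytes
  while (n--)
  {
    *dst++ = (char) value;
  }
  _mm_sfence(); // make the streaming stores visible to other cores
  return dest;
}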
Example #17
// TODO: (R.N) optimize => prefetch and cacheline loop unroll
void Clear_SSE2(void* dest, size_t size)
{
	__m128i val = _mm_setzero_si128();
	__m128i* ptr = reinterpret_cast<__m128i*>(dest);

	int times = size / 16;
	for(int i=0; i < times; ++i) {
		_mm_stream_si128(ptr, val);
		ptr++;
	}
}
Example #18
static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size)
{
	__m128i *dqdest = (__m128i*)dest;
	const __m128i *dqsrc  = (const __m128i*)src;
	const __m128i *end  = (const __m128i*)((const char*)src + size);

	do {
		_mm_prefetch(dqsrc + 4, _MM_HINT_NTA);

		__m128i xmm0 = _mm_load_si128(dqsrc + 0);
		__m128i xmm1 = _mm_load_si128(dqsrc + 1);
		__m128i xmm2 = _mm_load_si128(dqsrc + 2);
		__m128i xmm3 = _mm_load_si128(dqsrc + 3);
		dqsrc  += 4;
		_mm_stream_si128(dqdest + 0, xmm0);
		_mm_stream_si128(dqdest + 1, xmm1);
		_mm_stream_si128(dqdest + 2, xmm2);
		_mm_stream_si128(dqdest	+ 3, xmm3);
		dqdest += 4;
	} while (dqsrc != end);
}
Example #19
template <bool STACK16>
static inline void
planar_shader_to_yuv_3(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* _buff) noexcept
{
    const __m128 coef = _mm_set1_ps(STACK16 ? 65535.0f : 255.0f);
    const __m128i mask16 = _mm_set1_epi16(0x00FF);
    float* buff = reinterpret_cast<float*>(_buff);

    for (int p = 0; p < 3; ++p) {
        const uint8_t* s = srcp[p];
        uint8_t* d = dstp[p];
        uint8_t* lsb = d + height * dpitch;

        for (int y = 0; y < height; ++y) {
            convert_half_to_float(buff, s, width);
            for (int x = 0; x < width; x += 16) {
                __m128i s0 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + x + 0)));
                __m128i s1 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + x + 4)));
                __m128i s2 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + x + 8)));
                __m128i s3 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + x + 12)));
                s0 = _mm_packus_epi32(s0, s1);
                s1 = _mm_packus_epi32(s2, s3);
                if (!STACK16) {
                    s0 = _mm_packus_epi16(s0, s1);
                    _mm_stream_si128(reinterpret_cast<__m128i*>(d + x), s0);
                } else {
                    __m128i dm = _mm_packus_epi16(_mm_srli_epi16(s0, 8), _mm_srli_epi16(s1, 8));
                    __m128i dl = _mm_packus_epi16(_mm_and_si128(s0, mask16), _mm_and_si128(s1, mask16));
                    _mm_stream_si128(reinterpret_cast<__m128i*>(d + x), dm);
                    _mm_stream_si128(reinterpret_cast<__m128i*>(lsb + x), dl);
                }
            }
            s += spitch;
            d += dpitch;
            if (STACK16) {
                lsb += dpitch;
            }
        }
    }
}
Example #20
void unpack_a8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	__m128i t0;
	Uint32 i;

	for (i = 0; i < (size / 4); i++)
	{
		t0 = (__m128i)_mm_load_ss((float*)&source[i * 4]);

		t0 = _mm_unpacklo_epi8(_mm_setzero_si128(), t0);
		t0 = _mm_unpacklo_epi16(_mm_setzero_si128(), t0);

		_mm_stream_si128((__m128i*)&dest[i * 16], t0);
	}
}
Example #21
void unpack_l8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	__m128i t0;
	Uint32 i;

	for (i = 0; i < (size / 4); i++)
	{
		t0 = (__m128i)_mm_load_ss((float*)&source[i * 4]);

		t0 = _mm_unpacklo_epi8(t0, t0);
		t0 = _mm_unpacklo_epi16(t0, t0);
		t0 = _mm_or_si128(t0, _mm_set1_epi32(0xFF000000));

		_mm_stream_si128((__m128i*)&dest[i * 16], t0);
	}
}
Example #22
void unpack_la8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	__m128i t0, t1, t2;
	Uint32 i;

	for (i = 0; i < (size / 8); i++)
	{
		t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);

		t1 = _mm_unpacklo_epi8(t0, t0);
		t1 = _mm_and_si128(t1, _mm_set1_epi32(0x0000FFFF));
		t2 = _mm_unpacklo_epi16(_mm_setzero_si128(), t0);
		t1 = _mm_or_si128(t1, t2);

		_mm_stream_si128((__m128i*)&dest[i * 16], t1);
	}
}
Example #23
inline void modify2x264StyleDepth(uint16_t *lp,const int lshift)
{
	
	
	if(SIMDWIDTH==8)
	{
		auto xmm0 = _mm_loadu_si128((__m128i*)lp);
		auto xmm1 = _mm_slli_epi16(xmm0,lshift);
		_mm_stream_si128((__m128i*)lp,xmm1);
	}
	else
	{
		for(int i=0;i<SIMDWIDTH;i++)
		{
			lp[i] = lp[i] << lshift;
		}
	}
}
Example #24
void unpack_rgba8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	__m128i t0, t1, t2;
	Uint32 i;

	for (i = 0; i < (size / 16); i++)
	{
		t0 = _mm_load_si128((__m128i*)&source[i * 16]);

		t1 = _mm_and_si128(t0, _mm_set1_epi16(0x00FF));
		t2 = _mm_and_si128(t0, _mm_set1_epi16(0xFF00));
		t1 = _mm_shufflelo_epi16(t1, _MM_SHUFFLE(2, 3, 0, 1));
		t1 = _mm_shufflehi_epi16(t1, _MM_SHUFFLE(2, 3, 0, 1));
		t1 = _mm_or_si128(t1, t2);

		_mm_stream_si128((__m128i*)&dest[i * 16], t1);
	}
}
Example #25
void blend_sse2(const Uint8* alpha, const Uint32 size, const Uint8* source0,
	const Uint8* source1, Uint8* dest)
{
	__m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
	Uint32 i;

	for (i = 0; i < (size / 4); i++)
	{
		t0 = _mm_load_si128((__m128i*)&source0[i * 16]);
		t1 = _mm_load_si128((__m128i*)&source1[i * 16]);
		t2 = (__m128i)_mm_load_ss((float*)&alpha[i * 4]);

		t2 = _mm_unpacklo_epi8(t2, t2);
		t2 = _mm_unpacklo_epi16(t2, t2);

		t3 = _mm_unpacklo_epi8(t0, t0);
		t4 = _mm_unpacklo_epi8(t1, t1);

		t5 = _mm_unpacklo_epi32(t2, t2);
		t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);

		t7 = _mm_mulhi_epu16(t3, t6);
		t8 = _mm_mulhi_epu16(t4, t5);

		t9 = _mm_adds_epu16(t7, t8);
		t9 = _mm_srli_epi16(t9, 8);

		t3 = _mm_unpackhi_epi8(t0, t0);
		t4 = _mm_unpackhi_epi8(t1, t1);

		t5 = _mm_unpackhi_epi32(t2, t2);
		t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);

		t7 = _mm_mulhi_epu16(t3, t6);
		t8 = _mm_mulhi_epu16(t4, t5);

		t10 = _mm_adds_epu16(t7, t8);
		t10 = _mm_srli_epi16(t10, 8);

		t10 = _mm_packus_epi16(t9, t10);

		_mm_stream_si128((__m128i*)&dest[i * 16], t10);
	}
}
Example #26
void CL_PixelFillRenderer::clear(const CL_Colorf &color)
{
	int dest_buffer_width = colorbuffer0.size.width;
	int dest_buffer_height = colorbuffer0.size.height;
	unsigned char *dest_data = (unsigned char *) colorbuffer0.data;

	CL_Color c = color;
	unsigned int color8888 = (c.get_alpha() << 24) + (c.get_red() << 16) + (c.get_green() << 8) + c.get_blue();
	unsigned char *ptr_color8888 = (unsigned char *) &color8888;

	for (int y = find_first_line_for_core(clip_rect.top, core, num_cores); y < clip_rect.bottom; y += num_cores)
	{
		unsigned char *line = dest_data + y * dest_buffer_width * 4 + clip_rect.left * 4;
		unsigned int line_align = ((line) - ((unsigned char *) 0)) & 0xf; // A gcc safe way of obtaining an address
		int pos = 0;
		int length = clip_rect.get_width()*4;

		// Write single bytes until we are 16-byte aligned:
		if (line_align)
		{
			int prefix_length = cl_min(length, (int) (16 - line_align));
			for (; pos < prefix_length; pos++)
				line[pos] = ptr_color8888[pos&0x3];
		}

		// Figure out how our 16 bytes should look like after we applied the alignment:
		unsigned int b0 = ptr_color8888[(pos+0)&0x3];
		unsigned int b1 = ptr_color8888[(pos+1)&0x3];
		unsigned int b2 = ptr_color8888[(pos+2)&0x3];
		unsigned int b3 = ptr_color8888[(pos+3)&0x3];
		__m128i c_sse = _mm_set1_epi32((b3<<24)+(b2<<16)+(b1<<8)+b0);

		// Fill 16 byte aligned:
		int align_length = length-pos-15;
		for (; pos < align_length; pos+=16)
			_mm_stream_si128((__m128i*)(line+pos), c_sse);

		// Fill remaining bytes:
		for (; pos < length; pos++)
			line[pos] = ptr_color8888[pos&0x3];
	}
}
Example #27
	void convert_le_d24x8_to_le_f32(void *dst, void *src, u32 row_length_in_texels, u32 num_rows)
	{
		const u32 num_pixels = row_length_in_texels * num_rows;
		verify(HERE), (num_pixels & 3) == 0;

		const auto num_iterations = (num_pixels >> 2);

		__m128i* dst_ptr = (__m128i*)dst;
		__m128i* src_ptr = (__m128i*)src;

		const __m128 scale_vector = _mm_set1_ps(1.f / 16777214.f);
		const __m128i mask = _mm_set1_epi32(0x00FFFFFF);
		for (u32 n = 0; n < num_iterations; ++n)
		{
			const __m128 src_vector = _mm_cvtepi32_ps(_mm_and_si128(mask, _mm_loadu_si128(src_ptr)));
			const __m128 normalized_vector = _mm_mul_ps(src_vector, scale_vector);
			_mm_stream_si128(dst_ptr, (__m128i&)normalized_vector);
			++dst_ptr;
			++src_ptr;
		}
	}
Example #28
void unpack_rgba4_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	__m128i t0, t1, t2;
	Uint32 i;

	for (i = 0; i < (size / 8); i++)
	{
		t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);
		// converts 4 bit values to 8 bit values (multiply with 17)
		t0 = _mm_unpacklo_epi16(t0, t0);
		t1 = _mm_unpacklo_epi16(t0, t0);
		t1 = _mm_and_si128(t1, _mm_set_epi16(0xF000, 0x000F, 0x00F0, 0x0F00, 0xF000, 0x000F, 0x00F0, 0x0F00));
		t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x1000, 0x0100, 0x0010, 0x0001, 0x1000, 0x0100, 0x0010));
		t1 = _mm_mulhi_epu16(t1, _mm_set1_epi16(0x0110));
		t2 = _mm_unpackhi_epi16(t0, t0);
		t2 = _mm_and_si128(t2, _mm_set_epi16(0xF000, 0x000F, 0x00F0, 0x0F00, 0xF000, 0x000F, 0x00F0, 0x0F00));
		t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x1000, 0x0100, 0x0010, 0x0001, 0x1000, 0x0100, 0x0010));
		t2 = _mm_mulhi_epu16(t2, _mm_set1_epi16(0x0110));
		t1 = _mm_packus_epi16(t1, t2);

		_mm_stream_si128((__m128i*)&dest[i * 16], t1);
	}
}
Example #29
void merge() {
#if defined(SSE_MERGE) || defined(SSE_MERGE_UNROLL)
  __m128i isTrue = _mm_set1_epi16(0xFFFF);
#endif

  for (int i = 0; i < NUM_PAGES; ++i) {
    //merge in everything thats different between the ref and the latest committed page (that we haven't touched)
    
#ifdef PREFETCH
    for (int pages = 1; pages <= PREFETCH_PAGES; pages++) {
      for (int bpp = 0; bpp < PREFETCH_BYTES_PER_PAGE; bpp++) {
        __builtin_prefetch( &LATEST[i+pages][bpp], 0/*read*/, 3/*high temporal locality*/ );
        __builtin_prefetch( &REF[i+pages][bpp], 0/*read*/, 3/*high temporal locality*/ );
	// don't prefetch LOCAL since we generally don't need it
        //__builtin_prefetch( &LOCAL[i+pages][bpp], 1/*write*/, 3/*high temporal locality*/ );
      }
    }
#endif

#ifdef BYTE_MERGE
    const char* latest = LATEST[i];
    const char* ref = REF[i];
    char* local = LOCAL[i];
    for (int j = 0; j < PAGE_SIZE; ++j) {
      if ( unlikely(latest[j]!=ref[j] && local[j]==ref[j]) ){
        local[j] = latest[j];
      }
    }
#endif
#ifdef WORD_MERGE
    const uint64_t* latest = (const uint64_t*) LATEST[i];
    const uint64_t* ref = (const uint64_t*) REF[i];
    uint64_t* local = (uint64_t*) LOCAL[i];

    for (int j = 0; j < (PAGE_SIZE/sizeof(uint64_t)); ++j) {

      // check for diff at word granularity first
      if ( unlikely(latest[j]!=ref[j]) ) {
        if ( local[j] == ref[j] ) {
          local[j] = latest[j];

        } else {
          // have to do byte-wise comparison
          const char* latestChar = (const char*) &latest[j];
          const char* refChar = (const char*) &ref[j];
          char* localChar = (char*) &local[j];
          for ( int k = 0; k < sizeof(uint64_t); k++ ) {
            if ( latestChar[k] != refChar[k] && localChar[k] == refChar[k] ) {
              localChar[k] = latestChar[k];
            }
          }
        }
      }

    }
#endif
#ifdef SSE_MERGE 
    const char* latestP = LATEST[i];
    const char* refP = REF[i];
    char* localP = LOCAL[i];

    for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) {
      __m128i latest = _mm_load_si128( (__m128i*) (latestP+j) );
      __m128i ref = _mm_load_si128( (__m128i*) (refP+j) );
      __m128i latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones

      if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) {
        // some bytes differ
	__m128i local = _mm_load_si128( (__m128i*) (localP+j) );
        __m128i localEqRef = _mm_cmpeq_epi8(local, ref);
        if ( _mm_testc_si128(localEqRef, isTrue) ) {
          // local == ref
          _mm_stream_si128( (__m128i*) (localP+j), latest );
        } else {
          // (~latref) & localref, bytes where lat!=ref && local==ref
          __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef );
          // new = (latestMask & latest) | (~latestMask & local);
          __m128i latestBytes = _mm_and_si128(latestMask, latest);
          __m128i localBytes = _mm_andnot_si128(latestMask, local);
          latestBytes = _mm_or_si128(latestBytes, localBytes);
          _mm_stream_si128( (__m128i*) (localP+j), latestBytes );
        }
      }
    }
#endif
#ifdef SSE_MERGE_NOBRANCH
    for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) {
      __m128i latest = _mm_load_si128( (__m128i*) &LATEST[i][j] );
      __m128i ref = _mm_load_si128( (__m128i*) &REF[i][j] );
      __m128i local = _mm_load_si128( (__m128i*) &LOCAL[i][j] );
      __m128i latref = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones
      __m128i tmp = _mm_cmpeq_epi8(local, ref);
      latref = _mm_andnot_si128( latref, tmp ); // (~latref) & localref
      // update = (latref & latest) | (~latref & local);
      tmp = _mm_and_si128(latref, latest);
      __m128i localBytes = _mm_andnot_si128(latref, local);
      tmp = _mm_or_si128(tmp, localBytes);
      _mm_stream_si128( (__m128i*) &LOCAL[i][j], tmp );
    }
#endif
#ifdef SSE_MERGE_UNROLL
    // manually unroll this loop since gcc won't do it; ugh
    const char* latestP = LATEST[i];
    const char* refP = REF[i];
    char* localP = LOCAL[i];

    for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) {
      __m128i latest = _mm_load_si128( (__m128i*) (latestP+j) );
      __m128i ref = _mm_load_si128( (__m128i*) (refP+j) );
      __m128i latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones

      if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) {
        // some bytes differ
	__m128i local = _mm_load_si128( (__m128i*) (localP+j) );
        __m128i localEqRef = _mm_cmpeq_epi8(local, ref);
        if ( _mm_testc_si128(localEqRef, isTrue) ) {
          // local == ref
          _mm_stream_si128( (__m128i*) (localP+j), latest );
        } else {
          // (~latref) & localref, bytes where lat!=ref && local==ref
          __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef );
          // new = (latestMask & latest) | (~latestMask & local);
          __m128i latestBytes = _mm_and_si128(latestMask, latest);
          __m128i localBytes = _mm_andnot_si128(latestMask, local);
          latestBytes = _mm_or_si128(latestBytes, localBytes);
          _mm_stream_si128( (__m128i*) (localP+j), latestBytes );
        }
      }

      j += sizeof(__m128i);
      latest = _mm_load_si128( (__m128i*) (latestP+j) );
      ref = _mm_load_si128( (__m128i*) (refP+j) );
      latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones

      if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) {
        // some bytes differ
	__m128i local = _mm_load_si128( (__m128i*) (localP+j) );
        __m128i localEqRef = _mm_cmpeq_epi8(local, ref);
        if ( _mm_testc_si128(localEqRef, isTrue) ) {
          // local == ref
          _mm_stream_si128( (__m128i*) (localP+j), latest );
        } else {
          // (~latref) & localref, bytes where lat!=ref && local==ref
          __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef );
          // new = (latestMask & latest) | (~latestMask & local);
          __m128i latestBytes = _mm_and_si128(latestMask, latest);
          __m128i localBytes = _mm_andnot_si128(latestMask, local);
          latestBytes = _mm_or_si128(latestBytes, localBytes);
          _mm_stream_si128( (__m128i*) (localP+j), latestBytes );
        }
      }

      j += sizeof(__m128i);
      latest = _mm_load_si128( (__m128i*) (latestP+j) );
      ref = _mm_load_si128( (__m128i*) (refP+j) );
      latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones

      if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) {
        // some bytes differ
	__m128i local = _mm_load_si128( (__m128i*) (localP+j) );
        __m128i localEqRef = _mm_cmpeq_epi8(local, ref);
        if ( _mm_testc_si128(localEqRef, isTrue) ) {
          // local == ref
          _mm_stream_si128( (__m128i*) (localP+j), latest );
        } else {
          // (~latref) & localref, bytes where lat!=ref && local==ref
          __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef );
          // new = (latestMask & latest) | (~latestMask & local);
          __m128i latestBytes = _mm_and_si128(latestMask, latest);
          __m128i localBytes = _mm_andnot_si128(latestMask, local);
          latestBytes = _mm_or_si128(latestBytes, localBytes);
          _mm_stream_si128( (__m128i*) (localP+j), latestBytes );
        }
      }

    }
#endif


  }
}
Example #30
void alphaBlendSSE_8u(Mat& src1, Mat& src2, Mat& alpha, Mat& dest)
{
	if(dest.empty())dest.create(src1.size(),CV_8U);

	const int imsize = (src1.size().area()/16);
	uchar* s1 = src1.data;
	uchar* s2 = src2.data;
	uchar* a = alpha.data;
	uchar* d = dest.data;

	const __m128i zero = _mm_setzero_si128();
	const __m128i amax = _mm_set1_epi8(char(255));
	int i=0;
	if(s1==d)
	{
		for(;i<imsize;++i)
		{
			__m128i ms1h = _mm_load_si128((__m128i*)(s1));
			__m128i ms2h = _mm_load_si128((__m128i*)(s2));
			__m128i mah = _mm_load_si128((__m128i*)(a));
			__m128i imah = _mm_sub_epi8(amax,mah);

			__m128i ms1l = _mm_unpacklo_epi8(ms1h, zero);
			ms1h = _mm_unpackhi_epi8(ms1h, zero);

			__m128i ms2l = _mm_unpacklo_epi8(ms2h, zero);
			ms2h = _mm_unpackhi_epi8(ms2h, zero);

			__m128i mal = _mm_unpacklo_epi8(mah, zero);
			mah = _mm_unpackhi_epi8(mah, zero);

			__m128i imal = _mm_unpacklo_epi8(imah, zero);
			imah = _mm_unpackhi_epi8(imah, zero);

			ms1l = _mm_mullo_epi16(ms1l,mal);
			ms2l = _mm_mullo_epi16(ms2l,imal);
			ms1l = _mm_add_epi16(ms1l,ms2l);
			//ms1l = _mm_srli_epi16(ms1l,8);
			ms1l = _mm_srai_epi16(ms1l,8);

			ms1h = _mm_mullo_epi16(ms1h,mah);
			ms2h = _mm_mullo_epi16(ms2h,imah);
			ms1h = _mm_add_epi16(ms1h,ms2h);
			//ms1h = _mm_srli_epi16(ms1h,8);
			ms1h = _mm_srai_epi16(ms1h,8);

			_mm_stream_si128((__m128i*)s1,_mm_packs_epi16(ms1l,ms1h));

			s1+=16;
			s2+=16;
			a+=16;
		}
	}
	else
	{
		for(;i<imsize;++i)
		{
			__m128i ms1h = _mm_load_si128((__m128i*)(s1));
			__m128i ms2h = _mm_load_si128((__m128i*)(s2));
			__m128i mah = _mm_load_si128((__m128i*)(a));
			__m128i imah = _mm_sub_epi8(amax,mah);

			__m128i ms1l = _mm_unpacklo_epi8(ms1h, zero);
			ms1h = _mm_unpackhi_epi8(ms1h, zero);

			__m128i ms2l = _mm_unpacklo_epi8(ms2h, zero);
			ms2h = _mm_unpackhi_epi8(ms2h, zero);

			__m128i mal = _mm_unpacklo_epi8(mah, zero);
			mah = _mm_unpackhi_epi8(mah, zero);

			__m128i imal = _mm_unpacklo_epi8(imah, zero);
			imah = _mm_unpackhi_epi8(imah, zero);

			ms1l = _mm_mullo_epi16(ms1l,mal);
			ms2l = _mm_mullo_epi16(ms2l,imal);
			ms1l = _mm_add_epi16(ms1l,ms2l);
			//ms1l = _mm_srli_epi16(ms1l,8);
			ms1l = _mm_srai_epi16(ms1l,8);

			ms1h = _mm_mullo_epi16(ms1h,mah);
			ms2h = _mm_mullo_epi16(ms2h,imah);
			ms1h = _mm_add_epi16(ms1h,ms2h);
			//ms1h = _mm_srli_epi16(ms1h,8);
			ms1h = _mm_srai_epi16(ms1h,8);

			_mm_store_si128((__m128i*)d,_mm_packs_epi16(ms1l,ms1h));

			s1+=16;
			s2+=16;
			a+=16;
			d+=16;
		}
	}

	{
		uchar* s1 = src1.data;
		uchar* s2 = src2.data;
		uchar* a = alpha.data;
		uchar* d = dest.data;
		for(int n=i*16;n<src1.size().area();n++)
		{
			d[n] = (a[n]*s1[n] + (255-a[n])*s2[n])>>8;
		}
	}
}