void sub_ssememcpy(void* _Dst, const void* _Src, size_t size)
{
    assert(IS_16BYTE_ALIGNMENT(_Dst));
    assert(IS_16BYTE_ALIGNMENT(_Src));

    float* dst = (float*)_Dst;
    const float* src = (const float*)_Src;
    size_t loop_num = size >> 6; // number of whole 64-byte blocks

    for (size_t i = 0; i < loop_num; i++) {
        // load 64 bytes of data
        __m128 xmm0 = _mm_load_ps(src + 0);
        __m128 xmm1 = _mm_load_ps(src + 4);
        __m128 xmm2 = _mm_load_ps(src + 8);
        __m128 xmm3 = _mm_load_ps(src + 12);

        // store 64 bytes of data with non-temporal stores
        // (cached alternative: _mm_store_ps(dst + 0, xmm0); etc.)
        _mm_stream_si128((__m128i*)(dst + 0),  _mm_castps_si128(xmm0));
        _mm_stream_si128((__m128i*)(dst + 4),  _mm_castps_si128(xmm1));
        _mm_stream_si128((__m128i*)(dst + 8),  _mm_castps_si128(xmm2));
        _mm_stream_si128((__m128i*)(dst + 12), _mm_castps_si128(xmm3));

        dst += 16;
        src += 16;
    }

    // copy the remaining tail bytes
    memcpy(dst, src, size & 0x3F);

    // make the non-temporal stores globally visible before returning
    _mm_sfence();
}
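// Because sub_ssememcpy asserts 16-byte alignment on both pointers, a caller
// has to allocate accordingly. A minimal usage sketch, assuming the function
// above is in scope; the buffer size and variable names are illustrative only.
#include <emmintrin.h>
#include <stddef.h>

int main(void)
{
    const size_t len = 4096; /* illustrative size, a multiple of 64 */

    /* both buffers must be 16-byte aligned, as the asserts in
       sub_ssememcpy require */
    float* src = (float*)_mm_malloc(len, 16);
    float* dst = (float*)_mm_malloc(len, 16);

    for (size_t i = 0; i < len / sizeof(float); ++i)
        src[i] = (float)i;

    sub_ssememcpy(dst, src, len);

    _mm_free(src);
    _mm_free(dst);
    return 0;
}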
void convert_le_f32_to_be_d24(void *dst, void *src, u32 row_length_in_texels, u32 num_rows)
{
    const u32 num_pixels = row_length_in_texels * num_rows;
    verify(HERE), (num_pixels & 3) == 0;

    const auto num_iterations = (num_pixels >> 2);

    __m128i* dst_ptr = (__m128i*)dst;
    __m128i* src_ptr = (__m128i*)src;

    const __m128 scale_vector = _mm_set1_ps(16777214.f);

#if defined (_MSC_VER) || defined (__SSSE3__)
    if (LIKELY(utils::has_ssse3()))
    {
        const __m128i swap_mask = _mm_set_epi8
        (
            0xF, 0xC, 0xD, 0xE,
            0xB, 0x8, 0x9, 0xA,
            0x7, 0x4, 0x5, 0x6,
            0x3, 0x0, 0x1, 0x2
        );

        for (u32 n = 0; n < num_iterations; ++n)
        {
            const __m128i src_vector = _mm_loadu_si128(src_ptr);
            const __m128i result = _mm_cvtps_epi32(_mm_mul_ps((__m128&)src_vector, scale_vector));
            const __m128i shuffled_vector = _mm_shuffle_epi8(result, swap_mask);
            _mm_stream_si128(dst_ptr, shuffled_vector);
            ++dst_ptr;
            ++src_ptr;
        }

        return;
    }
#endif

    const __m128i mask1 = _mm_set1_epi32(0xFF00FF00);
    const __m128i mask2 = _mm_set1_epi32(0x00FF0000);
    const __m128i mask3 = _mm_set1_epi32(0x000000FF);

    for (u32 n = 0; n < num_iterations; ++n)
    {
        const __m128i src_vector = _mm_loadu_si128(src_ptr);
        const __m128i result = _mm_cvtps_epi32(_mm_mul_ps((__m128&)src_vector, scale_vector));

        const __m128i v1 = _mm_and_si128(result, mask1);
        const __m128i v2 = _mm_and_si128(_mm_slli_epi32(result, 16), mask2);
        const __m128i v3 = _mm_and_si128(_mm_srli_epi32(result, 16), mask3);
        const __m128i shuffled_vector = _mm_or_si128(_mm_or_si128(v1, v2), v3);

        _mm_stream_si128(dst_ptr, shuffled_vector);
        ++dst_ptr;
        ++src_ptr;
    }
}
void CopyBuffer( byte * dst, const byte * src, int numBytes ) {
    assert_16_byte_aligned( dst );
    assert_16_byte_aligned( src );

    int i = 0;
    // copy 128 bytes (two cache lines) per iteration
    for ( ; i + 128 <= numBytes; i += 128 ) {
        __m128i d0 = _mm_load_si128( (__m128i *)&src[i + 0*16] );
        __m128i d1 = _mm_load_si128( (__m128i *)&src[i + 1*16] );
        __m128i d2 = _mm_load_si128( (__m128i *)&src[i + 2*16] );
        __m128i d3 = _mm_load_si128( (__m128i *)&src[i + 3*16] );
        __m128i d4 = _mm_load_si128( (__m128i *)&src[i + 4*16] );
        __m128i d5 = _mm_load_si128( (__m128i *)&src[i + 5*16] );
        __m128i d6 = _mm_load_si128( (__m128i *)&src[i + 6*16] );
        __m128i d7 = _mm_load_si128( (__m128i *)&src[i + 7*16] );
        _mm_stream_si128( (__m128i *)&dst[i + 0*16], d0 );
        _mm_stream_si128( (__m128i *)&dst[i + 1*16], d1 );
        _mm_stream_si128( (__m128i *)&dst[i + 2*16], d2 );
        _mm_stream_si128( (__m128i *)&dst[i + 3*16], d3 );
        _mm_stream_si128( (__m128i *)&dst[i + 4*16], d4 );
        _mm_stream_si128( (__m128i *)&dst[i + 5*16], d5 );
        _mm_stream_si128( (__m128i *)&dst[i + 6*16], d6 );
        _mm_stream_si128( (__m128i *)&dst[i + 7*16], d7 );
    }
    // 16-byte tail
    for ( ; i + 16 <= numBytes; i += 16 ) {
        __m128i d = _mm_load_si128( (__m128i *)&src[i] );
        _mm_stream_si128( (__m128i *)&dst[i], d );
    }
    // 4-byte tail
    for ( ; i + 4 <= numBytes; i += 4 ) {
        *(uint32 *)&dst[i] = *(const uint32 *)&src[i];
    }
    // byte tail
    for ( ; i < numBytes; i++ ) {
        dst[i] = src[i];
    }
    _mm_sfence();
}
void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
{
    size_t i = 0;

#ifdef ENABLE_SSE2
    const __m128i attrDepth_vec128             = _mm_set1_epi32(attr.depth);
    const __m128i attrOpaquePolyID_vec128      = _mm_set1_epi8(attr.opaquePolyID);
    const __m128i attrTranslucentPolyID_vec128 = _mm_set1_epi8(attr.translucentPolyID);
    const __m128i attrStencil_vec128           = _mm_set1_epi8(attr.stencil);
    const __m128i attrIsFogged_vec128          = _mm_set1_epi8(attr.isFogged);
    const __m128i attrIsTranslucentPoly_vec128 = _mm_set1_epi8(attr.isTranslucentPoly);

    const size_t sseCount = count - (count % 16);
    for (; i < sseCount; i += 16)
    {
        // 16 depth values per iteration: four 32-bit lanes per vector, four vectors
        // (note the + i offsets; streaming to fixed addresses would overwrite
        // the same 16 elements every iteration)
        _mm_stream_si128((__m128i *)(this->depth + i +  0), attrDepth_vec128);
        _mm_stream_si128((__m128i *)(this->depth + i +  4), attrDepth_vec128);
        _mm_stream_si128((__m128i *)(this->depth + i +  8), attrDepth_vec128);
        _mm_stream_si128((__m128i *)(this->depth + i + 12), attrDepth_vec128);

        // 16 byte-sized attributes per iteration
        _mm_stream_si128((__m128i *)(this->opaquePolyID + i),      attrOpaquePolyID_vec128);
        _mm_stream_si128((__m128i *)(this->translucentPolyID + i), attrTranslucentPolyID_vec128);
        _mm_stream_si128((__m128i *)(this->stencil + i),           attrStencil_vec128);
        _mm_stream_si128((__m128i *)(this->isFogged + i),          attrIsFogged_vec128);
        _mm_stream_si128((__m128i *)(this->isTranslucentPoly + i), attrIsTranslucentPoly_vec128);
    }
#endif

    for (; i < count; i++)
    {
        this->SetAtIndex(i, attr);
    }
}
/*
=====================
R_CopyDecalSurface
=====================
*/
static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes,
                                const decal_t * decal, const float fadeColor[4] ) {
    assert_16_byte_aligned( &verts[numVerts] );
    assert_16_byte_aligned( &indexes[numIndexes] );
    assert_16_byte_aligned( decal->indexes );
    assert_16_byte_aligned( decal->verts );
    assert( ( ( decal->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
    assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
    assert_16_byte_aligned( fadeColor );

    const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
    const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
    const __m128 vector_fade_color = _mm_load_ps( fadeColor );
    const __m128i vector_color_mask = _mm_set_epi32( 0, -1, 0, 0 );

    // copy vertices and apply depth/time based fading
    assert_offsetof( idDrawVert, color, 6 * 4 );
    for ( int i = 0; i < decal->numVerts; i++ ) {
        const idDrawVert & srcVert = decal->verts[i];
        idDrawVert & dstVert = verts[numVerts + i];

        __m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert +  0 ) );
        __m128i v1 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 16 ) );

        __m128 depthFade = _mm_splat_ps( _mm_load_ss( decal->vertDepthFade + i ), 0 );
        __m128 timeDepthFade = _mm_mul_ps( depthFade, vector_fade_color );
        __m128i colorInt = _mm_cvtps_epi32( timeDepthFade );
        __m128i colorShort = _mm_packs_epi32( colorInt, colorInt );
        __m128i colorByte = _mm_packus_epi16( colorShort, colorShort );
        v1 = _mm_or_si128( v1, _mm_and_si128( colorByte, vector_color_mask ) );

        _mm_stream_si128( (__m128i *)( (byte *)&dstVert +  0 ), v0 );
        _mm_stream_si128( (__m128i *)( (byte *)&dstVert + 16 ), v1 );
    }

    // copy indexes
    assert( ( decal->numIndexes & 7 ) == 0 );
    assert( sizeof( triIndex_t ) == 2 );
    for ( int i = 0; i < decal->numIndexes; i += 8 ) {
        __m128i vi = _mm_load_si128( (const __m128i *)&decal->indexes[i] );
        vi = _mm_add_epi16( vi, vector_short_num_verts );
        _mm_stream_si128( (__m128i *)&indexes[numIndexes + i], vi );
    }

    _mm_sfence();
}
void simd_memcpy(void *dst, void *src, size_t nbytes)
{
    size_t i;
    size_t ilen = nbytes / sizeof(int);
    size_t ilen_sm = ilen - ilen % 16;

    char *cdst = (char*)dst;
    char *csrc = (char*)src;
    int  *idst = (int*)dst;
    int  *isrc = (int*)src;

    __m128i l0, l1, l2, l3;

    /* _mm_prefetch takes a const char*, not a __m128i* */
    _mm_prefetch((const char*)&isrc[0],  _MM_HINT_NTA);
    _mm_prefetch((const char*)&isrc[4],  _MM_HINT_NTA);
    _mm_prefetch((const char*)&isrc[8],  _MM_HINT_NTA);
    _mm_prefetch((const char*)&isrc[12], _MM_HINT_NTA);

    for (i = 0; i < ilen_sm; i += 16) {
        l0 = _mm_load_si128((__m128i*)&isrc[i+0]);
        l1 = _mm_load_si128((__m128i*)&isrc[i+4]);
        l2 = _mm_load_si128((__m128i*)&isrc[i+8]);
        l3 = _mm_load_si128((__m128i*)&isrc[i+12]);

        _mm_prefetch((const char*)&isrc[i+16], _MM_HINT_NTA);
        _mm_prefetch((const char*)&isrc[i+20], _MM_HINT_NTA);
        _mm_prefetch((const char*)&isrc[i+24], _MM_HINT_NTA);
        _mm_prefetch((const char*)&isrc[i+28], _MM_HINT_NTA);

        _mm_stream_si128((__m128i*)&idst[i+0],  l0);
        _mm_stream_si128((__m128i*)&idst[i+4],  l1);
        _mm_stream_si128((__m128i*)&idst[i+8],  l2);
        _mm_stream_si128((__m128i*)&idst[i+12], l3);
    }

    /* copy the remaining whole ints */
    for (i = ilen_sm; i < ilen; i++) {
        idst[i] = isrc[i];
    }

    /* copy the trailing bytes */
    for (i = sizeof(int) * ilen; i < nbytes; i++) {
        cdst[i] = csrc[i];
    }
}
void __stdcall planar_shader_to_rgb32_3_f16c(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* _buff) noexcept
{
    const uint8_t* sr = srcp[0];
    const uint8_t* sg = srcp[1];
    const uint8_t* sb = srcp[2];
    uint8_t* d = dstp[0] + (height - 1) * dpitch;

    float* bb = reinterpret_cast<float*>(_buff);
    float* bg = bb + ((width + 7) & ~7); // must be aligned 32 bytes
    float* br = bg + ((width + 7) & ~7); // must be aligned 32 bytes

    const __m128 coef = _mm_set1_ps(255.0f);
    const __m128i zero = _mm_setzero_si128();

    for (int y = 0; y < height; ++y) {
        convert_half_to_float(br, sr, width);
        convert_half_to_float(bg, sg, width);
        convert_half_to_float(bb, sb, width);
        for (int x = 0; x < width; x += 4) {
            __m128i b = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(bb + x)));
            __m128i g = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(bg + x)));
            __m128i r = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(br + x)));
            __m128i bgra = _mm_or_si128(b, _mm_slli_si128(g, 1));
            bgra = _mm_or_si128(bgra, _mm_slli_si128(r, 2));
            _mm_stream_si128(reinterpret_cast<__m128i*>(d + x * 4), bgra);
        }
        sr += spitch;
        sg += spitch;
        sb += spitch;
        d -= dpitch;
    }
}
void unpack_rgb5a1_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
    __m128i t0, t1, t2;
    Uint32 i;

    for (i = 0; i < (size / 8); i++) {
        t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);
        t0 = _mm_unpacklo_epi16(t0, t0);

        t1 = _mm_unpacklo_epi16(t0, t0);
        t1 = _mm_and_si128(t1, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00));
        t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002));
        t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260));
        t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));

        t2 = _mm_unpackhi_epi16(t0, t0);
        t2 = _mm_and_si128(t2, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00));
        t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002));
        t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260));
        t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));

        t1 = _mm_packus_epi16(t1, t2);
        _mm_stream_si128((__m128i*)&dest[i * 16], t1);
    }
}
void* streamucpy(void* dest, const void* usrc, size_t n)
{
    char* dst = (char*)dest;
    const char* src = (const char*)usrc;

    // copy up to 15 bytes until SSE-aligned
    while (((intptr_t)dst & (SSE_SIZE - 1)) && n) {
        *dst++ = *src++;
        n--;
    }

    // copy SSE-aligned
    while (n >= SSE_SIZE) {
        __m128i data = _mm_loadu_si128((__m128i*)src);
        _mm_stream_si128((__m128i*)dst, data);

        dst += SSE_SIZE;
        src += SSE_SIZE;
        n   -= SSE_SIZE;
    }

    // copy remainder
    while (n--) {
        *dst++ = *src++;
    }
    return dst;
}
bool opt_copy_stream_to_stream( x42memStream_t *dest, x42memStream_t *src,
    size_t elemSize, uint numElems, x42opts_t *opts )
{
    REF_PARAM( opts );

    if( (opts->caps & OPT_SSE2) && elemSize <= sizeof( __m128 ) &&
        stream_is_aligned( src ) && stream_pad_ok( src, elemSize ) &&
        stream_is_aligned( dest ) && stream_pad_ok( dest, elemSize ) )
    {
        uint i;

        size_t is = src->stride;
        size_t os = dest->stride;

        const __m128i * RESTRICT pi = (__m128i*)src->pStreamZero;
        __m128i * RESTRICT po = (__m128i*)dest->pStreamZero;

        for( i = 0; i < numElems; i++ )
        {
            __m128i v = _mm_load_si128( pi );
            _mm_stream_si128( po, v );

            pi = (__m128i*)((byte*)pi + is);
            po = (__m128i*)((byte*)po + os);
        }

        _mm_sfence();

        return true;
    }

    /* SSE2 fast path unavailable: report failure so the caller can
       fall back to a generic element-wise copy */
    return false;
}
void __stdcall packed_shader_to_rgb32_3_f16c(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* _buff) noexcept
{
    const uint8_t* s = srcp[0] + (height - 1) * spitch;
    uint8_t* d = dstp[0];
    float* buff = reinterpret_cast<float*>(_buff);

    const __m128 coef = _mm_set1_ps(255.0f);
    const __m128i order = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);

    for (int y = 0; y < height; ++y) {
        convert_half_to_float(buff, s, width * 4);
        for (int x = 0; x < width; x += 4) {
            __m128i d0 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 0)));
            __m128i d1 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 4)));
            __m128i d2 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 8)));
            __m128i d3 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 12)));
            d0 = _mm_packus_epi16(_mm_packs_epi32(d0, d1), _mm_packs_epi32(d2, d3));
            _mm_stream_si128(reinterpret_cast<__m128i*>(d + 4 * x), _mm_shuffle_epi8(d0, order));
        }
        d += dpitch;
        s -= spitch;
    }
}
void Lerp_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
    static const u32 PSD = 64;

    static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
    static const __m128i round  = _mm_set1_epi16(128);

    assert(source1 != NULL && source2 != NULL && dest != NULL);
    assert(size % STRIDE == 0);
    assert(alpha >= 0.0 && alpha <= 1.0);

    const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
    const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i s = _mm_setzero_si128();
    __m128i d = _mm_setzero_si128();
    const __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));

    __m128i drb, dga, srb, sga;

    for (size_t k = 0, length = size/STRIDE; k < length; ++k)
    {
        _mm_prefetch(reinterpret_cast<const char*>(source128_1 + PSD), _MM_HINT_NTA);
        _mm_prefetch(reinterpret_cast<const char*>(source128_2 + PSD), _MM_HINT_NTA);

        // TODO: assembly optimization: use PSHUFD on moves before calculations,
        // lower latency than MOVDQA (R.N)
        // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/
        for (int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
        {
            // r = d + (s-d)*alpha/256
            s = _mm_load_si128(source128_1);     // AABBGGRR
            d = _mm_load_si128(source128_2);     // AABBGGRR

            srb = _mm_and_si128(lomask, s);      // 00BB00RR // unpack
            sga = _mm_srli_epi16(s, 8);          // AA00GG00 // unpack
            drb = _mm_and_si128(lomask, d);      // 00BB00RR // unpack
            dga = _mm_srli_epi16(d, 8);          // AA00GG00 // unpack

            srb = _mm_sub_epi16(srb, drb);       // BBBBRRRR // sub
            srb = _mm_mullo_epi16(srb, a);       // BBBBRRRR // mul
            srb = _mm_add_epi16(srb, round);

            sga = _mm_sub_epi16(sga, dga);       // AAAAGGGG // sub
            sga = _mm_mullo_epi16(sga, a);       // AAAAGGGG // mul
            sga = _mm_add_epi16(sga, round);

            srb = _mm_srli_epi16(srb, 8);        // 00BB00RR // prepack and div
            sga = _mm_andnot_si128(lomask, sga); // AA00GG00 // prepack and div

            srb = _mm_or_si128(srb, sga);        // AABBGGRR // pack
            srb = _mm_add_epi8(srb, d);          // AABBGGRR // add, there is no overflow (R.N)

            _mm_stream_si128(dest128, srb);
        }
    }
    _mm_mfence(); // ensure last WC buffers get flushed to memory
}
void test_mm_stream_si128(__m128i *A, __m128i B) {
  // DAG-LABEL: test_mm_stream_si128
  // DAG: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 16, !nontemporal
  //
  // ASM-LABEL: test_mm_stream_si128
  // ASM: movntdq
  _mm_stream_si128(A, B);
}
static __forceinline void convert_float_to_half(uint8_t* dstp, const float* srcp, size_t count)
{
    for (size_t x = 0; x < count; x += 8) {
        // narrow 8 single floats to 8 packed half floats (16 bytes) via F16C
        __m256 s = _mm256_load_ps(srcp + x);
        __m128i d = _mm256_cvtps_ph(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
        _mm_stream_si128(reinterpret_cast<__m128i*>(dstp + 2 * x), d);
    }
}
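// Several of the shader routines in this collection call a convert_half_to_float
// helper that is not shown. The following is a plausible F16C counterpart to
// convert_float_to_half above, under the same assumptions (count is a multiple
// of 8, srcp is 16-byte aligned, dstp is 32-byte aligned); it is a sketch, not
// the original helper.
#include <immintrin.h>
#include <cstdint>

static __forceinline void convert_half_to_float(float* dstp, const uint8_t* srcp, size_t count)
{
    for (size_t x = 0; x < count; x += 8) {
        // load 8 packed half floats (16 bytes) and widen to 8 single floats
        __m128i s = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp + 2 * x));
        __m256 d = _mm256_cvtph_ps(s);
        _mm256_store_ps(dstp + x, d);
    }
}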
void Shuffle_SSSE3(void* dest, const void* source, size_t size,
    const u8 red, const u8 green, const u8 blue, const u8 alpha)
{
    static const unsigned int PSD = 64;

    assert(source != NULL && dest != NULL);
    assert(red > -1 && red < 4 && green > -1 && green < 4 &&
           blue > -1 && blue < 4 && alpha > -1 && alpha < 4 && "Invalid mask");
    assert(size % STRIDE == 0);

    const __m128i* source128 = reinterpret_cast<const __m128i*>(source);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i reg0 = _mm_setzero_si128();
    __m128i reg1 = _mm_setzero_si128();
    __m128i reg2 = _mm_setzero_si128();
    __m128i reg3 = _mm_setzero_si128();

    const __m128i mask128 = _mm_set_epi8(alpha+12, blue+12, green+12, red+12,
                                         alpha+8,  blue+8,  green+8,  red+8,
                                         alpha+4,  blue+4,  green+4,  red+4,
                                         alpha,    blue,    green,    red);

    for (size_t k = 0, length = size/STRIDE; k < length; ++k)
    {
        // TODO: put prefetch between calculations? (R.N)
        _mm_prefetch(reinterpret_cast<const s8*>(source128 + PSD), _MM_HINT_NTA);

        // work on entire cacheline before next prefetch
        // TODO: assembly optimization: use PSHUFD on moves before calculations,
        // lower latency than MOVDQA (R.N)
        // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/
        reg0 = _mm_load_si128(source128++);
        reg1 = _mm_load_si128(source128++);
        _mm_stream_si128(dest128++, _mm_shuffle_epi8(reg0, mask128));
        reg2 = _mm_load_si128(source128++);
        _mm_stream_si128(dest128++, _mm_shuffle_epi8(reg1, mask128));
        reg3 = _mm_load_si128(source128++);
        _mm_stream_si128(dest128++, _mm_shuffle_epi8(reg2, mask128));
        _mm_stream_si128(dest128++, _mm_shuffle_epi8(reg3, mask128));
    }
    _mm_mfence(); // ensure last WC buffers get flushed to memory
}
static inline char* stream_fill(char* dst, size_t* n, const __m128i data)
{
    while (*n >= SSE_SIZE) {
        _mm_stream_si128((__m128i*)dst, data);
        dst += SSE_SIZE;
        *n  -= SSE_SIZE;
    }
    return dst;
}
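// stream_fill only handles the aligned middle portion; a caller still needs
// head and tail handling plus a fence. A minimal sketch of such a wrapper,
// assuming SSE_SIZE is 16 and stream_fill is declared as above; stream_memset
// is an illustrative name, not part of the original code.
#include <emmintrin.h>
#include <stdint.h>

#define SSE_SIZE 16 /* assumption: one __m128i register */

void* stream_memset(void* dest, int value, size_t n)
{
    char* dst = (char*)dest;

    /* head: plain stores until dst reaches 16-byte alignment */
    while (((uintptr_t)dst & (SSE_SIZE - 1)) && n) {
        *dst++ = (char)value;
        n--;
    }

    /* middle: non-temporal 16-byte stores */
    const __m128i data = _mm_set1_epi8((char)value);
    dst = stream_fill(dst, &n, data);

    /* tail: plain stores for the remaining bytes */
    while (n--)
        *dst++ = (char)value;

    _mm_sfence(); /* order the streaming stores before returning */
    return dest;
}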
// TODO: (R.N) optimize => prefetch and cacheline loop unroll
void Clear_SSE2(void* dest, size_t size)
{
    __m128i val = _mm_setzero_si128();
    __m128i* ptr = reinterpret_cast<__m128i*>(dest);

    int times = size / 16;
    for (int i = 0; i < times; ++i)
    {
        _mm_stream_si128(ptr, val);
        ptr++;
    }
}
static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t size)
{
    __m128i *dqdest = (__m128i*)dest;
    const __m128i *dqsrc = (const __m128i*)src;
    const __m128i *end = (const __m128i*)((const char*)src + size);

    do {
        /* _mm_prefetch takes a const char*, so cast explicitly */
        _mm_prefetch((const char*)(dqsrc + 4), _MM_HINT_NTA);

        __m128i xmm0 = _mm_load_si128(dqsrc + 0);
        __m128i xmm1 = _mm_load_si128(dqsrc + 1);
        __m128i xmm2 = _mm_load_si128(dqsrc + 2);
        __m128i xmm3 = _mm_load_si128(dqsrc + 3);
        dqsrc  += 4;
        _mm_stream_si128(dqdest + 0, xmm0);
        _mm_stream_si128(dqdest + 1, xmm1);
        _mm_stream_si128(dqdest + 2, xmm2);
        _mm_stream_si128(dqdest + 3, xmm3);
        dqdest += 4;
    } while (dqsrc != end);
}
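/* fast_memcpy's do/while loop copies exactly 64 bytes per iteration with no
   tail handling, so it only terminates when size is a non-zero multiple of 64
   and both buffers are 16-byte aligned. A minimal caller sketch that makes the
   rounding explicit; copy_block and the capacity assumption are illustrative,
   not from the original source. */
#include <stddef.h>

void copy_block(void *dest, const void *src, size_t size)
{
    /* round up to a multiple of 64 bytes; both buffers are assumed to be
       allocated with at least this much capacity and 16-byte alignment */
    size_t rounded = (size + 63) & ~(size_t)63;
    if (rounded > 0) {
        fast_memcpy(dest, src, rounded);
    }
}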
template <bool STACK16> // compile-time flag: write stacked 16-bit output (MSB plane + LSB plane)
static inline void planar_shader_to_yuv_3(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* _buff) noexcept
{
    const __m128 coef = _mm_set1_ps(STACK16 ? 65535.0f : 255.0f);
    const __m128i mask16 = _mm_set1_epi16(0x00FF);
    float* buff = reinterpret_cast<float*>(_buff);

    for (int p = 0; p < 3; ++p) {
        const uint8_t* s = srcp[p];
        uint8_t* d = dstp[p];
        uint8_t* lsb = d + height * dpitch;

        for (int y = 0; y < height; ++y) {
            convert_half_to_float(buff, s, width);
            for (int x = 0; x < width; x += 16) {
                __m128i s0 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + x + 0)));
                __m128i s1 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + x + 4)));
                __m128i s2 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + x + 8)));
                __m128i s3 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + x + 12)));
                s0 = _mm_packus_epi32(s0, s1);
                s1 = _mm_packus_epi32(s2, s3);
                if (!STACK16) {
                    s0 = _mm_packus_epi16(s0, s1);
                    _mm_stream_si128(reinterpret_cast<__m128i*>(d + x), s0);
                } else {
                    __m128i dm = _mm_packus_epi16(_mm_srli_epi16(s0, 8), _mm_srli_epi16(s1, 8));
                    __m128i dl = _mm_packus_epi16(_mm_and_si128(s0, mask16), _mm_and_si128(s1, mask16));
                    _mm_stream_si128(reinterpret_cast<__m128i*>(d + x), dm);
                    _mm_stream_si128(reinterpret_cast<__m128i*>(lsb + x), dl);
                }
            }
            s += spitch;
            d += dpitch;
            if (STACK16) {
                lsb += dpitch;
            }
        }
    }
}
void unpack_a8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
    __m128i t0;
    Uint32 i;

    for (i = 0; i < (size / 4); i++) {
        /* _mm_castps_si128 is the portable way to reinterpret __m128 as __m128i */
        t0 = _mm_castps_si128(_mm_load_ss((const float*)&source[i * 4]));
        t0 = _mm_unpacklo_epi8(_mm_setzero_si128(), t0);
        t0 = _mm_unpacklo_epi16(_mm_setzero_si128(), t0);
        _mm_stream_si128((__m128i*)&dest[i * 16], t0);
    }
}
void unpack_l8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
    __m128i t0;
    Uint32 i;

    for (i = 0; i < (size / 4); i++) {
        t0 = _mm_castps_si128(_mm_load_ss((const float*)&source[i * 4]));
        t0 = _mm_unpacklo_epi8(t0, t0);
        t0 = _mm_unpacklo_epi16(t0, t0);
        t0 = _mm_or_si128(t0, _mm_set1_epi32(0xFF000000));
        _mm_stream_si128((__m128i*)&dest[i * 16], t0);
    }
}
void unpack_la8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
    __m128i t0, t1, t2;
    Uint32 i;

    for (i = 0; i < (size / 8); i++) {
        t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);
        t1 = _mm_unpacklo_epi8(t0, t0);
        t1 = _mm_and_si128(t1, _mm_set1_epi32(0x0000FFFF));
        t2 = _mm_unpacklo_epi16(_mm_setzero_si128(), t0);
        t1 = _mm_or_si128(t1, t2);
        _mm_stream_si128((__m128i*)&dest[i * 16], t1);
    }
}
inline void modify2x264StyleDepth(uint16_t *lp, const int lshift)
{
    if (SIMDWIDTH == 8) {
        // note: _mm_stream_si128 still requires lp to be 16-byte aligned,
        // even though the load here is unaligned
        auto xmm0 = _mm_loadu_si128((__m128i*)lp);
        auto xmm1 = _mm_slli_epi16(xmm0, lshift);
        _mm_stream_si128((__m128i*)lp, xmm1);
    } else {
        for (int i = 0; i < SIMDWIDTH; i++) {
            lp[i] = lp[i] << lshift;
        }
    }
}
void unpack_rgba8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
    __m128i t0, t1, t2;
    Uint32 i;

    for (i = 0; i < (size / 16); i++) {
        t0 = _mm_load_si128((__m128i*)&source[i * 16]);
        t1 = _mm_and_si128(t0, _mm_set1_epi16(0x00FF));
        t2 = _mm_and_si128(t0, _mm_set1_epi16(0xFF00));
        t1 = _mm_shufflelo_epi16(t1, _MM_SHUFFLE(2, 3, 0, 1));
        t1 = _mm_shufflehi_epi16(t1, _MM_SHUFFLE(2, 3, 0, 1));
        t1 = _mm_or_si128(t1, t2);
        _mm_stream_si128((__m128i*)&dest[i * 16], t1);
    }
}
void blend_sse2(const Uint8* alpha, const Uint32 size, const Uint8* source0,
    const Uint8* source1, Uint8* dest)
{
    __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    Uint32 i;

    for (i = 0; i < (size / 4); i++) {
        t0 = _mm_load_si128((__m128i*)&source0[i * 16]);
        t1 = _mm_load_si128((__m128i*)&source1[i * 16]);
        t2 = _mm_castps_si128(_mm_load_ss((const float*)&alpha[i * 4]));
        t2 = _mm_unpacklo_epi8(t2, t2);
        t2 = _mm_unpacklo_epi16(t2, t2);

        t3 = _mm_unpacklo_epi8(t0, t0);
        t4 = _mm_unpacklo_epi8(t1, t1);
        t5 = _mm_unpacklo_epi32(t2, t2);
        t6 = _mm_sub_epi16(_mm_set1_epi8((char)0xFF), t5);
        t7 = _mm_mulhi_epu16(t3, t6);
        t8 = _mm_mulhi_epu16(t4, t5);
        t9 = _mm_adds_epu16(t7, t8);
        t9 = _mm_srli_epi16(t9, 8);

        t3 = _mm_unpackhi_epi8(t0, t0);
        t4 = _mm_unpackhi_epi8(t1, t1);
        t5 = _mm_unpackhi_epi32(t2, t2);
        t6 = _mm_sub_epi16(_mm_set1_epi8((char)0xFF), t5);
        t7 = _mm_mulhi_epu16(t3, t6);
        t8 = _mm_mulhi_epu16(t4, t5);
        t10 = _mm_adds_epu16(t7, t8);
        t10 = _mm_srli_epi16(t10, 8);

        t10 = _mm_packus_epi16(t9, t10);
        _mm_stream_si128((__m128i*)&dest[i * 16], t10);
    }
}
void CL_PixelFillRenderer::clear(const CL_Colorf &color)
{
    int dest_buffer_width = colorbuffer0.size.width;
    int dest_buffer_height = colorbuffer0.size.height;
    unsigned char *dest_data = (unsigned char *) colorbuffer0.data;

    CL_Color c = color;
    unsigned int color8888 = (c.get_alpha() << 24) + (c.get_red() << 16) + (c.get_green() << 8) + c.get_blue();
    unsigned char *ptr_color8888 = (unsigned char *) &color8888;

    for (int y = find_first_line_for_core(clip_rect.top, core, num_cores); y < clip_rect.bottom; y += num_cores)
    {
        unsigned char *line = dest_data + y * dest_buffer_width * 4 + clip_rect.left * 4;
        unsigned int line_align = ((line) - ((unsigned char *) 0)) & 0xf; // A gcc safe way of obtaining an address
        int pos = 0;
        int length = clip_rect.get_width() * 4;

        // Write single bytes until we are 16-byte aligned:
        if (line_align)
        {
            int prefix_length = cl_min(length, (int) (16 - line_align));
            for (; pos < prefix_length; pos++)
                line[pos] = ptr_color8888[pos & 0x3];
        }

        // Figure out how our 16 bytes should look after we applied the alignment:
        unsigned int b0 = ptr_color8888[(pos + 0) & 0x3];
        unsigned int b1 = ptr_color8888[(pos + 1) & 0x3];
        unsigned int b2 = ptr_color8888[(pos + 2) & 0x3];
        unsigned int b3 = ptr_color8888[(pos + 3) & 0x3];
        __m128i c_sse = _mm_set1_epi32((b3 << 24) + (b2 << 16) + (b1 << 8) + b0);

        // Fill 16-byte aligned:
        int align_length = length - pos - 15;
        for (; pos < align_length; pos += 16)
            _mm_stream_si128((__m128i*)(line + pos), c_sse);

        // Fill remaining bytes:
        for (; pos < length; pos++)
            line[pos] = ptr_color8888[pos & 0x3];
    }
}
void convert_le_d24x8_to_le_f32(void *dst, void *src, u32 row_length_in_texels, u32 num_rows)
{
    const u32 num_pixels = row_length_in_texels * num_rows;
    verify(HERE), (num_pixels & 3) == 0;

    const auto num_iterations = (num_pixels >> 2);

    __m128i* dst_ptr = (__m128i*)dst;
    __m128i* src_ptr = (__m128i*)src;

    const __m128 scale_vector = _mm_set1_ps(1.f / 16777214.f);
    const __m128i mask = _mm_set1_epi32(0x00FFFFFF);
    for (u32 n = 0; n < num_iterations; ++n)
    {
        const __m128 src_vector = _mm_cvtepi32_ps(_mm_and_si128(mask, _mm_loadu_si128(src_ptr)));
        const __m128 normalized_vector = _mm_mul_ps(src_vector, scale_vector);
        _mm_stream_si128(dst_ptr, (__m128i&)normalized_vector);
        ++dst_ptr;
        ++src_ptr;
    }
}
void unpack_rgba4_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
    __m128i t0, t1, t2;
    Uint32 i;

    for (i = 0; i < (size / 8); i++) {
        t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);
        // converts 4 bit values to 8 bit values (multiply with 17)
        t0 = _mm_unpacklo_epi16(t0, t0);

        t1 = _mm_unpacklo_epi16(t0, t0);
        t1 = _mm_and_si128(t1, _mm_set_epi16(0xF000, 0x000F, 0x00F0, 0x0F00, 0xF000, 0x000F, 0x00F0, 0x0F00));
        t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x1000, 0x0100, 0x0010, 0x0001, 0x1000, 0x0100, 0x0010));
        t1 = _mm_mulhi_epu16(t1, _mm_set1_epi16(0x0110));

        t2 = _mm_unpackhi_epi16(t0, t0);
        t2 = _mm_and_si128(t2, _mm_set_epi16(0xF000, 0x000F, 0x00F0, 0x0F00, 0xF000, 0x000F, 0x00F0, 0x0F00));
        t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x1000, 0x0100, 0x0010, 0x0001, 0x1000, 0x0100, 0x0010));
        t2 = _mm_mulhi_epu16(t2, _mm_set1_epi16(0x0110));

        t1 = _mm_packus_epi16(t1, t2);
        _mm_stream_si128((__m128i*)&dest[i * 16], t1);
    }
}
void merge()
{
#if defined(SSE_MERGE) || defined(SSE_MERGE_UNROLL)
    __m128i isTrue = _mm_set1_epi16(0xFFFF);
#endif
    for (int i = 0; i < NUM_PAGES; ++i) {
        // merge in everything that's different between the ref and the
        // latest committed page (that we haven't touched)

#ifdef PREFETCH
        for (int pages = 1; pages <= PREFETCH_PAGES; pages++) {
            for (int bpp = 0; bpp < PREFETCH_BYTES_PER_PAGE; bpp++) {
                __builtin_prefetch( &LATEST[i+pages][bpp], 0/*read*/, 3/*high temporal locality*/ );
                __builtin_prefetch( &REF[i+pages][bpp],    0/*read*/, 3/*high temporal locality*/ );
                // don't prefetch LOCAL since we generally don't need it
                //__builtin_prefetch( &LOCAL[i+pages][bpp], 1/*write*/, 3/*high temporal locality*/ );
            }
        }
#endif

#ifdef BYTE_MERGE
        const char* latest = LATEST[i];
        const char* ref = REF[i];
        char* local = LOCAL[i];
        for (int j = 0; j < PAGE_SIZE; ++j) {
            if ( unlikely(latest[j] != ref[j] && local[j] == ref[j]) ) {
                local[j] = latest[j];
            }
        }
#endif

#ifdef WORD_MERGE
        const uint64_t* latest = (const uint64_t*) LATEST[i];
        const uint64_t* ref = (const uint64_t*) REF[i];
        uint64_t* local = (uint64_t*) LOCAL[i];
        for (int j = 0; j < (PAGE_SIZE/sizeof(uint64_t)); ++j) {
            // check for diff at word granularity first
            if ( unlikely(latest[j] != ref[j]) ) {
                if ( local[j] == ref[j] ) {
                    local[j] = latest[j];
                } else {
                    // have to do byte-wise comparison of this word
                    // (take the addresses of the words, not their values)
                    const char* latestChar = (const char*) &latest[j];
                    const char* refChar = (const char*) &ref[j];
                    char* localChar = (char*) &local[j];
                    for ( int k = 0; k < sizeof(uint64_t); k++ ) {
                        if ( latestChar[k] != refChar[k] && localChar[k] == refChar[k] ) {
                            localChar[k] = latestChar[k];
                        }
                    }
                }
            }
        }
#endif

#ifdef SSE_MERGE
        const char* latestP = LATEST[i];
        const char* refP = REF[i];
        char* localP = LOCAL[i];
        for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) {
            __m128i latest = _mm_load_si128( (__m128i*) (latestP+j) );
            __m128i ref = _mm_load_si128( (__m128i*) (refP+j) );
            __m128i latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latEqRef is all ones
            if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) { // some bytes differ
                __m128i local = _mm_load_si128( (__m128i*) (localP+j) );
                __m128i localEqRef = _mm_cmpeq_epi8(local, ref);
                if ( _mm_testc_si128(localEqRef, isTrue) ) { // local == ref
                    _mm_stream_si128( (__m128i*) (localP+j), latest );
                } else {
                    // (~latEqRef) & localEqRef: bytes where latest!=ref && local==ref
                    __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef );
                    // new = (latestMask & latest) | (~latestMask & local);
                    __m128i latestBytes = _mm_and_si128(latestMask, latest);
                    __m128i localBytes = _mm_andnot_si128(latestMask, local);
                    latestBytes = _mm_or_si128(latestBytes, localBytes);
                    _mm_stream_si128( (__m128i*) (localP+j), latestBytes );
                }
            }
        }
#endif

#ifdef SSE_MERGE_NOBRANCH
        for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) {
            __m128i latest = _mm_load_si128( (__m128i*) &LATEST[i][j] );
            __m128i ref = _mm_load_si128( (__m128i*) &REF[i][j] );
            __m128i local = _mm_load_si128( (__m128i*) &LOCAL[i][j] );
            __m128i latref = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latref is all ones
            __m128i tmp = _mm_cmpeq_epi8(local, ref);
            latref = _mm_andnot_si128( latref, tmp ); // (~latref) & localref
            // update = (latref & latest) | (~latref & local);
            tmp = _mm_and_si128(latref, latest);
            __m128i localBytes = _mm_andnot_si128(latref, local);
            tmp = _mm_or_si128(tmp, localBytes);
            _mm_stream_si128( (__m128i*) &LOCAL[i][j], tmp );
        }
#endif

#ifdef SSE_MERGE_UNROLL
        // manually unroll this loop since gcc won't do it; ugh
        const char* latestP = LATEST[i];
        const char* refP = REF[i];
        char* localP = LOCAL[i];
        for (int j = 0; j < PAGE_SIZE; j += sizeof(__m128i)) {
            __m128i latest = _mm_load_si128( (__m128i*) (latestP+j) );
            __m128i ref = _mm_load_si128( (__m128i*) (refP+j) );
            __m128i latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latEqRef is all ones
            if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) { // some bytes differ
                __m128i local = _mm_load_si128( (__m128i*) (localP+j) );
                __m128i localEqRef = _mm_cmpeq_epi8(local, ref);
                if ( _mm_testc_si128(localEqRef, isTrue) ) { // local == ref
                    _mm_stream_si128( (__m128i*) (localP+j), latest );
                } else {
                    // (~latEqRef) & localEqRef: bytes where latest!=ref && local==ref
                    __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef );
                    // new = (latestMask & latest) | (~latestMask & local);
                    __m128i latestBytes = _mm_and_si128(latestMask, latest);
                    __m128i localBytes = _mm_andnot_si128(latestMask, local);
                    latestBytes = _mm_or_si128(latestBytes, localBytes);
                    _mm_stream_si128( (__m128i*) (localP+j), latestBytes );
                }
            }

            j += sizeof(__m128i);
            latest = _mm_load_si128( (__m128i*) (latestP+j) );
            ref = _mm_load_si128( (__m128i*) (refP+j) );
            latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latEqRef is all ones
            if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) { // some bytes differ
                __m128i local = _mm_load_si128( (__m128i*) (localP+j) );
                __m128i localEqRef = _mm_cmpeq_epi8(local, ref);
                if ( _mm_testc_si128(localEqRef, isTrue) ) { // local == ref
                    _mm_stream_si128( (__m128i*) (localP+j), latest );
                } else {
                    // (~latEqRef) & localEqRef: bytes where latest!=ref && local==ref
                    __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef );
                    // new = (latestMask & latest) | (~latestMask & local);
                    __m128i latestBytes = _mm_and_si128(latestMask, latest);
                    __m128i localBytes = _mm_andnot_si128(latestMask, local);
                    latestBytes = _mm_or_si128(latestBytes, localBytes);
                    _mm_stream_si128( (__m128i*) (localP+j), latestBytes );
                }
            }

            j += sizeof(__m128i);
            latest = _mm_load_si128( (__m128i*) (latestP+j) );
            ref = _mm_load_si128( (__m128i*) (refP+j) );
            latEqRef = _mm_cmpeq_epi8(latest, ref); // if latest == ref, latEqRef is all ones
            if ( unlikely(!_mm_testc_si128(latEqRef, isTrue)) ) { // some bytes differ
                __m128i local = _mm_load_si128( (__m128i*) (localP+j) );
                __m128i localEqRef = _mm_cmpeq_epi8(local, ref);
                if ( _mm_testc_si128(localEqRef, isTrue) ) { // local == ref
                    _mm_stream_si128( (__m128i*) (localP+j), latest );
                } else {
                    // (~latEqRef) & localEqRef: bytes where latest!=ref && local==ref
                    __m128i latestMask = _mm_andnot_si128( latEqRef, localEqRef );
                    // new = (latestMask & latest) | (~latestMask & local);
                    __m128i latestBytes = _mm_and_si128(latestMask, latest);
                    __m128i localBytes = _mm_andnot_si128(latestMask, local);
                    latestBytes = _mm_or_si128(latestBytes, localBytes);
                    _mm_stream_si128( (__m128i*) (localP+j), latestBytes );
                }
            }
        }
#endif
    }
}
void alphaBlendSSE_8u(Mat& src1, Mat& src2, Mat& alpha, Mat& dest)
{
    if (dest.empty()) dest.create(src1.size(), CV_8U);

    const int imsize = (src1.size().area() / 16);
    uchar* s1 = src1.data;
    uchar* s2 = src2.data;
    uchar* a = alpha.data;
    uchar* d = dest.data;

    const __m128i zero = _mm_setzero_si128();
    const __m128i amax = _mm_set1_epi8(char(255));
    int i = 0;

    if (s1 == d)
    {
        // in-place blend: stream the result back over src1
        for (; i < imsize; ++i)
        {
            __m128i ms1h = _mm_load_si128((__m128i*)(s1));
            __m128i ms2h = _mm_load_si128((__m128i*)(s2));
            __m128i mah  = _mm_load_si128((__m128i*)(a));
            __m128i imah = _mm_sub_epi8(amax, mah);

            __m128i ms1l = _mm_unpacklo_epi8(ms1h, zero);
            ms1h = _mm_unpackhi_epi8(ms1h, zero);
            __m128i ms2l = _mm_unpacklo_epi8(ms2h, zero);
            ms2h = _mm_unpackhi_epi8(ms2h, zero);
            __m128i mal = _mm_unpacklo_epi8(mah, zero);
            mah = _mm_unpackhi_epi8(mah, zero);
            __m128i imal = _mm_unpacklo_epi8(imah, zero);
            imah = _mm_unpackhi_epi8(imah, zero);

            ms1l = _mm_mullo_epi16(ms1l, mal);
            ms2l = _mm_mullo_epi16(ms2l, imal);
            ms1l = _mm_add_epi16(ms1l, ms2l);
            //ms1l = _mm_srli_epi16(ms1l,8);
            ms1l = _mm_srai_epi16(ms1l, 8);

            ms1h = _mm_mullo_epi16(ms1h, mah);
            ms2h = _mm_mullo_epi16(ms2h, imah);
            ms1h = _mm_add_epi16(ms1h, ms2h);
            //ms1h = _mm_srli_epi16(ms1h,8);
            ms1h = _mm_srai_epi16(ms1h, 8);

            _mm_stream_si128((__m128i*)s1, _mm_packs_epi16(ms1l, ms1h));

            s1 += 16;
            s2 += 16;
            a  += 16;
        }
    }
    else
    {
        // out-of-place blend: regular stores into dest
        for (; i < imsize; ++i)
        {
            __m128i ms1h = _mm_load_si128((__m128i*)(s1));
            __m128i ms2h = _mm_load_si128((__m128i*)(s2));
            __m128i mah  = _mm_load_si128((__m128i*)(a));
            __m128i imah = _mm_sub_epi8(amax, mah);

            __m128i ms1l = _mm_unpacklo_epi8(ms1h, zero);
            ms1h = _mm_unpackhi_epi8(ms1h, zero);
            __m128i ms2l = _mm_unpacklo_epi8(ms2h, zero);
            ms2h = _mm_unpackhi_epi8(ms2h, zero);
            __m128i mal = _mm_unpacklo_epi8(mah, zero);
            mah = _mm_unpackhi_epi8(mah, zero);
            __m128i imal = _mm_unpacklo_epi8(imah, zero);
            imah = _mm_unpackhi_epi8(imah, zero);

            ms1l = _mm_mullo_epi16(ms1l, mal);
            ms2l = _mm_mullo_epi16(ms2l, imal);
            ms1l = _mm_add_epi16(ms1l, ms2l);
            //ms1l = _mm_srli_epi16(ms1l,8);
            ms1l = _mm_srai_epi16(ms1l, 8);

            ms1h = _mm_mullo_epi16(ms1h, mah);
            ms2h = _mm_mullo_epi16(ms2h, imah);
            ms1h = _mm_add_epi16(ms1h, ms2h);
            //ms1h = _mm_srli_epi16(ms1h,8);
            ms1h = _mm_srai_epi16(ms1h, 8);

            _mm_store_si128((__m128i*)d, _mm_packs_epi16(ms1l, ms1h));

            s1 += 16;
            s2 += 16;
            a  += 16;
            d  += 16;
        }
    }

    {
        // scalar tail for the pixels not covered by the 16-byte loop
        uchar* s1 = src1.data;
        uchar* s2 = src2.data;
        uchar* a = alpha.data;
        uchar* d = dest.data;
        for (int n = i * 16; n < src1.size().area(); n++)
        {
            d[n] = (a[n] * s1[n] + (255 - a[n]) * s2[n]) >> 8;
        }
    }
}