/* Regression test: stream-load NUM 128-bit vectors with MOVNTDQA and
 * verify the destination matches the source byte-for-byte. */
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM];
      int i[NUM * 4];
    } dst, src;
  int k;

  init_movntdqa (src.i);

  /* Non-temporal load of each 16-byte lane into the destination.  */
  for (k = 0; k != NUM; k++)
    dst.x[k] = _mm_stream_load_si128 (&src.x[k]);

  /* Any mismatching lane aborts the test.  */
  for (k = 0; k != NUM; k++)
    if (memcmp (&dst.x[k], &src.x[k], sizeof (src.x[k])) != 0)
      abort ();
}
/*
 * Copy one video frame from GPU-mapped (typically USWC) memory to system
 * memory, staging rows through a small cache-resident bounce buffer:
 * rows are pulled in with MOVNTDQA streaming loads, then written out with
 * streaming (or unaligned, when needed) stores.
 *
 * pSrc        - source frame
 * pDest       - destination frame
 * pCacheBlock - staging buffer of CACHED_BUFFER_SIZE bytes; assumed
 *               16-byte aligned (see the disabled assert below)
 * width       - payload bytes per row to copy to pDest
 * height      - number of rows
 * pitch       - bytes per row in both buffers; the loops assume pitch is
 *               a multiple of 64 and pitch <= CACHED_BUFFER_SIZE --
 *               TODO confirm callers guarantee this (otherwise
 *               rowsPerBlock is 0 and the outer loop never advances).
 *
 * NOTE(review): MOVNTDQA itself requires a 16-byte-aligned address, so if
 * pSrc is truly unaligned the _mm_stream_load_si128 calls below would
 * fault; the src_unaligned branch only switches the *store* into the
 * (already aligned) cache buffer. Verify callers never pass an unaligned
 * pSrc.
 */
void CopyGPUFrame_SSE4_1(void *pSrc, void *pDest, void *pCacheBlock, UINT width, UINT height, UINT pitch)
{
#if QTAV_HAVE(SSE4_1)
    //assert(((intptr_t)pCacheBlock & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
    __m128i x0, x1, x2, x3;
    __m128i *pLoad;
    __m128i *pStore;
    __m128i *pCache;
    UINT x, y, yLoad, yStore;
    UINT rowsPerBlock;
    UINT width64;
    UINT extraPitch;

    rowsPerBlock = CACHED_BUFFER_SIZE / pitch;  /* rows staged per pass */
    width64 = (width + 63) & ~0x03f;            /* width rounded up to a 64-byte multiple */
    extraPitch = (pitch - width64) / 16;        /* per-row padding, in __m128i units */
    pLoad = (__m128i *)pSrc;
    pStore = (__m128i *)pDest;
    /* Low-4-bit test: nonzero means the pointer is not 16-byte aligned. */
    const bool src_unaligned = ((intptr_t)pSrc) & 0x0f;
    const bool dst_unaligned = ((intptr_t)pDest & 0x0f);
    //if (src_unaligned || dst_unaligned)
    //    qDebug("===========unaligned: src %d, dst: %d, extraPitch: %d", src_unaligned, dst_unaligned, extraPitch);

    // COPY THROUGH 4KB CACHED BUFFER
    for (y = 0; y < height; y += rowsPerBlock) {
        // ROWS LEFT TO COPY AT END
        if (y + rowsPerBlock > height)
            rowsPerBlock = height - y;
        pCache = (__m128i *)pCacheBlock;
        /* Fence separates this pass's loads from the previous pass's
         * non-temporal stores. */
        _mm_mfence();

        // LOAD ROWS OF PITCH WIDTH INTO CACHED BLOCK
        for (yLoad = 0; yLoad < rowsPerBlock; yLoad++) {
            // COPY A ROW, CACHE LINE AT A TIME
            for (x = 0; x < pitch; x +=64) {
                // movntdqa: streaming load, efficient from USWC memory
                x0 = _mm_stream_load_si128(pLoad + 0);
                x1 = _mm_stream_load_si128(pLoad + 1);
                x2 = _mm_stream_load_si128(pLoad + 2);
                x3 = _mm_stream_load_si128(pLoad + 3);
                if (src_unaligned) {
                    // movdqu
                    _mm_storeu_si128(pCache +0, x0);
                    _mm_storeu_si128(pCache +1, x1);
                    _mm_storeu_si128(pCache +2, x2);
                    _mm_storeu_si128(pCache +3, x3);
                } else {
                    // movdqa
                    _mm_store_si128(pCache +0, x0);
                    _mm_store_si128(pCache +1, x1);
                    _mm_store_si128(pCache +2, x2);
                    _mm_store_si128(pCache +3, x3);
                }
                pCache += 4;
                pLoad += 4;
            }
        }

        _mm_mfence();
        pCache = (__m128i *)pCacheBlock;

        // STORE ROWS OF FRAME WIDTH FROM CACHED BLOCK
        for (yStore = 0; yStore < rowsPerBlock; yStore++) {
            // copy a row, cache line at a time
            for (x = 0; x < width64; x += 64) {
                // movdqa: the staging buffer is assumed aligned
                x0 = _mm_load_si128(pCache);
                x1 = _mm_load_si128(pCache + 1);
                x2 = _mm_load_si128(pCache + 2);
                x3 = _mm_load_si128(pCache + 3);
                if (dst_unaligned) {
                    // movdqu
                    _mm_storeu_si128(pStore, x0);
                    _mm_storeu_si128(pStore + 1, x1);
                    _mm_storeu_si128(pStore + 2, x2);
                    _mm_storeu_si128(pStore + 3, x3);
                } else {
                    // movntdq: streaming store, bypasses the cache
                    _mm_stream_si128(pStore, x0);
                    _mm_stream_si128(pStore + 1, x1);
                    _mm_stream_si128(pStore + 2, x2);
                    _mm_stream_si128(pStore + 3, x3);
                }
                pCache += 4;
                pStore += 4;
            }
            /* Skip the padding between width64 and pitch in both buffers. */
            pCache += extraPitch;
            pStore += extraPitch;
        }
    }
#else
    Q_UNUSED(pSrc);
    Q_UNUSED(pDest);
    Q_UNUSED(pCacheBlock);
    Q_UNUSED(width);
    Q_UNUSED(height);
    Q_UNUSED(pitch);
#endif //QTAV_HAVE(SSE4_1)
}
// src is WC MMIO of GPU BAR
// dest is host memory
/*
 * Copy n_bytes from write-combining (WC) MMIO into host memory using
 * MOVNTDQA streaming loads.
 *
 * Strategy: memcpy a small head until src is 16-byte aligned, then run
 * 8x-unrolled 128-bit streaming loads, choosing unaligned (movdqu) or
 * streaming (movntdq) stores depending on dest alignment, and memcpy any
 * tail. Always returns 0.
 */
int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes)
{
    int ret = 0;
#ifdef __SSE4_1__
    char *d = (char*)dest;
    uintptr_t d_int = (uintptr_t)d;
    const char *s = (const char *)src;
    uintptr_t s_int = (uintptr_t)s;
    size_t n = n_bytes;

    // align src to 128-bits (MOVNTDQA requires a 16-byte-aligned address)
    if (s_int & 0xf) {
        /* NOTE(review): `min` is not standard C -- presumably a project
         * macro; confirm it is defined wherever this file is built. */
        size_t nh = min(0x10 - (s_int & 0x0f), n);
        memcpy(d, s, nh);
        d += nh; d_int += nh;
        s += nh; s_int += nh;
        n -= nh;
    }

    if (d_int & 0xf) {
        // dest is not aligned to 128-bits: use unaligned stores
        __m128i r0,r1,r2,r3,r4,r5,r6,r7;
        // unroll 8
        while (n >= 8*sizeof(__m128i)) {
            /* Casts drop const: _mm_stream_load_si128 takes a plain
             * __m128i*; it does not write through the pointer. */
            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
            r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
            r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
            r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
            r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
            r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
            r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
            r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
            _mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
            _mm_storeu_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
            _mm_storeu_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
            _mm_storeu_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
            _mm_storeu_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
            _mm_storeu_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
            _mm_storeu_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
            _mm_storeu_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
            s += 8*sizeof(__m128i);
            d += 8*sizeof(__m128i);
            n -= 8*sizeof(__m128i);
        }
        // single-vector remainder
        while (n >= sizeof(__m128i)) {
            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
            _mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
            s += sizeof(__m128i);
            d += sizeof(__m128i);
            n -= sizeof(__m128i);
        }
    } else {
        // or it IS aligned: use streaming stores to avoid cache pollution
        __m128i r0,r1,r2,r3,r4,r5,r6,r7;
        // unroll 8
        while (n >= 8*sizeof(__m128i)) {
            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
            r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
            r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
            r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
            r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
            r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
            r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
            r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
            _mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
            _mm_stream_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
            _mm_stream_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
            _mm_stream_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
            _mm_stream_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
            _mm_stream_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
            _mm_stream_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
            _mm_stream_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
            s += 8*sizeof(__m128i);
            d += 8*sizeof(__m128i);
            n -= 8*sizeof(__m128i);
        }
        // single-vector remainder
        while (n >= sizeof(__m128i)) {
            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
            _mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
            s += sizeof(__m128i);
            d += sizeof(__m128i);
            n -= sizeof(__m128i);
        }
    }

    // tail smaller than one vector
    if (n)
        memcpy(d, s, n);

    // fencing because of NT stores
    // potential optimization: issue only when NT stores are actually emitted
    _mm_sfence();
#else
#error "this file should be compiled with -msse4.1"
#endif
    return ret;
}
/* Clang/FileCheck codegen test: verifies _mm_stream_load_si128 lowers to
 * the llvm.x86.sse41.movntdqa intrinsic and the MOVNTDQA instruction.
 * The CHECK comments are FileCheck patterns, not ordinary commentary --
 * do not edit them without updating the test expectations. */
__m128i test_mm_stream_load_si128(__m128i *a) {
  // CHECK-LABEL: test_mm_stream_load_si128
  // CHECK: call <2 x i64> @llvm.x86.sse41.movntdqa
  // CHECK-ASM: movntdqa
  return _mm_stream_load_si128(a);
}
/* Clang/FileCheck codegen test (const-pointer variant): verifies that
 * _mm_stream_load_si128 lowers to a plain 16-byte-aligned load tagged
 * with !nontemporal metadata rather than a target intrinsic.
 * The CHECK comments are FileCheck patterns -- do not edit casually. */
__m128i test_mm_stream_load_si128(__m128i const *a) {
  // CHECK-LABEL: test_mm_stream_load_si128
  // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16, !nontemporal
  return _mm_stream_load_si128(a);
}
/*
 * Copy a height x pitch-byte frame from source memory to destination
 * memory, staging rows through a small cache-resident bounce buffer
 * (pCacheBlock): streaming loads fill the buffer, streaming stores drain
 * it, copying width (rounded up to 64) bytes of each row to the output.
 *
 * Assumptions implied by the loops: pitch is a multiple of 64, all three
 * pointers are 16-byte aligned, and pitch <= CACHED_BUFFER_SIZE --
 * TODO confirm callers guarantee these.
 */
void CopyFrame(void * pSrc, void * pDest, void * pCacheBlock, UINT width, UINT height, UINT pitch)
{
    __m128i v0, v1, v2, v3;
    __m128i *in = (__m128i *)pSrc;
    __m128i *out = (__m128i *)pDest;
    __m128i *stage;
    UINT col, row, r;
    UINT blockRows = CACHED_BUFFER_SIZE / pitch;   /* rows staged per pass */
    UINT width64 = (width + 63) & ~0x03f;          /* width rounded up to 64 */
    UINT extraPitch = (pitch - width64) / 16;      /* row padding, in vectors */

    /* COPY THROUGH 4KB CACHED BUFFER */
    for (row = 0; row < height; row += blockRows) {
        /* The final pass may cover fewer rows. */
        if (row + blockRows > height)
            blockRows = height - row;

        stage = (__m128i *)pCacheBlock;
        _mm_mfence();

        /* Fill the staging buffer: blockRows full-pitch rows, one
         * 64-byte cache line (four vectors) per iteration. */
        for (r = 0; r < blockRows; r++) {
            for (col = 0; col < pitch; col += 64) {
                v0 = _mm_stream_load_si128(in + 0);
                v1 = _mm_stream_load_si128(in + 1);
                v2 = _mm_stream_load_si128(in + 2);
                v3 = _mm_stream_load_si128(in + 3);
                _mm_store_si128(stage + 0, v0);
                _mm_store_si128(stage + 1, v1);
                _mm_store_si128(stage + 2, v2);
                _mm_store_si128(stage + 3, v3);
                stage += 4;
                in += 4;
            }
        }

        _mm_mfence();
        stage = (__m128i *)pCacheBlock;

        /* Drain the staged rows: stream width64 bytes of each row to the
         * destination, then skip the pitch padding in both buffers. */
        for (r = 0; r < blockRows; r++) {
            for (col = 0; col < width64; col += 64) {
                v0 = _mm_load_si128(stage);
                v1 = _mm_load_si128(stage + 1);
                v2 = _mm_load_si128(stage + 2);
                v3 = _mm_load_si128(stage + 3);
                _mm_stream_si128(out, v0);
                _mm_stream_si128(out + 1, v1);
                _mm_stream_si128(out + 2, v2);
                _mm_stream_si128(out + 3, v3);
                stage += 4;
                out += 4;
            }
            stage += extraPitch;
            out += extraPitch;
        }
    }
}