Example #1
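A minimal correctness test in the style of the GCC i386 testsuite: every 128-bit lane of src is read with _mm_stream_load_si128 (MOVNTDQA) into dst and compared back with memcmp, aborting on any mismatch. NUM, init_movntdqa, and the required headers (<smmintrin.h>, <string.h>, <stdlib.h>) are provided by the surrounding test file, which is not part of this excerpt.
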
static void
sse4_1_test (void)
{
  union
    {
      __m128i x[NUM];
      int i[NUM * 4];
    } dst, src;
  int i;

  init_movntdqa (src.i);

  for (i = 0; i < NUM; i++)
    dst.x[i] = _mm_stream_load_si128 (&src.x[i]);

  for (i = 0; i < NUM; i++)
    if (memcmp (&dst.x[i], &src.x[i], sizeof(src.x[i])))
      abort ();
}
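
For context, a minimal harness in the spirit of that testsuite could look as follows; NUM, the initializer pattern, and the headers are illustrative assumptions, not the contents of the real test file:

#include <smmintrin.h>  /* SSE4.1: _mm_stream_load_si128 */
#include <string.h>     /* memcmp */
#include <stdlib.h>     /* abort */

#define NUM 20          /* hypothetical element count */

/* Hypothetical initializer: fill the source with a deterministic pattern. */
static void
init_movntdqa (int *src)
{
  int i;
  for (i = 0; i < NUM * 4; i++)
    src[i] = i * i - 3 * i + 7;
}

Compiled with -msse4.1, sse4_1_test can then be called after a runtime CPUID check for SSE4.1 support.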
Example #2
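A production variant of the same pattern from a Qt-based video player (the QTAV_HAVE guard and Q_UNUSED suggest QtAV): a decoded GPU frame in USWC memory is copied through a small cache-resident staging block. Rows are pulled in with streaming loads (MOVNTDQA), a fence separates the two phases, and the rows are then written to the destination, choosing aligned, unaligned, or streaming store variants based on the alignment of the source and destination pointers.
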
void CopyGPUFrame_SSE4_1(void *pSrc, void *pDest, void *pCacheBlock, UINT width, UINT height, UINT pitch)
{
#if QTAV_HAVE(SSE4_1)
    //assert(((intptr_t)pCacheBlock & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
    __m128i		x0, x1, x2, x3;
    __m128i		*pLoad;
    __m128i		*pStore;
    __m128i		*pCache;
    UINT		x, y, yLoad, yStore;
    UINT		rowsPerBlock;
    UINT		width64;
    UINT		extraPitch;

    rowsPerBlock = CACHED_BUFFER_SIZE / pitch;  // whole rows that fit in the staging block
    width64 = (width + 63) & ~0x03f;            // round width up to a multiple of 64 bytes
    extraPitch = (pitch - width64) / 16;        // row-end padding, in 16-byte __m128i units

    pLoad  = (__m128i *)pSrc;
    pStore = (__m128i *)pDest;

    const bool src_unaligned = ((intptr_t)pSrc  & 0x0f) != 0;
    const bool dst_unaligned = ((intptr_t)pDest & 0x0f) != 0;
    //if (src_unaligned || dst_unaligned)
      //  qDebug("===========unaligned: src %d, dst: %d,  extraPitch: %d", src_unaligned, dst_unaligned, extraPitch);
    //  COPY THROUGH 4KB CACHED BUFFER
    for (y = 0; y < height; y += rowsPerBlock) {
        //  ROWS LEFT TO COPY AT END
        if (y + rowsPerBlock > height)
            rowsPerBlock = height - y;

        pCache = (__m128i *)pCacheBlock;

        _mm_mfence();

        // LOAD ROWS OF PITCH WIDTH INTO CACHED BLOCK
        for (yLoad = 0; yLoad < rowsPerBlock; yLoad++) {
            // COPY A ROW, CACHE LINE AT A TIME
            for (x = 0; x < pitch; x += 64) {
                // movntdqa
                x0 = _mm_stream_load_si128(pLoad + 0);
                x1 = _mm_stream_load_si128(pLoad + 1);
                x2 = _mm_stream_load_si128(pLoad + 2);
                x3 = _mm_stream_load_si128(pLoad + 3);

                if (src_unaligned) {
                    // movdqu
                    _mm_storeu_si128(pCache + 0, x0);
                    _mm_storeu_si128(pCache + 1, x1);
                    _mm_storeu_si128(pCache + 2, x2);
                    _mm_storeu_si128(pCache + 3, x3);
                } else {
                    // movdqa
                    _mm_store_si128(pCache + 0, x0);
                    _mm_store_si128(pCache + 1, x1);
                    _mm_store_si128(pCache + 2, x2);
                    _mm_store_si128(pCache + 3, x3);
                }
                pCache += 4;
                pLoad += 4;
            }
        }

        _mm_mfence();

        pCache = (__m128i *)pCacheBlock;
        // STORE ROWS OF FRAME WIDTH FROM CACHED BLOCK
        for (yStore = 0; yStore < rowsPerBlock; yStore++) {
            // copy a row, cache line at a time
            for (x = 0; x < width64; x += 64) {
                // movdqa
                x0 = _mm_load_si128(pCache + 0);
                x1 = _mm_load_si128(pCache + 1);
                x2 = _mm_load_si128(pCache + 2);
                x3 = _mm_load_si128(pCache + 3);

                if (dst_unaligned) {
                    // movdqu
                    _mm_storeu_si128(pStore + 0, x0);
                    _mm_storeu_si128(pStore + 1, x1);
                    _mm_storeu_si128(pStore + 2, x2);
                    _mm_storeu_si128(pStore + 3, x3);
                } else {
                    // movntdq
                    _mm_stream_si128(pStore + 0, x0);
                    _mm_stream_si128(pStore + 1, x1);
                    _mm_stream_si128(pStore + 2, x2);
                    _mm_stream_si128(pStore + 3, x3);
                }
                pCache += 4;
                pStore += 4;
            }
            pCache += extraPitch;
            pStore += extraPitch;
        }
    }
#else
    Q_UNUSED(pSrc);
    Q_UNUSED(pDest);
    Q_UNUSED(pCacheBlock);
    Q_UNUSED(width);
    Q_UNUSED(height);
    Q_UNUSED(pitch);
#endif //QTAV_HAVE(SSE4_1)
}
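
A hedged usage sketch (not QtAV code; the caller, its name, and the buffer handling are illustrative assumptions): the staging block must be 16-byte aligned and CACHED_BUFFER_SIZE bytes long, 4 KB in the Intel sample this derives from.

#include <cstdlib>  // posix_memalign, free (on Windows, _aligned_malloc/_aligned_free)

// Hypothetical caller; assumes CACHED_BUFFER_SIZE and UINT are defined
// as in the snippet above.
void copyDecodedFrame(void *gpuMapped, void *sysMem,
                      UINT width, UINT height, UINT pitch)
{
    void *cacheBlock = nullptr;
    // _mm_store_si128 into the staging block requires 16-byte alignment.
    if (posix_memalign(&cacheBlock, 16, CACHED_BUFFER_SIZE) != 0)
        return;
    CopyGPUFrame_SSE4_1(gpuMapped, sysMem, cacheBlock, width, height, pitch);
    free(cacheBlock);
}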
Example #3
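A one-shot uncached copy from write-combining GPU memory (a mapped PCIe BAR) to host memory. The head is copied bytewise until the source is 16-byte aligned; the bulk then runs an 8-wide unrolled loop of MOVNTDQA loads paired with unaligned MOVDQU stores when the destination is misaligned, or with MOVNTDQ streaming stores when it is aligned; the tail falls back to memcpy, and a final SFENCE orders the non-temporal stores. The #error branch enforces compilation with -msse4.1; <smmintrin.h>, <string.h>, and <stdint.h> are assumed to be included by the surrounding file.
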
// src is WC MMIO of GPU BAR
// dest is host memory
int memcpy_uncached_load_sse41(void *dest, const void *src, size_t n_bytes)
{
    int ret = 0;
#ifdef __SSE4_1__
    char *d = (char*)dest;
    uintptr_t d_int = (uintptr_t)d;
    const char *s = (const char *)src;
    uintptr_t s_int = (uintptr_t)s;
    size_t n = n_bytes;

    // align src to 128-bits
    if (s_int & 0xf) {
        size_t nh = 0x10 - (s_int & 0x0f); // head bytes to the next 16-byte boundary
        if (nh > n)
            nh = n;
        memcpy(d, s, nh);
        d += nh; d_int += nh;
        s += nh; s_int += nh;
        n -= nh;
    }

    if (d_int & 0xf) { // dest is not aligned to 128-bits
        __m128i r0,r1,r2,r3,r4,r5,r6,r7;
        // unroll 8
        while (n >= 8*sizeof(__m128i)) {
            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
            r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
            r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
            r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
            r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
            r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
            r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
            r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
            _mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
            _mm_storeu_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
            _mm_storeu_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
            _mm_storeu_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
            _mm_storeu_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
            _mm_storeu_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
            _mm_storeu_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
            _mm_storeu_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
            s += 8*sizeof(__m128i);
            d += 8*sizeof(__m128i);
            n -= 8*sizeof(__m128i);
        }
        while (n >= sizeof(__m128i)) {
            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
            _mm_storeu_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
            s += sizeof(__m128i);
            d += sizeof(__m128i);
            n -= sizeof(__m128i);
        }
    } else { // or it IS aligned
        __m128i r0,r1,r2,r3,r4,r5,r6,r7;
        // unroll 8
        while (n >= 8*sizeof(__m128i)) {
            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
            r1 = _mm_stream_load_si128 ((__m128i *)(s+1*sizeof(__m128i)));
            r2 = _mm_stream_load_si128 ((__m128i *)(s+2*sizeof(__m128i)));
            r3 = _mm_stream_load_si128 ((__m128i *)(s+3*sizeof(__m128i)));
            r4 = _mm_stream_load_si128 ((__m128i *)(s+4*sizeof(__m128i)));
            r5 = _mm_stream_load_si128 ((__m128i *)(s+5*sizeof(__m128i)));
            r6 = _mm_stream_load_si128 ((__m128i *)(s+6*sizeof(__m128i)));
            r7 = _mm_stream_load_si128 ((__m128i *)(s+7*sizeof(__m128i)));
            _mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
            _mm_stream_si128((__m128i *)(d+1*sizeof(__m128i)), r1);
            _mm_stream_si128((__m128i *)(d+2*sizeof(__m128i)), r2);
            _mm_stream_si128((__m128i *)(d+3*sizeof(__m128i)), r3);
            _mm_stream_si128((__m128i *)(d+4*sizeof(__m128i)), r4);
            _mm_stream_si128((__m128i *)(d+5*sizeof(__m128i)), r5);
            _mm_stream_si128((__m128i *)(d+6*sizeof(__m128i)), r6);
            _mm_stream_si128((__m128i *)(d+7*sizeof(__m128i)), r7);
            s += 8*sizeof(__m128i);
            d += 8*sizeof(__m128i);
            n -= 8*sizeof(__m128i);
        }
        while (n >= sizeof(__m128i)) {
            r0 = _mm_stream_load_si128 ((__m128i *)(s+0*sizeof(__m128i)));
            _mm_stream_si128((__m128i *)(d+0*sizeof(__m128i)), r0);
            s += sizeof(__m128i);
            d += sizeof(__m128i);
            n -= sizeof(__m128i);
        }
    }

    if (n)
        memcpy(d, s, n);

    // fencing because of NT stores
    // potential optimization: issue only when NT stores are actually emitted
    _mm_sfence();

#else
#error "this file should be compiled with -msse4.1"
#endif
    return ret;
}
Example #4
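A FileCheck-based Clang codegen test asserting that the intrinsic lowers to the llvm.x86.sse41.movntdqa IR intrinsic and, in the emitted assembly, to a movntdqa instruction.
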
__m128i test_mm_stream_load_si128(__m128i *a) {
  // CHECK-LABEL: test_mm_stream_load_si128
  // CHECK: call <2 x i64> @llvm.x86.sse41.movntdqa
  // CHECK-ASM: movntdqa
  return _mm_stream_load_si128(a);
}
Example #5
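The newer form of the same Clang test: the pointer parameter is now const-qualified, and the intrinsic lowers to an ordinary 16-byte-aligned IR load carrying !nontemporal metadata instead of a target-specific intrinsic.
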
__m128i test_mm_stream_load_si128(__m128i const *a) {
  // CHECK-LABEL: test_mm_stream_load_si128
  // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 16, !nontemporal
  return _mm_stream_load_si128(a);
}
Example #6
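The code Example #2 builds on: this appears to be the widely circulated Intel sample for copying accelerated video decode (USWC) frame buffers through a 4 KB cached staging block. It handles only the aligned case, so pSrc, pDest, and pCacheBlock are assumed 16-byte aligned and pitch a multiple of 64 bytes.
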
void	CopyFrame(void * pSrc, void * pDest, void * pCacheBlock,
	UINT width, UINT height, UINT pitch)
{
	__m128i		x0, x1, x2, x3;
	__m128i		*pLoad;
	__m128i		*pStore;
	__m128i		*pCache;
	UINT		x, y, yLoad, yStore;
	UINT		rowsPerBlock;
	UINT		width64;
	UINT		extraPitch;


	rowsPerBlock = CACHED_BUFFER_SIZE / pitch;
	width64 = (width + 63) & ~0x03f;
	extraPitch = (pitch - width64) / 16;

	pLoad = (__m128i *)pSrc;
	pStore = (__m128i *)pDest;

	//  COPY THROUGH 4KB CACHED BUFFER
	for (y = 0; y < height; y += rowsPerBlock)
	{
		//  ROWS LEFT TO COPY AT END
		if (y + rowsPerBlock > height)
			rowsPerBlock = height - y;

		pCache = (__m128i *)pCacheBlock;

		_mm_mfence();

		// LOAD ROWS OF PITCH WIDTH INTO CACHED BLOCK
		for (yLoad = 0; yLoad < rowsPerBlock; yLoad++)
		{
			// COPY A ROW, CACHE LINE AT A TIME
			for (x = 0; x < pitch; x += 64)
			{
				x0 = _mm_stream_load_si128(pLoad + 0);
				x1 = _mm_stream_load_si128(pLoad + 1);
				x2 = _mm_stream_load_si128(pLoad + 2);
				x3 = _mm_stream_load_si128(pLoad + 3);

				_mm_store_si128(pCache + 0, x0);
				_mm_store_si128(pCache + 1, x1);
				_mm_store_si128(pCache + 2, x2);
				_mm_store_si128(pCache + 3, x3);

				pCache += 4;
				pLoad += 4;
			}
		}

		_mm_mfence();

		pCache = (__m128i *)pCacheBlock;

		// STORE ROWS OF FRAME WIDTH FROM CACHED BLOCK
		for (yStore = 0; yStore < rowsPerBlock; yStore++)
		{
			// copy a row, cache line at a time
			for (x = 0; x < width64; x += 64)
			{
				x0 = _mm_load_si128(pCache);
				x1 = _mm_load_si128(pCache + 1);
				x2 = _mm_load_si128(pCache + 2);
				x3 = _mm_load_si128(pCache + 3);

				_mm_stream_si128(pStore, x0);
				_mm_stream_si128(pStore + 1, x1);
				_mm_stream_si128(pStore + 2, x2);
				_mm_stream_si128(pStore + 3, x3);

				pCache += 4;
				pStore += 4;
			}

			pCache += extraPitch;
			pStore += extraPitch;
		}
	}
}