Example #1
VOID
MemUWriteCachelines (
  IN       UINT32 Address,
  IN       UINT8 Pattern[],
  IN       UINT16 ClCount
  )
{
  UINTN Index;
  CHAR8 *Position;
  __m128i *Src = (void *) Pattern;
  __m128i *Dest = (void *) (size_t)Address;

  Position = (void *) Pattern;

  // ssd - important: without this, the src data may get evicted from cache
  _mm_mfence ();

  for (Index = 0; Index < ClCount * 4; Index++){
    _mm_stream_si128_fs (Dest, Src);
    Src++;
    Dest++;
  }

  // ssd - might not be required, but no measurable boot time impact
  _mm_mfence ();
}
Example #2
__inline HRESULT __WaitNewSignals(
    IN PRX_BLOCK       pScanPoint,
    IN USHORT          uRetries,
    IN ULONG           VStreamMask,
    OUT FLAG           *fReachEnd
)
{
    ULONG   uSpinCount  = uRetries * 2;
    *fReachEnd = 0;
    while(uSpinCount != 0)
    {
        if (SORA_C_RXBUF_IS_VALID_EX(pScanPoint, VStreamMask))
        {
            return S_OK;
        }
        else
        {
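            // not valid yet: flush the cached descriptor and fence so the next
            // poll re-reads it from memory (intent inferred from the code)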
            _mm_clflush(pScanPoint);
            _mm_mfence();
            *fReachEnd = 1;
        }
        uSpinCount--;
    }

    return E_FETCH_SIGNAL_HW_TIMEOUT;
}
Example #3
//////////////////////////////////////////////////////////////////////////
/// @brief Called when FE work is complete for this DC.
INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
    {
        SWR_STATS_FE& stats = pDC->dynState.statsFE;

        AR_EVENT(FrontendStatsEvent(pDC->drawId,
            stats.IaVertices, stats.IaPrimitives, stats.VsInvocations, stats.HsInvocations,
            stats.DsInvocations, stats.GsInvocations, stats.GsPrimitives, stats.CInvocations, stats.CPrimitives,
            stats.SoPrimStorageNeeded[0], stats.SoPrimStorageNeeded[1], stats.SoPrimStorageNeeded[2], stats.SoPrimStorageNeeded[3],
            stats.SoNumPrimsWritten[0], stats.SoNumPrimsWritten[1], stats.SoNumPrimsWritten[2], stats.SoNumPrimsWritten[3]
        ));
        AR_EVENT(FrontendDrawEndEvent(pDC->drawId));

        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
    }

    if (pContext->pfnUpdateSoWriteOffset)
    {
        for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
        {
            if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
                (pDC->pState->state.soBuffer[i].soWriteEnable))
            {
                pContext->pfnUpdateSoWriteOffset(GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
            }
        }
    }

    // Ensure all streaming writes are globally visible before marking this FE done
    _mm_mfence();
    pDC->doneFE = true;

    InterlockedDecrement(&pContext->drawsOutstandingFE);
}
Example #4
	void epub_sys_cache_flush(void* start, long len)
	{
#if EPUB_CPU(X86) || EPUB_CPU(X64)
		if (start == nullptr || len <= 0)
			return;

		unsigned char* p = reinterpret_cast<unsigned char*>(start);

		// ensure all reads/writes complete before the next instruction
		_mm_mfence();

		// ensure the last line is flushed
		_mm_clflush(p + (len - 1));

		// flush all cache lines. lines are 64 bytes on both architectures
		while (len > 0)
		{
			_mm_clflush(p);
			p += 64;
			len -= 64;
		}
#elif EPUB_CPU(ARM)
		// no idea...
#endif
	}
Example #5
VOID
MemUMFenceInstr (
  VOID
  )
{
  _mm_mfence ();
}
Example #6
VOID __SoraHwTransferUnsafeNoWait(
    IN PTRANSFER_OBJ pTransferObj,
    IN PTX_DESC     pTxDesc)
{
    HRESULT hr = S_OK;
    __REG32_TRANS_CTRL TransCtrl;
    __PSORA_RADIO_REGS pRegs = pTransferObj->TransferReg;
    
    pTxDesc->__FrameCtrlOwn = 1; // software owns the buffer
    // make sure modulation buffer is flushed into memory.
    _mm_mfence(); 
    TransCtrl.Value = 0;
    TransCtrl.Bits.TransferInit  = 1;	

#ifdef DEBUG_CHECKSUM
    WRITE_REGISTER_ULONG(
        (PULONG)&pRegs->TransferChecksum, Checksum);
#endif

    WRITE_REGISTER_ULONG(
        (PULONG)&pRegs->TransferSrcAddrL, pTxDesc->ThisPa.u.LowPart);
    WRITE_REGISTER_ULONG(
        (PULONG)&pRegs->TransferSrcAddrH, pTxDesc->ThisPa.u.HighPart);
    WRITE_REGISTER_ULONG(
        (PULONG)&pRegs->TransferControl, TransCtrl.Value);
}
Example #7
void Lerp_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
	static const u32 PSD = 64;
	
	static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
	static const __m128i round = _mm_set1_epi16(128);

	assert(source1 != NULL && source2 != NULL && dest != NULL);
	assert(size % STRIDE == 0);
	assert(alpha >= 0.0 && alpha <= 1.0);

	const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
	const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
	__m128i* dest128 = reinterpret_cast<__m128i*>(dest);

	__m128i s = _mm_setzero_si128();
	__m128i d = _mm_setzero_si128();
	const __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));
	
	__m128i drb, dga, srb, sga;
	
	for (size_t k = 0, length = size/STRIDE; k < length; ++k)
	{		
		_mm_prefetch(reinterpret_cast<const char*>(source128_1 + PSD), _MM_HINT_NTA);	
		_mm_prefetch(reinterpret_cast<const char*>(source128_2 + PSD), _MM_HINT_NTA);
		// TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

		for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
		{
			// r = d + (s-d)*alpha/256
			s = _mm_load_si128(source128_1);	// AABBGGRR
			d = _mm_load_si128(source128_2);	// AABBGGRR

			srb = _mm_and_si128(lomask, s);		// 00BB00RR		// unpack
			sga = _mm_srli_epi16(s, 8);			// AA00GG00		// unpack
			
			drb = _mm_and_si128(lomask, d);		// 00BB00RR		// unpack
			dga = _mm_srli_epi16(d, 8);			// AA00GG00		// unpack

			srb = _mm_sub_epi16(srb, drb);		// BBBBRRRR		// sub
			srb = _mm_mullo_epi16(srb, a);		// BBBBRRRR		// mul
			srb = _mm_add_epi16(srb, round);
			
			sga = _mm_sub_epi16(sga, dga);		// AAAAGGGG		// sub
			sga = _mm_mullo_epi16(sga, a);		// AAAAGGGG		// mul
			sga = _mm_add_epi16(sga, round);

			srb = _mm_srli_epi16(srb, 8);		// 00BB00RR		// prepack and div
			sga = _mm_andnot_si128(lomask, sga);// AA00GG00		// prepack and div

			srb = _mm_or_si128(srb, sga);		// AABBGGRR		// pack

			srb = _mm_add_epi8(srb, d);			// AABBGGRR		// add		there is no overflow(R.N)

			_mm_stream_si128(dest128, srb);
		}
	}
	_mm_mfence();	//ensure last WC buffers get flushed to memory
}
Example #8
int sys_lwcond_wait(mem_ptr_t<sys_lwcond_t> lwcond, u64 timeout)
{
	sys_lwcond.Log("sys_lwcond_wait(lwcond_addr=0x%x, timeout=%lld)", lwcond.GetAddr(), timeout);

	if (!lwcond.IsGood())
	{
		return CELL_EFAULT;
	}

	SleepQueue* sq;
	if (!Emu.GetIdManager().GetIDData((u32)lwcond->lwcond_queue, sq))
	{
		return CELL_ESRCH;
	}

	mem_ptr_t<sys_lwmutex_t> mutex(lwcond->lwmutex);
	u32 tid_le = GetCurrentPPUThread().GetId();
	be_t<u32> tid = tid_le;

	if (mutex->mutex.owner.GetOwner() != tid)
	{
		return CELL_EPERM; // caller must own this lwmutex
	}

	sq->push(tid_le);

	mutex->recursive_count = 0;
	mutex->mutex.owner.unlock(tid);

	u32 counter = 0;
	const u32 max_counter = timeout ? (timeout / 1000) : ~0;
	while (true)
	{
		/* switch (mutex->trylock(tid))
		{
		case SMR_OK: mutex->unlock(tid); break;
		case SMR_SIGNAL: return CELL_OK;
		} */
		if (mutex->mutex.owner.GetOwner() == tid)
		{
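			// fence before writing recursive_count once ownership is observed (ordering assumed by the emulation)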
			_mm_mfence();
			mutex->recursive_count = 1;
			return CELL_OK;
		}

		Sleep(1);

		if (counter++ > max_counter)
		{
			sq->invalidate(tid_le);
			return CELL_ETIMEDOUT;
		}
		if (Emu.IsStopped())
		{
			ConLog.Warning("sys_lwcond_wait(sq=%d) aborted", (u32)lwcond->lwcond_queue);
			return CELL_OK;
		}
	}
}
Example #9
void test_mm_mfence() {
  // DAG-LABEL: test_mm_mfence
  // DAG: call void @llvm.x86.sse2.mfence()
  //
  // ASM-LABEL: test_mm_mfence
  // ASM: mfence
  _mm_mfence();
}
Example #10
void SPUThread::do_dma_transfer(u32 cmd, spu_mfc_arg_t args)
{
	if (cmd & (MFC_BARRIER_MASK | MFC_FENCE_MASK))
	{
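		// barrier/fence-flagged MFC commands are modelled with a host full memory fence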
		_mm_mfence();
	}

	u32 eal = VM_CAST(args.ea);

	if (eal >= SYS_SPU_THREAD_BASE_LOW && m_type == CPU_THREAD_SPU) // SPU Thread Group MMIO (LS and SNR)
	{
		const u32 index = (eal - SYS_SPU_THREAD_BASE_LOW) / SYS_SPU_THREAD_OFFSET; // thread number in group
		const u32 offset = (eal - SYS_SPU_THREAD_BASE_LOW) % SYS_SPU_THREAD_OFFSET; // LS offset or MMIO register

		const auto group = tg.lock();

		if (group && index < group->num && group->threads[index])
		{
			auto& spu = static_cast<SPUThread&>(*group->threads[index]);

			if (offset + args.size - 1 < 0x40000) // LS access
			{
				eal = spu.offset + offset; // redirect access
			}
			else if ((cmd & MFC_PUT_CMD) && args.size == 4 && (offset == SYS_SPU_THREAD_SNR1 || offset == SYS_SPU_THREAD_SNR2))
			{
				spu.push_snr(SYS_SPU_THREAD_SNR2 == offset, read32(args.lsa));
				return;
			}
			else
			{
				throw EXCEPTION("Invalid MMIO offset (cmd=0x%x, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)", cmd, args.lsa, args.ea, args.tag, args.size);
			}
		}
		else
		{
			throw EXCEPTION("Invalid thread type (cmd=0x%x, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)", cmd, args.lsa, args.ea, args.tag, args.size);
		}
	}

	switch (cmd & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK))
	{
	case MFC_PUT_CMD:
	case MFC_PUTR_CMD:
	{
		memcpy(vm::get_ptr(eal), vm::get_ptr(offset + args.lsa), args.size);
		return;
	}

	case MFC_GET_CMD:
	{
		memcpy(vm::get_ptr(offset + args.lsa), vm::get_ptr(eal), args.size);
		return;
	}
	}

	throw EXCEPTION("Invalid command %s (cmd=0x%x, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)", get_mfc_cmd_name(cmd), cmd, args.lsa, args.ea, args.tag, args.size);
}
Example #11
BOOST_FORCEINLINE void hardware_full_fence(void)
{
#if defined(_MSC_VER) && (defined(_M_AMD64) || (defined(_M_IX86) && defined(_M_IX86_FP) && _M_IX86_FP >= 2))
    // Use mfence only if SSE2 is available
    _mm_mfence();
#else
    long tmp;
    BOOST_ATOMIC_INTERLOCKED_EXCHANGE(&tmp, 0);
#endif
}
Example #12
void g() {
  (void)_mm_getcsr();
  _mm_setcsr(1);
  _mm_sfence();

  _mm_clflush((void*)0);
  _mm_lfence();
  _mm_mfence();
  _mm_pause();
}
Example #13
int sys_cond_wait(u32 cond_id, u64 timeout)
{
	sys_cond.Log("sys_cond_wait(cond_id=%d, timeout=%lld)", cond_id, timeout);

	Cond* cond;
	if (!Emu.GetIdManager().GetIDData(cond_id, cond))
	{
		return CELL_ESRCH;
	}

	Mutex* mutex = cond->mutex;
	u32 tid = GetCurrentPPUThread().GetId();

	if (mutex->m_mutex.GetOwner() != tid)
	{
		return CELL_EPERM;
	}

	cond->m_queue.push(tid);

	mutex->recursive = 0;
	mutex->m_mutex.unlock(tid);

	u32 counter = 0;
	const u32 max_counter = timeout ? (timeout / 1000) : ~0;

	while (true)
	{
		/* switch (mutex->m_mutex.trylock(tid))
		{
		case SMR_OK: mutex->m_mutex.unlock(tid); break;
		case SMR_SIGNAL: mutex->recursive = 1; return CELL_OK;
		} */
		if (mutex->m_mutex.GetOwner() == tid)
		{
			_mm_mfence();
			mutex->recursive = 1;
			return CELL_OK;
		}

		Sleep(1);

		if (counter++ > max_counter)
		{
			cond->m_queue.invalidate(tid);
			return CELL_ETIMEDOUT;
		}
		if (Emu.IsStopped())
		{
			ConLog.Warning("sys_cond_wait(id=%d) aborted", cond_id);
			return CELL_OK;
		}
	}
}
Example #14
    virtual void do_send_packet(const void* data, uint32_t length) override {
        REQUIRE(length <= 1500);

        if (!(ioreg(reg::STATUS) & STATUS_LU)) {
            dbgout() << "[i825x] Link not up. Dropping packet\n";
            return;
        }

        // prepare descriptor
        auto& td = tx_desc_[tx_tail_];
        tx_tail_ = (tx_tail_ + 1) % num_tx_descriptors;
        REQUIRE(td.upper.fields.status == 0);
        td.buffer_addr = virt_to_phys(data);
        td.lower.data = length | TXD_CMD_RS | TXD_CMD_EOP | TXD_CMD_IFCS;
        td.upper.data = 0;
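        // make the descriptor writes globally visible before ringing the tail doorbell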
        _mm_mfence();
        ioreg(reg::TDT0, tx_tail_);

        //dbgout() << "Waiting for packet to be sent.\n";
        for (uint32_t timeout = 100; !td.upper.fields.status; ) {
            __halt();
            if (!timeout--) {
#if 0
                // Dump stats
                constexpr uint32_t nstats = 0x100 / 4;
                static uint32_t stats[nstats];
                for (uint32_t i = 0; i < nstats; ++i) {
                    stats[i] += ioreg(static_cast<reg>(0x4000 + i * 4));
                    dbgout() << as_hex(stats[i]);
                    dbgout() << ((i % 8 == 7) ? '\n' : ' ');
                }
#endif
                dbgout() << "Transfer NOT done. Timed out! STATUS = " << as_hex(ioreg(reg::STATUS)) << " TDH = " << ioreg(reg::TDH0) << " TDT " <<  ioreg(reg::TDT0) << "\n";
#if 0
                for (uint32_t i = 0; i < num_tx_descriptors; ++i) {
                    dbgout() << as_hex(tx_desc_[i].buffer_addr) << " ";
                    dbgout() << as_hex(tx_desc_[i].lower.data) << " ";
                    dbgout() << as_hex(tx_desc_[i].upper.data) << " ";
                    dbgout() << ((i % 3 == 2) ? '\n' : ' ');
                }
                dbgout() << "\n";
#endif
                return;
            }
        }
        //dbgout() << "[i825x] TX Status = " << as_hex(td.upper.fields.status) << "\n";
        REQUIRE(td.upper.fields.status == TXD_STAT_DD);
        td.upper.data = 0; // Mark ready for re-use
        REQUIRE(ioreg(reg::TDH0) == ioreg(reg::TDT0));

    }
Example #15
VOID
MemUReadCachelines (
  IN       UINT8 Buffer[],
  IN       UINT32 Address,
  IN       UINT16 ClCount
  )
{
  UINTN Index;
  UINT32 *Dest;

  for (Index = 0; Index < ClCount * 16; Index++) {
    Dest = (void *) &Buffer [Index * 4];
    *Dest = __readfsdword (Address + Index * 4);
    _mm_mfence ();
  }
}
Example #16
void f() {
  (void)_mm_getcsr(); // expected-warning{{implicitly declaring library function '_mm_getcsr'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_getcsr'}}
  _mm_setcsr(1); // expected-warning{{implicitly declaring library function '_mm_setcsr'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_setcsr'}}
  _mm_sfence(); // expected-warning{{implicitly declaring library function '_mm_sfence'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_sfence'}}

  _mm_clflush((void*)0); // expected-warning{{implicitly declaring library function '_mm_clflush'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_clflush'}}
  _mm_lfence(); // expected-warning{{implicitly declaring library function '_mm_lfence'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_lfence'}}
  _mm_mfence(); // expected-warning{{implicitly declaring library function '_mm_mfence'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_mfence'}}
  _mm_pause(); // expected-warning{{implicitly declaring library function '_mm_pause'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_pause'}}
}
Example #17
//----------------------------------------------------------------------------
// MemUFlushPattern:
//
//  Flush a pattern of 72 bit times (per DQ) from cache.  This procedure is used
//  to ensure a cache miss on the next read training.
//
//              In: Address   - Physical address to be flushed
//                  ClCount   - number of cachelines to be flushed
//FUNC_ATTRIBUTE(noinline)
VOID
MemUFlushPattern (
  IN       UINT32 Address,
  IN       UINT16 ClCount
  )
{
  UINTN Index;

  // ssd - theory: a tlb flush is needed to avoid problems with clflush
  __writemsr (0x20F, __readmsr (0x20F));

  for (Index = 0; Index < ClCount; Index++) {
    // mfence prevents speculative execution of the clflush
    _mm_mfence ();
    _mm_clflush_fs ((void *) (size_t) (Address + Index * 64));
  }
}
Example #18
KOKKOS_FORCEINLINE_FUNCTION
void memory_fence()
{
#if defined( KOKKOS_ATOMICS_USE_CUDA )
  __threadfence();
#elif defined( KOKKOS_ATOMICS_USE_GCC ) || \
      ( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ATOMICS_USE_INTEL ) )
  __sync_synchronize();
#elif defined( KOKKOS_ATOMICS_USE_INTEL )
  _mm_mfence();
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
  #pragma omp flush

#else
 #error "Error: memory_fence() not defined"
#endif
}
Example #19
 inline void operator()(const tbb::blocked_range<size_t> &range) const {
     for (size_t colIndex = range.begin(); colIndex < range.end(); ++colIndex) {
         double *__restrict__ col = &A[colIndex * LDA];
         int i = 0;
         for (; i < permSize - 8; i += 2) {
             //_m_prefetchw(&col[p[8].a]);
             //_m_prefetchw(&col[p[8].b]);
             swap(col[perm[i + 0].a], col[perm[i + 0].b]);
             swap(col[perm[i + 1].a], col[perm[i + 1].b]);
         }
         for (; i < permSize; ++i) {
             const int rowIndex = perm[i].a;
             const int otherRow = perm[i].b;
             swap(col[rowIndex], col[otherRow]);
         }
     }
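     // full fence before returning, presumably so the permuted columns are visible to other threads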
     _mm_mfence();
 }
Example #20
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any compute work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
void WorkOnCompute(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint32_t& curDrawBE)
{
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return;
    }

    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;

    for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
        if (pDC->isCompute == false) return;

        // check dependencies
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return;
        }

        SWR_ASSERT(pDC->pDispatch != nullptr);
        DispatchQueue& queue = *pDC->pDispatch;

        // Is there any work remaining?
        if (queue.getNumQueued() > 0)
        {
            void* pSpillFillBuffer = nullptr;
            void* pScratchSpace = nullptr;
            uint32_t threadGroupId = 0;
            while (queue.getWork(threadGroupId))
            {
                queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
                queue.finishedWork();
            }

            // Ensure all streaming writes are globally visible before moving onto the next draw
            _mm_mfence();
        }
    }
}
Example #21
void Shuffle_SSSE3(void* dest, const void* source, size_t size, const u8 red, const u8 green, const u8 blue, const u8 alpha)
{
	static const unsigned int PSD = 64;	

	assert(source != NULL && dest != NULL);
	assert(red > -1 && red < 4 && green > -1 && green < 4 && blue > -1 && blue < 4 && alpha > -1 && alpha < 4 && "Invalid mask");
	assert(size % STRIDE == 0);

	const __m128i* source128 = reinterpret_cast<const __m128i*>(source);
	__m128i*	   dest128 = reinterpret_cast<__m128i*>(dest);	

	__m128i reg0 = _mm_setzero_si128();	
	__m128i reg1 = _mm_setzero_si128();	
	__m128i reg2 = _mm_setzero_si128();	
	__m128i reg3 = _mm_setzero_si128();	

	const __m128i mask128 = _mm_set_epi8(alpha+12, blue+12, green+12, red+12, alpha+8, blue+8, green+8, red+8, alpha+4, blue+4, green+4, red+4, alpha, blue, green, red);

	for(size_t k = 0, length = size/STRIDE; k < length; ++k)	
	{
		// TODO: put prefetch between calculations?(R.N)
		_mm_prefetch(reinterpret_cast<const s8*>(source128 + PSD), _MM_HINT_NTA);

		// work on entire cacheline before next prefetch

		// TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

		reg0 = _mm_load_si128(source128++);	
		reg1 = _mm_load_si128(source128++);	

		_mm_stream_si128(dest128++, _mm_shuffle_epi8(reg0, mask128));

		reg2 = _mm_load_si128(source128++);	

		_mm_stream_si128(dest128++, _mm_shuffle_epi8(reg1, mask128));

		reg3 = _mm_load_si128(source128++);	
		
		_mm_stream_si128(dest128++, _mm_shuffle_epi8(reg2, mask128));	
		_mm_stream_si128(dest128++, _mm_shuffle_epi8(reg3, mask128));		
	}
	_mm_mfence();	//ensure last WC buffers get flushed to memory
}
Example #22
void SPUThread::dmaTransfer(U32 cmd, U32 eal, U32 lsa, U32 size) {
    if (cmd & (MFC_BARRIER_ENABLE | MFC_FENCE_ENABLE)) {
#ifdef NUCLEUS_ARCH_X86
        _mm_mfence();
#endif
    }

    const auto& memory = parent->memory;
    switch (cmd & ~(MFC_BARRIER_ENABLE | MFC_FENCE_ENABLE)) {
    case MFC_PUT_CMD:
    case MFC_PUTR_CMD:
        memcpy(memory->ptr(eal), memory->ptr(lsa), size);
        break;
    case MFC_GET_CMD:
        memcpy(memory->ptr(lsa), memory->ptr(eal), size);
        break;
    default:
        assert_true("Unexpected");
    }
}
Example #23
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
    if (!KNOB_SINGLE_THREADED)
    {
        // Inform threads to finish up
        std::unique_lock<std::mutex> lock(pContext->WaitLock);
        pPool->inThreadShutdown = true;
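        // make the shutdown flag visible to worker threads before waking them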
        _mm_mfence();
        pContext->FifosNotEmpty.notify_all();
        lock.unlock();

        // Wait for threads to finish and destroy them
        for (uint32_t t = 0; t < pPool->numThreads; ++t)
        {
            pPool->threads[t]->join();
            delete(pPool->threads[t]);
        }

        // Clean up data used by threads
        free(pPool->pThreadData);
    }
}
Example #24
int cellSyncMutexLock(mem_ptr_t<CellSyncMutex> mutex)
{
	cellSync->Log("cellSyncMutexLock(mutex=0x%x)", mutex.GetAddr());

	if (!mutex.IsGood())
	{
		return CELL_SYNC_ERROR_NULL_POINTER;
	}
	if (mutex.GetAddr() % 4)
	{
		return CELL_SYNC_ERROR_ALIGN;
	}

	be_t<u16> old_order;
	while (true)
	{
		const u32 old_data = mutex->m_data();
		CellSyncMutex new_mutex;
		new_mutex.m_data() = old_data;

		old_order = new_mutex.m_order;
		new_mutex.m_order++;
		if (InterlockedCompareExchange(&mutex->m_data(), new_mutex.m_data(), old_data) == old_data) break;
	}

	while (old_order != mutex->m_freed) 
	{
		Sleep(1);
		if (Emu.IsStopped())
		{
			ConLog.Warning("cellSyncMutexLock(mutex=0x%x) aborted", mutex.GetAddr());
			break;
		}
	}
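	// full fence after the wait loop, presumably to order the lock acquisition with later accesses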
	_mm_mfence();
	return CELL_OK;
}
Example #25
void f0() {
  signed char         tmp_c;
//  unsigned char       tmp_Uc;
  signed short        tmp_s;
#ifdef USE_ALL
  unsigned short      tmp_Us;
#endif
  signed int          tmp_i;
  unsigned int        tmp_Ui;
  signed long long    tmp_LLi;
  unsigned long long  tmp_ULLi;
  float               tmp_f;
  double              tmp_d;

  void*          tmp_vp;
  const void*    tmp_vCp;
  char*          tmp_cp; 
  const char*    tmp_cCp; 
  int*           tmp_ip;
  float*         tmp_fp;
  const float*   tmp_fCp;
  double*        tmp_dp;
  const double*  tmp_dCp;
  long long*     tmp_LLip;

#define imm_i 32
#define imm_i_0_2 0
#define imm_i_0_4 3
#define imm_i_0_8 7
#define imm_i_0_16 15
  // Check this.
#define imm_i_0_256 0

  V2i*   tmp_V2ip;
  V1LLi* tmp_V1LLip;
  V2LLi* tmp_V2LLip;

  // 64-bit
  V8c    tmp_V8c;
  V4s    tmp_V4s;
  V2i    tmp_V2i;
  V1LLi  tmp_V1LLi;
#ifdef USE_3DNOW
  V2f    tmp_V2f;
#endif

  // 128-bit
  V16c   tmp_V16c;
  V8s    tmp_V8s;
  V4i    tmp_V4i;
  V2LLi  tmp_V2LLi;
  V4f    tmp_V4f;
  V2d    tmp_V2d;
  V2d*   tmp_V2dp;
  V4f*   tmp_V4fp;
  const V2d* tmp_V2dCp;
  const V4f* tmp_V4fCp;

  // 256-bit
  V32c   tmp_V32c;
  V4d    tmp_V4d;
  V8f    tmp_V8f;
  V4LLi  tmp_V4LLi;
  V8i    tmp_V8i;
  V4LLi* tmp_V4LLip;
  V4d*   tmp_V4dp;
  V8f*   tmp_V8fp;
  const V4d* tmp_V4dCp;
  const V8f* tmp_V8fCp;

  tmp_V2LLi = __builtin_ia32_undef128();
  tmp_V4LLi = __builtin_ia32_undef256();

  tmp_i = __builtin_ia32_comieq(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comilt(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comile(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comigt(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comige(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comineq(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomieq(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomilt(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomile(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomigt(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomige(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_ucomineq(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_comisdeq(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdlt(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdle(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdgt(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdge(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_comisdneq(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdeq(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdlt(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdle(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdgt(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdge(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_ucomisdneq(tmp_V2d, tmp_V2d);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 0);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 1);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 2);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 3);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 4);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 5);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 6);
  tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 7);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 0);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 1);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 2);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 3);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 4);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 5);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 6);
  tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 7);
  tmp_V4f = __builtin_ia32_minps(tmp_V4f, tmp_V4f);
  tmp_V4f = __builtin_ia32_maxps(tmp_V4f, tmp_V4f);
  tmp_V4f = __builtin_ia32_minss(tmp_V4f, tmp_V4f);
  tmp_V4f = __builtin_ia32_maxss(tmp_V4f, tmp_V4f);

  tmp_V8c = __builtin_ia32_paddsb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_paddsw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_psubsb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_psubsw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_paddusb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_paddusw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_psubusb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_psubusw(tmp_V4s, tmp_V4s);
  tmp_V4s = __builtin_ia32_pmulhw(tmp_V4s, tmp_V4s);
  tmp_V4s = __builtin_ia32_pmulhuw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_pcmpeqb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_pcmpeqw(tmp_V4s, tmp_V4s);
  tmp_V2i = __builtin_ia32_pcmpeqd(tmp_V2i, tmp_V2i);
  tmp_V8c = __builtin_ia32_pcmpgtb(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_pcmpgtw(tmp_V4s, tmp_V4s);
  tmp_V2i = __builtin_ia32_pcmpgtd(tmp_V2i, tmp_V2i);
  tmp_V8c = __builtin_ia32_pmaxub(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_pmaxsw(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_pminub(tmp_V8c, tmp_V8c);
  tmp_V4s = __builtin_ia32_pminsw(tmp_V4s, tmp_V4s);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 0);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 1);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 2);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 3);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 4);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 5);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 6);
  tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 7);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 0);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 1);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 2);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 3);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 4);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 5);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 6);
  tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 7);
  tmp_V2d = __builtin_ia32_minpd(tmp_V2d, tmp_V2d);
  tmp_V2d = __builtin_ia32_maxpd(tmp_V2d, tmp_V2d);
  tmp_V2d = __builtin_ia32_minsd(tmp_V2d, tmp_V2d);
  tmp_V2d = __builtin_ia32_maxsd(tmp_V2d, tmp_V2d);
  tmp_V16c = __builtin_ia32_paddsb128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_paddsw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_psubsb128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_psubsw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_paddusb128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_paddusw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_psubusb128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_psubusw128(tmp_V8s, tmp_V8s);
  tmp_V8s = __builtin_ia32_pmulhw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_pmaxub128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_pmaxsw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_pminub128(tmp_V16c, tmp_V16c);
  tmp_V8s = __builtin_ia32_pminsw128(tmp_V8s, tmp_V8s);
  tmp_V8s = __builtin_ia32_packsswb128(tmp_V8s, tmp_V8s);
  tmp_V4i = __builtin_ia32_packssdw128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_packuswb128(tmp_V8s, tmp_V8s);
  tmp_V8s = __builtin_ia32_pmulhuw128(tmp_V8s, tmp_V8s);
  tmp_V4f = __builtin_ia32_addsubps(tmp_V4f, tmp_V4f);
  tmp_V2d = __builtin_ia32_addsubpd(tmp_V2d, tmp_V2d);
  tmp_V4f = __builtin_ia32_haddps(tmp_V4f, tmp_V4f);
  tmp_V2d = __builtin_ia32_haddpd(tmp_V2d, tmp_V2d);
  tmp_V4f = __builtin_ia32_hsubps(tmp_V4f, tmp_V4f);
  tmp_V2d = __builtin_ia32_hsubpd(tmp_V2d, tmp_V2d);
  tmp_V8s = __builtin_ia32_phaddw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_phaddw(tmp_V4s, tmp_V4s);
  tmp_V4i = __builtin_ia32_phaddd128(tmp_V4i, tmp_V4i);
  tmp_V2i = __builtin_ia32_phaddd(tmp_V2i, tmp_V2i);
  tmp_V8s = __builtin_ia32_phaddsw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_phaddsw(tmp_V4s, tmp_V4s);
  tmp_V8s = __builtin_ia32_phsubw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_phsubw(tmp_V4s, tmp_V4s);
  tmp_V4i = __builtin_ia32_phsubd128(tmp_V4i, tmp_V4i);
  tmp_V2i = __builtin_ia32_phsubd(tmp_V2i, tmp_V2i);
  tmp_V8s = __builtin_ia32_phsubsw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_phsubsw(tmp_V4s, tmp_V4s);
  tmp_V16c = __builtin_ia32_pmaddubsw128(tmp_V16c, tmp_V16c);
  tmp_V8c = __builtin_ia32_pmaddubsw(tmp_V8c, tmp_V8c);
  tmp_V8s = __builtin_ia32_pmulhrsw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_pmulhrsw(tmp_V4s, tmp_V4s);
  tmp_V16c = __builtin_ia32_pshufb128(tmp_V16c, tmp_V16c);
  tmp_V8c = __builtin_ia32_pshufb(tmp_V8c, tmp_V8c);
  tmp_V16c = __builtin_ia32_psignb128(tmp_V16c, tmp_V16c);
  tmp_V8c = __builtin_ia32_psignb(tmp_V8c, tmp_V8c);
  tmp_V8s = __builtin_ia32_psignw128(tmp_V8s, tmp_V8s);
  tmp_V4s = __builtin_ia32_psignw(tmp_V4s, tmp_V4s);
  tmp_V4i = __builtin_ia32_psignd128(tmp_V4i, tmp_V4i);
  tmp_V2i = __builtin_ia32_psignd(tmp_V2i, tmp_V2i);
  tmp_V16c = __builtin_ia32_pabsb128(tmp_V16c);
  tmp_V8c = __builtin_ia32_pabsb(tmp_V8c);
  tmp_V8s = __builtin_ia32_pabsw128(tmp_V8s);
  tmp_V4s = __builtin_ia32_pabsw(tmp_V4s);
  tmp_V4i = __builtin_ia32_pabsd128(tmp_V4i);
  tmp_V2i = __builtin_ia32_pabsd(tmp_V2i);
  tmp_V4s = __builtin_ia32_psllw(tmp_V4s, tmp_V1LLi);
  tmp_V2i = __builtin_ia32_pslld(tmp_V2i, tmp_V1LLi);
  tmp_V1LLi = __builtin_ia32_psllq(tmp_V1LLi, tmp_V1LLi);
  tmp_V4s = __builtin_ia32_psrlw(tmp_V4s, tmp_V1LLi);
  tmp_V2i = __builtin_ia32_psrld(tmp_V2i, tmp_V1LLi);
  tmp_V1LLi = __builtin_ia32_psrlq(tmp_V1LLi, tmp_V1LLi);
  tmp_V4s = __builtin_ia32_psraw(tmp_V4s, tmp_V1LLi);
  tmp_V2i = __builtin_ia32_psrad(tmp_V2i, tmp_V1LLi);
  tmp_V2i = __builtin_ia32_pmaddwd(tmp_V4s, tmp_V4s);
  tmp_V8c = __builtin_ia32_packsswb(tmp_V4s, tmp_V4s);
  tmp_V4s = __builtin_ia32_packssdw(tmp_V2i, tmp_V2i);
  tmp_V8c = __builtin_ia32_packuswb(tmp_V4s, tmp_V4s);
  tmp_i = __builtin_ia32_vec_ext_v2si(tmp_V2i, 0);

  __builtin_ia32_incsspd(tmp_Ui);
  __builtin_ia32_incsspq(tmp_ULLi);
  tmp_Ui = __builtin_ia32_rdsspd(tmp_Ui);
  tmp_ULLi = __builtin_ia32_rdsspq(tmp_ULLi);
  __builtin_ia32_saveprevssp();
  __builtin_ia32_rstorssp(tmp_vp);
  __builtin_ia32_wrssd(tmp_Ui, tmp_vp);
  __builtin_ia32_wrssq(tmp_ULLi, tmp_vp);
  __builtin_ia32_wrussd(tmp_Ui, tmp_vp);
  __builtin_ia32_wrussq(tmp_ULLi, tmp_vp);
  __builtin_ia32_setssbsy();
  __builtin_ia32_clrssbsy(tmp_vp);

  (void) __builtin_ia32_ldmxcsr(tmp_Ui);
  (void) _mm_setcsr(tmp_Ui);
  tmp_Ui = __builtin_ia32_stmxcsr();
  tmp_Ui = _mm_getcsr();
  (void)__builtin_ia32_fxsave(tmp_vp);
  (void)__builtin_ia32_fxsave64(tmp_vp);
  (void)__builtin_ia32_fxrstor(tmp_vp);
  (void)__builtin_ia32_fxrstor64(tmp_vp);

  (void)__builtin_ia32_xsave(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsave64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xrstor(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xrstor64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsaveopt(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsaveopt64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xrstors(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xrstors64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsavec(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsavec64(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsaves(tmp_vp, tmp_ULLi);
  (void)__builtin_ia32_xsaves64(tmp_vp, tmp_ULLi);

  (void) __builtin_ia32_monitorx(tmp_vp, tmp_Ui, tmp_Ui);
  (void) __builtin_ia32_mwaitx(tmp_Ui, tmp_Ui, tmp_Ui);
  (void) __builtin_ia32_clzero(tmp_vp);
  (void) __builtin_ia32_cldemote(tmp_vp);

  tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i);
  tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f);
  tmp_i = __builtin_ia32_cvtss2si(tmp_V4f);
  tmp_i = __builtin_ia32_cvttss2si(tmp_V4f);

  tmp_i = __builtin_ia32_rdtsc();
  tmp_i = __rdtsc();
  tmp_i = __builtin_ia32_rdtscp(&tmp_Ui);
  tmp_LLi = __builtin_ia32_rdpmc(tmp_i);
  __builtin_ia32_wbnoinvd();
#ifdef USE_64
  tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f);
  tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f);
#endif
  tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f);
  (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp);
  (void) __builtin_ia32_storehps(tmp_V2ip, tmp_V4f);
  (void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f);
  tmp_i = __builtin_ia32_movmskps(tmp_V4f);
  tmp_i = __builtin_ia32_pmovmskb(tmp_V8c);
  (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi);
  (void) __builtin_ia32_sfence();
  (void) _mm_sfence();

  tmp_V4s = __builtin_ia32_psadbw(tmp_V8c, tmp_V8c);
  tmp_V4f = __builtin_ia32_rcpps(tmp_V4f);
  tmp_V4f = __builtin_ia32_rcpss(tmp_V4f);
  tmp_V4f = __builtin_ia32_rsqrtps(tmp_V4f);
  tmp_V4f = __builtin_ia32_rsqrtss(tmp_V4f);
  tmp_V4f = __builtin_ia32_sqrtps(tmp_V4f);
  tmp_V4f = __builtin_ia32_sqrtss(tmp_V4f);
  (void) __builtin_ia32_maskmovdqu(tmp_V16c, tmp_V16c, tmp_cp);
  tmp_i = __builtin_ia32_movmskpd(tmp_V2d);
  tmp_i = __builtin_ia32_pmovmskb128(tmp_V16c);
  (void) __builtin_ia32_movnti(tmp_ip, tmp_i);
#ifdef USE_64
  (void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi);
#endif
  tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c);
  tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
  tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
  tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d);
  tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d);
  tmp_V4f = __builtin_ia32_cvtpd2ps(tmp_V2d);
  tmp_V4i = __builtin_ia32_cvttpd2dq(tmp_V2d);
  tmp_V2i = __builtin_ia32_cvttpd2pi(tmp_V2d);
  tmp_V2d = __builtin_ia32_cvtpi2pd(tmp_V2i);
  tmp_i = __builtin_ia32_cvtsd2si(tmp_V2d);
  tmp_i = __builtin_ia32_cvttsd2si(tmp_V2d);
  tmp_V4f = __builtin_ia32_cvtsd2ss(tmp_V4f, tmp_V2d);
#ifdef USE_64
  tmp_LLi = __builtin_ia32_cvtsd2si64(tmp_V2d);
  tmp_LLi = __builtin_ia32_cvttsd2si64(tmp_V2d);
#endif
  tmp_V4i = __builtin_ia32_cvtps2dq(tmp_V4f);
  tmp_V4i = __builtin_ia32_cvttps2dq(tmp_V4f);
  (void) __builtin_ia32_clflush(tmp_vCp);
  (void) _mm_clflush(tmp_vCp);
  (void) __builtin_ia32_lfence();
  (void) _mm_lfence();
  (void) __builtin_ia32_mfence();
  (void) _mm_mfence();
  (void) __builtin_ia32_pause();
  (void) _mm_pause();
  tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i);
  tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i);
  tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i);
  tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, tmp_i);
  tmp_V2i = __builtin_ia32_psradi(tmp_V2i, tmp_i);
  tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, tmp_i);
  tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, tmp_i);
  tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, tmp_i);
  tmp_V1LLi = __builtin_ia32_pmuludq(tmp_V2i, tmp_V2i);
  tmp_V2LLi = __builtin_ia32_pmuludq128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_psraw128(tmp_V8s, tmp_V8s);
  tmp_V4i = __builtin_ia32_psrad128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_psrlw128(tmp_V8s, tmp_V8s);
  tmp_V4i = __builtin_ia32_psrld128(tmp_V4i, tmp_V4i);
  tmp_V2LLi = __builtin_ia32_psrlq128(tmp_V2LLi, tmp_V2LLi);
  tmp_V8s = __builtin_ia32_psllw128(tmp_V8s, tmp_V8s);
  tmp_V4i = __builtin_ia32_pslld128(tmp_V4i, tmp_V4i);
  tmp_V2LLi = __builtin_ia32_psllq128(tmp_V2LLi, tmp_V2LLi);
  tmp_V8s = __builtin_ia32_psllwi128(tmp_V8s, tmp_i);
  tmp_V4i = __builtin_ia32_pslldi128(tmp_V4i, tmp_i);
  tmp_V2LLi = __builtin_ia32_psllqi128(tmp_V2LLi, tmp_i);
  tmp_V8s = __builtin_ia32_psrlwi128(tmp_V8s, tmp_i);
  tmp_V4i = __builtin_ia32_psrldi128(tmp_V4i, tmp_i);
  tmp_V2LLi = __builtin_ia32_psrlqi128(tmp_V2LLi, tmp_i);
  tmp_V8s = __builtin_ia32_psrawi128(tmp_V8s, tmp_i);
  tmp_V4i = __builtin_ia32_psradi128(tmp_V4i, tmp_i);
  tmp_V8s = __builtin_ia32_pmaddwd128(tmp_V8s, tmp_V8s);
  (void) __builtin_ia32_monitor(tmp_vp, tmp_Ui, tmp_Ui);
  (void) __builtin_ia32_mwait(tmp_Ui, tmp_Ui);
  tmp_V16c = __builtin_ia32_lddqu(tmp_cCp);
  tmp_V2LLi = __builtin_ia32_palignr128(tmp_V2LLi, tmp_V2LLi, imm_i);
  tmp_V1LLi = __builtin_ia32_palignr(tmp_V1LLi, tmp_V1LLi, imm_i);
#ifdef USE_SSE4
  tmp_V16c = __builtin_ia32_pblendvb128(tmp_V16c, tmp_V16c, tmp_V16c);
  tmp_V2d = __builtin_ia32_blendvpd(tmp_V2d, tmp_V2d, tmp_V2d);
  tmp_V4f = __builtin_ia32_blendvps(tmp_V4f, tmp_V4f, tmp_V4f);
  tmp_V8s = __builtin_ia32_packusdw128(tmp_V4i, tmp_V4i);
  tmp_V16c = __builtin_ia32_pmaxsb128(tmp_V16c, tmp_V16c);
  tmp_V4i = __builtin_ia32_pmaxsd128(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_pmaxud128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_pmaxuw128(tmp_V8s, tmp_V8s);
  tmp_V16c = __builtin_ia32_pminsb128(tmp_V16c, tmp_V16c);
  tmp_V4i = __builtin_ia32_pminsd128(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_pminud128(tmp_V4i, tmp_V4i);
  tmp_V8s = __builtin_ia32_pminuw128(tmp_V8s, tmp_V8s);
  tmp_V2LLi = __builtin_ia32_pmuldq128(tmp_V4i, tmp_V4i);
  tmp_V4f = __builtin_ia32_roundps(tmp_V4f, imm_i_0_16);
  tmp_V4f = __builtin_ia32_roundss(tmp_V4f, tmp_V4f, imm_i_0_16);
  tmp_V2d = __builtin_ia32_roundsd(tmp_V2d, tmp_V2d, imm_i_0_16);
  tmp_V2d = __builtin_ia32_roundpd(tmp_V2d, imm_i_0_16);
  tmp_V4f = __builtin_ia32_insertps128(tmp_V4f, tmp_V4f, imm_i_0_256);
#endif

  tmp_V4d = __builtin_ia32_addsubpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_addsubps256(tmp_V8f, tmp_V8f);
  tmp_V4d = __builtin_ia32_haddpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_hsubps256(tmp_V8f, tmp_V8f);
  tmp_V4d = __builtin_ia32_hsubpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_haddps256(tmp_V8f, tmp_V8f);
  tmp_V4d = __builtin_ia32_maxpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_maxps256(tmp_V8f, tmp_V8f);
  tmp_V4d = __builtin_ia32_minpd256(tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_minps256(tmp_V8f, tmp_V8f);
  tmp_V2d = __builtin_ia32_vpermilvarpd(tmp_V2d, tmp_V2LLi);
  tmp_V4f = __builtin_ia32_vpermilvarps(tmp_V4f, tmp_V4i);
  tmp_V4d = __builtin_ia32_vpermilvarpd256(tmp_V4d, tmp_V4LLi);
  tmp_V8f = __builtin_ia32_vpermilvarps256(tmp_V8f, tmp_V8i);
  tmp_V4d = __builtin_ia32_blendvpd256(tmp_V4d, tmp_V4d, tmp_V4d);
  tmp_V8f = __builtin_ia32_blendvps256(tmp_V8f, tmp_V8f, tmp_V8f);
  tmp_V8f = __builtin_ia32_dpps256(tmp_V8f, tmp_V8f, 0x7);
  tmp_V4d = __builtin_ia32_cmppd256(tmp_V4d, tmp_V4d, 0);
  tmp_V8f = __builtin_ia32_cmpps256(tmp_V8f, tmp_V8f, 0);
  tmp_V4f = __builtin_ia32_cvtpd2ps256(tmp_V4d);
  tmp_V8i = __builtin_ia32_cvtps2dq256(tmp_V8f);
  tmp_V4i = __builtin_ia32_cvttpd2dq256(tmp_V4d);
  tmp_V4i = __builtin_ia32_cvtpd2dq256(tmp_V4d);
  tmp_V8i = __builtin_ia32_cvttps2dq256(tmp_V8f);
  tmp_V4d = __builtin_ia32_vperm2f128_pd256(tmp_V4d, tmp_V4d, 0x7);
  tmp_V8f = __builtin_ia32_vperm2f128_ps256(tmp_V8f, tmp_V8f, 0x7);
  tmp_V8i = __builtin_ia32_vperm2f128_si256(tmp_V8i, tmp_V8i, 0x7);
  tmp_V4d = __builtin_ia32_sqrtpd256(tmp_V4d);
  tmp_V8f = __builtin_ia32_sqrtps256(tmp_V8f);
  tmp_V8f = __builtin_ia32_rsqrtps256(tmp_V8f);
  tmp_V8f = __builtin_ia32_rcpps256(tmp_V8f);
  tmp_V4d = __builtin_ia32_roundpd256(tmp_V4d, 0x1);
  tmp_V8f = __builtin_ia32_roundps256(tmp_V8f, 0x1);
  tmp_i = __builtin_ia32_vtestzpd(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_vtestcpd(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_vtestnzcpd(tmp_V2d, tmp_V2d);
  tmp_i = __builtin_ia32_vtestzps(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_vtestcps(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_vtestnzcps(tmp_V4f, tmp_V4f);
  tmp_i = __builtin_ia32_vtestzpd256(tmp_V4d, tmp_V4d);
  tmp_i = __builtin_ia32_vtestcpd256(tmp_V4d, tmp_V4d);
  tmp_i = __builtin_ia32_vtestnzcpd256(tmp_V4d, tmp_V4d);
  tmp_i = __builtin_ia32_vtestzps256(tmp_V8f, tmp_V8f);
  tmp_i = __builtin_ia32_vtestcps256(tmp_V8f, tmp_V8f);
  tmp_i = __builtin_ia32_vtestnzcps256(tmp_V8f, tmp_V8f);
  tmp_i = __builtin_ia32_ptestz256(tmp_V4LLi, tmp_V4LLi);
  tmp_i = __builtin_ia32_ptestc256(tmp_V4LLi, tmp_V4LLi);
  tmp_i = __builtin_ia32_ptestnzc256(tmp_V4LLi, tmp_V4LLi);
  tmp_i = __builtin_ia32_movmskpd256(tmp_V4d);
  tmp_i = __builtin_ia32_movmskps256(tmp_V8f);
  __builtin_ia32_vzeroall();
  __builtin_ia32_vzeroupper();
  tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp);
  tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi);
  tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i);
  tmp_V4d = __builtin_ia32_maskloadpd256(tmp_V4dCp, tmp_V4LLi);
  tmp_V8f = __builtin_ia32_maskloadps256(tmp_V8fCp, tmp_V8i);
  __builtin_ia32_maskstorepd(tmp_V2dp, tmp_V2LLi, tmp_V2d);
  __builtin_ia32_maskstoreps(tmp_V4fp, tmp_V4i, tmp_V4f);
  __builtin_ia32_maskstorepd256(tmp_V4dp, tmp_V4LLi, tmp_V4d);
  __builtin_ia32_maskstoreps256(tmp_V8fp, tmp_V8i, tmp_V8f);

#ifdef USE_3DNOW
  tmp_V8c = __builtin_ia32_pavgusb(tmp_V8c, tmp_V8c);
  tmp_V2i = __builtin_ia32_pf2id(tmp_V2f);
  tmp_V2f = __builtin_ia32_pfacc(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfadd(tmp_V2f, tmp_V2f);
  tmp_V2i = __builtin_ia32_pfcmpeq(tmp_V2f, tmp_V2f);
  tmp_V2i = __builtin_ia32_pfcmpge(tmp_V2f, tmp_V2f);
  tmp_V2i = __builtin_ia32_pfcmpgt(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfmax(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfmin(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfmul(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrcp(tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrcpit1(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrcpit2(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrsqrt(tmp_V2f);
  tmp_V2f = __builtin_ia32_pfrsqit1(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfsub(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfsubr(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pi2fd(tmp_V2i);
  tmp_V4s = __builtin_ia32_pmulhrw(tmp_V4s, tmp_V4s);
  tmp_V2i = __builtin_ia32_pf2iw(tmp_V2f);
  tmp_V2f = __builtin_ia32_pfnacc(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pfpnacc(tmp_V2f, tmp_V2f);
  tmp_V2f = __builtin_ia32_pi2fw(tmp_V2i);
  tmp_V2f = __builtin_ia32_pswapdsf(tmp_V2f);
  tmp_V2i = __builtin_ia32_pswapdsi(tmp_V2i);

  tmp_V4i = __builtin_ia32_sha1rnds4(tmp_V4i, tmp_V4i, imm_i_0_4);
  tmp_V4i = __builtin_ia32_sha1nexte(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha1msg1(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha1msg2(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha256rnds2(tmp_V4i, tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha256msg1(tmp_V4i, tmp_V4i);
  tmp_V4i = __builtin_ia32_sha256msg2(tmp_V4i, tmp_V4i);
#endif
}
Example #26
error_code sys_lwmutex_lock(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex, u64 timeout)
{
	sysPrxForUser.trace("sys_lwmutex_lock(lwmutex=*0x%x, timeout=0x%llx)", lwmutex, timeout);

	if (g_cfg.core.hle_lwmutex)
	{
		return sys_mutex_lock(ppu, lwmutex->sleep_queue, timeout);
	}

	const be_t<u32> tid(ppu.id);

	// try to lock lightweight mutex
	const be_t<u32> old_owner = lwmutex->vars.owner.compare_and_swap(lwmutex_free, tid);

	if (old_owner == lwmutex_free)
	{
		// locking succeeded
		return CELL_OK;
	}

	if (old_owner == tid)
	{
		// recursive locking

		if ((lwmutex->attribute & SYS_SYNC_RECURSIVE) == 0)
		{
			// if not recursive
			return CELL_EDEADLK;
		}

		if (lwmutex->recursive_count == -1)
		{
			// if recursion limit reached
			return CELL_EKRESOURCE;
		}

		// recursive locking succeeded
		lwmutex->recursive_count++;
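		// fence after bumping the recursion count (ordering presumably expected by guest code)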
		_mm_mfence();

		return CELL_OK;
	}

	if (old_owner == lwmutex_dead)
	{
		// invalid or deleted mutex
		return CELL_EINVAL;
	}

	for (u32 i = 0; i < 10; i++)
	{
		busy_wait();

		if (lwmutex->vars.owner.load() == lwmutex_free)
		{
			if (lwmutex->vars.owner.compare_and_swap_test(lwmutex_free, tid))
			{
				// locking succeeded
				return CELL_OK;
			}
		}
	}

	// atomically increment waiter value using 64 bit op
	lwmutex->all_info++;

	if (lwmutex->vars.owner.compare_and_swap_test(lwmutex_free, tid))
	{
		// locking succeeded
		--lwmutex->all_info;

		return CELL_OK;
	}

	// lock using the syscall
	const error_code res = _sys_lwmutex_lock(ppu, lwmutex->sleep_queue, timeout);

	lwmutex->all_info--;

	if (res == CELL_OK)
	{
		// locking succeeded
		auto old = lwmutex->vars.owner.exchange(tid);

		if (old != lwmutex_reserved)
		{
			fmt::throw_exception("Locking failed (lwmutex=*0x%x, owner=0x%x)" HERE, lwmutex, old);
		}

		return CELL_OK;
	}

	if (res == CELL_EBUSY && lwmutex->attribute & SYS_SYNC_RETRY)
	{
		while (true)
		{
			for (u32 i = 0; i < 10; i++)
			{
				busy_wait();

				if (lwmutex->vars.owner.load() == lwmutex_free)
				{
					if (lwmutex->vars.owner.compare_and_swap_test(lwmutex_free, tid))
					{
						return CELL_OK;
					}
				}
			}

			lwmutex->all_info++;

			if (lwmutex->vars.owner.compare_and_swap_test(lwmutex_free, tid))
			{
				lwmutex->all_info--;
				return CELL_OK;
			}

			const u64 time0 = timeout ? get_system_time() : 0;

			const error_code res_ = _sys_lwmutex_lock(ppu, lwmutex->sleep_queue, timeout);

			if (res_ == CELL_OK)
			{
				lwmutex->vars.owner.release(tid);
			}
			else if (timeout && res_ != CELL_ETIMEDOUT)
			{
				const u64 time_diff = get_system_time() - time0;

				if (timeout <= time_diff)
				{
					lwmutex->all_info--;
					return not_an_error(CELL_ETIMEDOUT);
				}

				timeout -= time_diff;
			}

			lwmutex->all_info--;

			if (res_ != CELL_EBUSY)
			{
				return res_;
			}
		}
	}

	return res;
}
Example #27
error_code sys_lwmutex_trylock(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex)
{
	sysPrxForUser.trace("sys_lwmutex_trylock(lwmutex=*0x%x)", lwmutex);

	if (g_cfg.core.hle_lwmutex)
	{
		return sys_mutex_trylock(ppu, lwmutex->sleep_queue);
	}

	const be_t<u32> tid(ppu.id);

	// try to lock lightweight mutex
	const be_t<u32> old_owner = lwmutex->vars.owner.compare_and_swap(lwmutex_free, tid);

	if (old_owner == lwmutex_free)
	{
		// locking succeeded
		return CELL_OK;
	}

	if (old_owner == tid)
	{
		// recursive locking

		if ((lwmutex->attribute & SYS_SYNC_RECURSIVE) == 0)
		{
			// if not recursive
			return CELL_EDEADLK;
		}

		if (lwmutex->recursive_count == -1)
		{
			// if recursion limit reached
			return CELL_EKRESOURCE;
		}

		// recursive locking succeeded
		lwmutex->recursive_count++;
		_mm_mfence();

		return CELL_OK;
	}

	if (old_owner == lwmutex_dead)
	{
		// invalid or deleted mutex
		return CELL_EINVAL;
	}

	if (old_owner == lwmutex_reserved)
	{
		// should be locked by the syscall
		const error_code res = _sys_lwmutex_trylock(lwmutex->sleep_queue);

		if (res == CELL_OK)
		{
			// locking succeeded
			auto old = lwmutex->vars.owner.exchange(tid);

			if (old != lwmutex_reserved)
			{
				fmt::throw_exception("Locking failed (lwmutex=*0x%x, owner=0x%x)" HERE, lwmutex, old);
			}
		}

		return res;
	}

	// locked by another thread
	return not_an_error(CELL_EBUSY);
}
Example #28
bool spu_thread::write_reg(const u32 addr, const u32 value)
{
	auto try_start = [this]()
	{
		if (status.atomic_op([](u32& status)
		{
			if (status & SPU_STATUS_RUNNING)
			{
				return false;
			}

			status = SPU_STATUS_RUNNING;
			return true;
		}))
		{
			state -= cpu_flag::stop;
			thread_ctrl::notify(static_cast<named_thread<spu_thread>&>(*this));
		}
	};

	const u32 offset = addr - RAW_SPU_BASE_ADDR - index * RAW_SPU_OFFSET - RAW_SPU_PROB_OFFSET;

	switch (offset)
	{
	case MFC_LSA_offs:
	{
		if (value >= 0x40000)
		{
			break;
		}

		g_tls_mfc[index].lsa = value;
		return true;
	}

	case MFC_EAH_offs:
	{
		g_tls_mfc[index].eah = value;
		return true;
	}

	case MFC_EAL_offs:
	{
		g_tls_mfc[index].eal = value;
		return true;
	}

	case MFC_Size_Tag_offs:
	{
		g_tls_mfc[index].tag = value & 0x1f;
		g_tls_mfc[index].size = (value >> 16) & 0x7fff;
		return true;
	}

	case MFC_Class_CMD_offs:
	{
		g_tls_mfc[index].cmd = MFC(value & 0xff);

		switch (value & 0xff)
		{
		case MFC_SNDSIG_CMD:
		case MFC_SNDSIGB_CMD:
		case MFC_SNDSIGF_CMD:
		{
			g_tls_mfc[index].size = 4;
			// Fallthrough
		}
		case MFC_PUT_CMD:
		case MFC_PUTB_CMD:
		case MFC_PUTF_CMD:
		case MFC_PUTS_CMD:
		case MFC_PUTBS_CMD:
		case MFC_PUTFS_CMD:
		case MFC_GET_CMD:
		case MFC_GETB_CMD:
		case MFC_GETF_CMD:
		case MFC_GETS_CMD:
		case MFC_GETBS_CMD:
		case MFC_GETFS_CMD:
		{
			if (g_tls_mfc[index].size)
			{
				// Perform transfer immediately
				do_dma_transfer(g_tls_mfc[index]);
			}

			// .cmd should be zero, which is equal to MFC_PPU_DMA_CMD_ENQUEUE_SUCCESSFUL
			g_tls_mfc[index] = {};

			if (value & MFC_START_MASK)
			{
				try_start();
			}

			return true;
		}
		case MFC_BARRIER_CMD:
		case MFC_EIEIO_CMD:
		case MFC_SYNC_CMD:
		{
			g_tls_mfc[index] = {};
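			// MFC barrier/eieio/sync commands are modelled with a host full memory fence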
			_mm_mfence();
			return true;
		}
		}

		break;
	}

	case Prxy_QueryType_offs:
	{
		// TODO
		// 0 - no query requested; cancel previous request
		// 1 - set (interrupt) status upon completion of any enabled tag groups
		// 2 - set (interrupt) status upon completion of all enabled tag groups

		if (value > 2)
		{
			break;
		}

		if (value)
		{
			int_ctrl[2].set(SPU_INT2_STAT_DMA_TAG_GROUP_COMPLETION_INT); // TODO
		}

		return true;
	}

	case Prxy_QueryMask_offs:
	{
		mfc_prxy_mask = value;
		return true;
	}

	case SPU_In_MBox_offs:
	{
		ch_in_mbox.push(*this, value);
		return true;
	}

	case SPU_RunCntl_offs:
	{
		if (value == SPU_RUNCNTL_RUN_REQUEST)
		{
			try_start();
		}
		else if (value == SPU_RUNCNTL_STOP_REQUEST)
		{
			status &= ~SPU_STATUS_RUNNING;
			state += cpu_flag::stop;
		}
		else
		{
			break;
		}

		run_ctrl = value;
		return true;
	}

	case SPU_NPC_offs:
	{
		if ((value & 2) || value >= 0x40000)
		{
			break;
		}

		npc = value;
		return true;
	}

	case SPU_RdSigNotify1_offs:
	{
		push_snr(0, value);
		return true;
	}

	case SPU_RdSigNotify2_offs:
	{
		push_snr(1, value);
		return true;
	}
	}

	LOG_ERROR(SPU, "RawSPUThread[%d]: Write32(0x%x, value=0x%x): unknown/illegal offset (0x%x)", index, addr, value, offset);
	return false;
}
Example #29
void spu_interpreter::DSYNC(SPUThread& CPU, spu_opcode_t op)
{
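	// SPU DSYNC is modelled with a host full memory fence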
	_mm_mfence();
}
Example #30
void CopyGPUFrame_SSE4_1(void *pSrc, void *pDest, void *pCacheBlock, UINT width, UINT height, UINT pitch)
{
#if QTAV_HAVE(SSE4_1)
    //assert(((intptr_t)pCacheBlock & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
    __m128i		x0, x1, x2, x3;
    __m128i		*pLoad;
    __m128i		*pStore;
    __m128i		*pCache;
    UINT		x, y, yLoad, yStore;
    UINT		rowsPerBlock;
    UINT		width64;
    UINT		extraPitch;

    rowsPerBlock = CACHED_BUFFER_SIZE / pitch;
    width64 = (width + 63) & ~0x03f;
    extraPitch = (pitch - width64) / 16;

    pLoad  = (__m128i *)pSrc;
    pStore = (__m128i *)pDest;

    const bool src_unaligned = ((intptr_t)pSrc) & 0x0f;
    const bool dst_unaligned = ((intptr_t)pDest & 0x0f);
    //if (src_unaligned || dst_unaligned)
      //  qDebug("===========unaligned: src %d, dst: %d,  extraPitch: %d", src_unaligned, dst_unaligned, extraPitch);
    //  COPY THROUGH 4KB CACHED BUFFER
    for (y = 0; y < height; y += rowsPerBlock) {
        //  ROWS LEFT TO COPY AT END
        if (y + rowsPerBlock > height)
            rowsPerBlock = height - y;

        pCache = (__m128i *)pCacheBlock;

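        // fence between the previous block's streaming stores and the streaming loads below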
        _mm_mfence();

        // LOAD ROWS OF PITCH WIDTH INTO CACHED BLOCK
        for (yLoad = 0; yLoad < rowsPerBlock; yLoad++) {
            // COPY A ROW, CACHE LINE AT A TIME
            for (x = 0; x < pitch; x +=64) {
                // movntdqa
                x0 = _mm_stream_load_si128(pLoad + 0);
                x1 = _mm_stream_load_si128(pLoad + 1);
                x2 = _mm_stream_load_si128(pLoad + 2);
                x3 = _mm_stream_load_si128(pLoad + 3);

                if (src_unaligned) {
                    // movdqu
                    _mm_storeu_si128(pCache +0, x0);
                    _mm_storeu_si128(pCache +1, x1);
                    _mm_storeu_si128(pCache +2, x2);
                    _mm_storeu_si128(pCache +3, x3);
                } else {
                    // movdqa
                    _mm_store_si128(pCache +0, x0);
                    _mm_store_si128(pCache +1, x1);
                    _mm_store_si128(pCache +2, x2);
                    _mm_store_si128(pCache +3, x3);
                }
                pCache += 4;
                pLoad += 4;
            }
        }

        _mm_mfence();

        pCache = (__m128i *)pCacheBlock;
        // STORE ROWS OF FRAME WIDTH FROM CACHED BLOCK
        for (yStore = 0; yStore < rowsPerBlock; yStore++) {
            // copy a row, cache line at a time
            for (x = 0; x < width64; x += 64) {
                // movdqa
                x0 = _mm_load_si128(pCache);
                x1 = _mm_load_si128(pCache + 1);
                x2 = _mm_load_si128(pCache + 2);
                x3 = _mm_load_si128(pCache + 3);

                if (dst_unaligned) {
                    // movdqu
                    _mm_storeu_si128(pStore,	x0);
                    _mm_storeu_si128(pStore + 1, x1);
                    _mm_storeu_si128(pStore + 2, x2);
                    _mm_storeu_si128(pStore + 3, x3);
                } else {
                    // movntdq
                    _mm_stream_si128(pStore,	x0);
                    _mm_stream_si128(pStore + 1, x1);
                    _mm_stream_si128(pStore + 2, x2);
                    _mm_stream_si128(pStore + 3, x3);
                }
                pCache += 4;
                pStore += 4;
            }
            pCache += extraPitch;
            pStore += extraPitch;
        }
    }
#else
    Q_UNUSED(pSrc);
    Q_UNUSED(pDest);
    Q_UNUSED(pCacheBlock);
    Q_UNUSED(width);
    Q_UNUSED(height);
    Q_UNUSED(pitch);
#endif //QTAV_HAVE(SSE4_1)
}