VOID
MemUWriteCachelines (
  IN       UINT32 Address,
  IN       UINT8 Pattern[],
  IN       UINT16 ClCount
  )
{
  UINTN Index;
  CHAR8 *Position;
  __m128i *Src = (void *) Pattern;
  __m128i *Dest = (void *) (size_t) Address;

  Position = (void *) Pattern;
  // ssd - important: without this, the src data may get evicted from cache
  _mm_mfence ();
  for (Index = 0; Index < ClCount * 4; Index++) {
    _mm_stream_si128_fs (Dest, Src);
    Src++;
    Dest++;
  }
  // ssd - might not be required, but no measurable boot time impact
  _mm_mfence ();
}
__inline HRESULT
__WaitNewSignals(
    IN  PRX_BLOCK pScanPoint,
    IN  USHORT    uRetries,
    IN  ULONG     VStreamMask,
    OUT FLAG      *fReachEnd)
{
    ULONG uSpinCount = uRetries * 2;
    *fReachEnd = 0;
    while (uSpinCount != 0)
    {
        if (SORA_C_RXBUF_IS_VALID_EX(pScanPoint, VStreamMask))
        {
            return S_OK;
        }
        else
        {
            _mm_clflush(pScanPoint);
            _mm_mfence();
            *fReachEnd = 1;
        }
        uSpinCount--;
    }
    return E_FETCH_SIGNAL_HW_TIMEOUT;
}
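The routine above polls a DMA ring buffer written by hardware and explicitly evicts the stale cache line between polls. A minimal sketch of the same idiom, with hypothetical names (poll_device_word, status, expected are not from the Sora sources):

#include <emmintrin.h> // _mm_clflush, _mm_mfence
#include <cstdint>

// Sketch only: a device writes a status word into host memory; the CPU drops
// its cached copy between polls so the next read observes the device's update.
static bool poll_device_word(volatile uint32_t* status, uint32_t expected, unsigned spins)
{
    while (spins--)
    {
        if (*status == expected)
            return true;
        _mm_clflush((const void*)status); // evict the possibly stale line
        _mm_mfence();                     // order the flush against the next read
    }
    return false;
}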
//////////////////////////////////////////////////////////////////////////
/// @brief Called when FE work is complete for this DC.
INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE)
    {
        SWR_STATS_FE& stats = pDC->dynState.statsFE;

        AR_EVENT(FrontendStatsEvent(pDC->drawId,
            stats.IaVertices, stats.IaPrimitives, stats.VsInvocations, stats.HsInvocations,
            stats.DsInvocations, stats.GsInvocations, stats.GsPrimitives, stats.CInvocations,
            stats.CPrimitives,
            stats.SoPrimStorageNeeded[0], stats.SoPrimStorageNeeded[1],
            stats.SoPrimStorageNeeded[2], stats.SoPrimStorageNeeded[3],
            stats.SoNumPrimsWritten[0], stats.SoNumPrimsWritten[1],
            stats.SoNumPrimsWritten[2], stats.SoNumPrimsWritten[3]));
        AR_EVENT(FrontendDrawEndEvent(pDC->drawId));

        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
    }

    if (pContext->pfnUpdateSoWriteOffset)
    {
        for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)
        {
            if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
                (pDC->pState->state.soBuffer[i].soWriteEnable))
            {
                pContext->pfnUpdateSoWriteOffset(GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
            }
        }
    }

    // Ensure all streaming writes are globally visible before marking this FE done
    _mm_mfence();
    pDC->doneFE = true;

    InterlockedDecrement(&pContext->drawsOutstandingFE);
}
void epub_sys_cache_flush(void* start, long len)
{
#if EPUB_CPU(X86) || EPUB_CPU(X64)
    if (start == nullptr || len <= 0)
        return;

    unsigned char* p = reinterpret_cast<unsigned char*>(start);

    // ensure all reads/writes complete before the next instruction
    _mm_mfence();

    // ensure the last line is flushed
    _mm_clflush(p + (len - 1));

    // flush all cache lines. lines are 64 bytes on both architectures
    while (len > 0)
    {
        _mm_clflush(p);
        p += 64;
        len -= 64;
    }
#elif EPUB_CPU(ARM)
    // no idea...
#endif
}
VOID
MemUMFenceInstr (
  VOID
  )
{
  _mm_mfence ();
}
// NOTE: the original snippet preserved only the non-DEBUG_CHECKSUM prototype plus a
// dangling #endif; the guarded prototype (and its Checksum argument) is reconstructed
// here from the DEBUG_CHECKSUM register write in the body.
#ifdef DEBUG_CHECKSUM
VOID __SoraHwTransferUnsafeNoWait(
    IN PTRANSFER_OBJ pTransferObj,
    IN PTX_DESC pTxDesc,
    IN ULONG Checksum)
#else
VOID __SoraHwTransferUnsafeNoWait(
    IN PTRANSFER_OBJ pTransferObj,
    IN PTX_DESC pTxDesc)
#endif
{
    HRESULT hr = S_OK;
    __REG32_TRANS_CTRL TransCtrl;
    __PSORA_RADIO_REGS pRegs = pTransferObj->TransferReg;

    pTxDesc->__FrameCtrlOwn = 1; // software own the buffer

    // make sure modulation buffer is flushed into memory.
    _mm_mfence();

    TransCtrl.Value = 0;
    TransCtrl.Bits.TransferInit = 1;

#ifdef DEBUG_CHECKSUM
    WRITE_REGISTER_ULONG( (PULONG)&pRegs->TransferChecksum, Checksum);
#endif

    WRITE_REGISTER_ULONG( (PULONG)&pRegs->TransferSrcAddrL, pTxDesc->ThisPa.u.LowPart);
    WRITE_REGISTER_ULONG( (PULONG)&pRegs->TransferSrcAddrH, pTxDesc->ThisPa.u.HighPart);
    WRITE_REGISTER_ULONG( (PULONG)&pRegs->TransferControl, TransCtrl.Value);
}
void Lerp_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
    static const u32 PSD = 64;

    static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
    static const __m128i round = _mm_set1_epi16(128);

    assert(source1 != NULL && source2 != NULL && dest != NULL);
    assert(size % STRIDE == 0);
    assert(alpha >= 0.0 && alpha <= 1.0);

    const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
    const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i s = _mm_setzero_si128();
    __m128i d = _mm_setzero_si128();
    const __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));

    __m128i drb, dga, srb, sga;

    for (size_t k = 0, length = size/STRIDE; k < length; ++k)
    {
        _mm_prefetch(reinterpret_cast<const char*>(source128_1 + PSD), _MM_HINT_NTA);
        _mm_prefetch(reinterpret_cast<const char*>(source128_2 + PSD), _MM_HINT_NTA);
        // TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N)
        // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

        for (int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
        {
            // r = d + (s-d)*alpha/256
            s = _mm_load_si128(source128_1);     // AABBGGRR
            d = _mm_load_si128(source128_2);     // AABBGGRR

            srb = _mm_and_si128(lomask, s);      // 00BB00RR // unpack
            sga = _mm_srli_epi16(s, 8);          // AA00GG00 // unpack

            drb = _mm_and_si128(lomask, d);      // 00BB00RR // unpack
            dga = _mm_srli_epi16(d, 8);          // AA00GG00 // unpack

            srb = _mm_sub_epi16(srb, drb);       // BBBBRRRR // sub
            srb = _mm_mullo_epi16(srb, a);       // BBBBRRRR // mul
            srb = _mm_add_epi16(srb, round);

            sga = _mm_sub_epi16(sga, dga);       // AAAAGGGG // sub
            sga = _mm_mullo_epi16(sga, a);       // AAAAGGGG // mul
            sga = _mm_add_epi16(sga, round);

            srb = _mm_srli_epi16(srb, 8);        // 00BB00RR // prepack and div
            sga = _mm_andnot_si128(lomask, sga); // AA00GG00 // prepack and div

            srb = _mm_or_si128(srb, sga);        // AABBGGRR // pack

            srb = _mm_add_epi8(srb, d);          // AABBGGRR // add there is no overflow(R.N)

            _mm_stream_si128(dest128, srb);
        }
    }
    _mm_mfence(); // ensure last WC buffers get flushed to memory
}
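Lerp_SSE2 ends with _mm_mfence() because _mm_stream_si128 performs weakly ordered, write-combining stores. A stripped-down sketch of that store-then-publish pattern (the function name and parameters here are illustrative, not from the original source):

#include <emmintrin.h> // SSE2: _mm_load_si128, _mm_stream_si128, _mm_mfence
#include <cstddef>

// Sketch only: copy 16-byte-aligned blocks with non-temporal stores, then
// drain the write-combining buffers before the data is handed to a consumer.
static void stream_copy_and_publish(__m128i* dst, const __m128i* src, std::size_t n128)
{
    for (std::size_t i = 0; i < n128; ++i)
    {
        _mm_stream_si128(dst + i, _mm_load_si128(src + i)); // movntdq bypasses the cache
    }
    _mm_mfence(); // make the streamed stores globally visible
}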
int sys_lwcond_wait(mem_ptr_t<sys_lwcond_t> lwcond, u64 timeout) { sys_lwcond.Log("sys_lwcond_wait(lwcond_addr=0x%x, timeout=%lld)", lwcond.GetAddr(), timeout); if (!lwcond.IsGood()) { return CELL_EFAULT; } SleepQueue* sq; if (!Emu.GetIdManager().GetIDData((u32)lwcond->lwcond_queue, sq)) { return CELL_ESRCH; } mem_ptr_t<sys_lwmutex_t> mutex(lwcond->lwmutex); u32 tid_le = GetCurrentPPUThread().GetId(); be_t<u32> tid = tid_le; if (mutex->mutex.owner.GetOwner() != tid) { return CELL_EPERM; // caller must own this lwmutex } sq->push(tid_le); mutex->recursive_count = 0; mutex->mutex.owner.unlock(tid); u32 counter = 0; const u32 max_counter = timeout ? (timeout / 1000) : ~0; while (true) { /* switch (mutex->trylock(tid)) { case SMR_OK: mutex->unlock(tid); break; case SMR_SIGNAL: return CELL_OK; } */ if (mutex->mutex.owner.GetOwner() == tid) { _mm_mfence(); mutex->recursive_count = 1; return CELL_OK; } Sleep(1); if (counter++ > max_counter) { sq->invalidate(tid_le); return CELL_ETIMEDOUT; } if (Emu.IsStopped()) { ConLog.Warning("sys_lwcond_wait(sq=%d) aborted", (u32)lwcond->lwcond_queue); return CELL_OK; } } }
void test_mm_mfence() {
  // DAG-LABEL: test_mm_mfence
  // DAG: call void @llvm.x86.sse2.mfence()
  //
  // ASM-LABEL: test_mm_mfence
  // ASM: mfence
  _mm_mfence();
}
void SPUThread::do_dma_transfer(u32 cmd, spu_mfc_arg_t args) { if (cmd & (MFC_BARRIER_MASK | MFC_FENCE_MASK)) { _mm_mfence(); } u32 eal = VM_CAST(args.ea); if (eal >= SYS_SPU_THREAD_BASE_LOW && m_type == CPU_THREAD_SPU) // SPU Thread Group MMIO (LS and SNR) { const u32 index = (eal - SYS_SPU_THREAD_BASE_LOW) / SYS_SPU_THREAD_OFFSET; // thread number in group const u32 offset = (eal - SYS_SPU_THREAD_BASE_LOW) % SYS_SPU_THREAD_OFFSET; // LS offset or MMIO register const auto group = tg.lock(); if (group && index < group->num && group->threads[index]) { auto& spu = static_cast<SPUThread&>(*group->threads[index]); if (offset + args.size - 1 < 0x40000) // LS access { eal = spu.offset + offset; // redirect access } else if ((cmd & MFC_PUT_CMD) && args.size == 4 && (offset == SYS_SPU_THREAD_SNR1 || offset == SYS_SPU_THREAD_SNR2)) { spu.push_snr(SYS_SPU_THREAD_SNR2 == offset, read32(args.lsa)); return; } else { throw EXCEPTION("Invalid MMIO offset (cmd=0x%x, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)", cmd, args.lsa, args.ea, args.tag, args.size); } } else { throw EXCEPTION("Invalid thread type (cmd=0x%x, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)", cmd, args.lsa, args.ea, args.tag, args.size); } } switch (cmd & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK)) { case MFC_PUT_CMD: case MFC_PUTR_CMD: { memcpy(vm::get_ptr(eal), vm::get_ptr(offset + args.lsa), args.size); return; } case MFC_GET_CMD: { memcpy(vm::get_ptr(offset + args.lsa), vm::get_ptr(eal), args.size); return; } } throw EXCEPTION("Invalid command %s (cmd=0x%x, lsa=0x%x, ea=0x%llx, tag=0x%x, size=0x%x)", get_mfc_cmd_name(cmd), cmd, args.lsa, args.ea, args.tag, args.size); }
BOOST_FORCEINLINE void hardware_full_fence(void)
{
#if defined(_MSC_VER) && (defined(_M_AMD64) || (defined(_M_IX86) && defined(_M_IX86_FP) && _M_IX86_FP >= 2))
    // Use mfence only if SSE2 is available
    _mm_mfence();
#else
    long tmp;
    BOOST_ATOMIC_INTERLOCKED_EXCHANGE(&tmp, 0);
#endif
}
void g() {
  (void)_mm_getcsr();
  _mm_setcsr(1);
  _mm_sfence();
  _mm_clflush((void*)0);
  _mm_lfence();
  _mm_mfence();
  _mm_pause();
}
int sys_cond_wait(u32 cond_id, u64 timeout) { sys_cond.Log("sys_cond_wait(cond_id=%d, timeout=%lld)", cond_id, timeout); Cond* cond; if (!Emu.GetIdManager().GetIDData(cond_id, cond)) { return CELL_ESRCH; } Mutex* mutex = cond->mutex; u32 tid = GetCurrentPPUThread().GetId(); if (mutex->m_mutex.GetOwner() != tid) { return CELL_EPERM; } cond->m_queue.push(tid); mutex->recursive = 0; mutex->m_mutex.unlock(tid); u32 counter = 0; const u32 max_counter = timeout ? (timeout / 1000) : ~0; while (true) { /* switch (mutex->m_mutex.trylock(tid)) { case SMR_OK: mutex->m_mutex.unlock(tid); break; case SMR_SIGNAL: mutex->recursive = 1; return CELL_OK; } */ if (mutex->m_mutex.GetOwner() == tid) { _mm_mfence(); mutex->recursive = 1; return CELL_OK; } Sleep(1); if (counter++ > max_counter) { cond->m_queue.invalidate(tid); return CELL_ETIMEDOUT; } if (Emu.IsStopped()) { ConLog.Warning("sys_cond_wait(id=%d) aborted", cond_id); return CELL_OK; } } }
virtual void do_send_packet(const void* data, uint32_t length) override {
    REQUIRE(length <= 1500);

    if (!(ioreg(reg::STATUS) & STATUS_LU)) {
        dbgout() << "[i825x] Link not up. Dropping packet\n";
        return;
    }

    // prepare descriptor
    auto& td = tx_desc_[tx_tail_];
    tx_tail_ = (tx_tail_ + 1) % num_tx_descriptors;
    REQUIRE(td.upper.fields.status == 0);
    td.buffer_addr = virt_to_phys(data);
    td.lower.data = length | TXD_CMD_RS | TXD_CMD_EOP | TXD_CMD_IFCS;
    td.upper.data = 0;

    _mm_mfence();
    ioreg(reg::TDT0, tx_tail_);

    //dbgout() << "Waiting for packet to be sent.\n";
    for (uint32_t timeout = 100; !td.upper.fields.status; ) {
        __halt();
        if (!timeout--) {
#if 0
            // Dump stats
            constexpr uint32_t nstats = 0x100 / 4;
            static uint32_t stats[nstats];
            for (uint32_t i = 0; i < nstats; ++i) {
                stats[i] += ioreg(static_cast<reg>(0x4000 + i * 4));
                dbgout() << as_hex(stats[i]);
                dbgout() << ((i % 8 == 7) ? '\n' : ' ');
            }
#endif
            dbgout() << "Transfer NOT done. Timed out! STATUS = " << as_hex(ioreg(reg::STATUS))
                     << " TDH = " << ioreg(reg::TDH0) << " TDT " << ioreg(reg::TDT0) << "\n";
#if 0
            for (uint32_t i = 0; i < num_tx_descriptors; ++i) {
                dbgout() << as_hex(tx_desc_[i].buffer_addr) << " ";
                dbgout() << as_hex(tx_desc_[i].lower.data) << " ";
                dbgout() << as_hex(tx_desc_[i].upper.data) << " ";
                dbgout() << ((i % 3 == 2) ? '\n' : ' ');
            }
            dbgout() << "\n";
#endif
            return;
        }
    }

    //dbgout() << "[i825x] TX Status = " << as_hex(td.upper.fields.status) << "\n";
    REQUIRE(td.upper.fields.status == TXD_STAT_DD);
    td.upper.data = 0; // Mark ready for re-use
    REQUIRE(ioreg(reg::TDH0) == ioreg(reg::TDT0));
}
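The driver above fills a TX descriptor, issues _mm_mfence(), and only then writes the tail register that lets the NIC fetch the descriptor. A generic sketch of that ordering, with a hypothetical descriptor layout and doorbell pointer (none of these names come from the driver above):

#include <emmintrin.h>
#include <cstdint>

// Sketch only: hypothetical descriptor and doorbell layout, for illustration.
struct tx_descriptor { uint64_t buffer_addr; uint32_t cmd_len; uint32_t status; };

static void post_descriptor(tx_descriptor& desc, volatile uint32_t* doorbell,
                            uint64_t phys, uint32_t len, uint32_t tail)
{
    desc.buffer_addr = phys;
    desc.cmd_len     = len;
    desc.status      = 0;
    // The device may fetch the descriptor as soon as the doorbell is written,
    // so the descriptor stores must be globally visible first.
    _mm_mfence();
    *doorbell = tail; // MMIO write that hands ownership to the hardware
}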
VOID
MemUReadCachelines (
  IN       UINT8 Buffer[],
  IN       UINT32 Address,
  IN       UINT16 ClCount
  )
{
  UINTN Index;
  UINT32 *Dest;

  for (Index = 0; Index < ClCount * 16; Index++) {
    Dest = (void *) &Buffer [Index * 4];
    *Dest = __readfsdword (Address + Index * 4);
    _mm_mfence ();
  }
}
void f() {
  (void)_mm_getcsr(); // expected-warning{{implicitly declaring library function '_mm_getcsr'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_getcsr'}}
  _mm_setcsr(1); // expected-warning{{implicitly declaring library function '_mm_setcsr'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_setcsr'}}
  _mm_sfence(); // expected-warning{{implicitly declaring library function '_mm_sfence'}} \
  // expected-note{{include the header <xmmintrin.h> or explicitly provide a declaration for '_mm_sfence'}}
  _mm_clflush((void*)0); // expected-warning{{implicitly declaring library function '_mm_clflush'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_clflush'}}
  _mm_lfence(); // expected-warning{{implicitly declaring library function '_mm_lfence'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_lfence'}}
  _mm_mfence(); // expected-warning{{implicitly declaring library function '_mm_mfence'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_mfence'}}
  _mm_pause(); // expected-warning{{implicitly declaring library function '_mm_pause'}} \
  // expected-note{{include the header <emmintrin.h> or explicitly provide a declaration for '_mm_pause'}}
}
//----------------------------------------------------------------------------
// MemUFlushPattern:
//
//  Flush a pattern of 72 bit times (per DQ) from cache. This procedure is used
//  to ensure cache miss on the next read training.
//
//      In: Address  - Physical address to be flushed
//          ClCount  - number of cachelines to be flushed
//FUNC_ATTRIBUTE(noinline)
VOID
MemUFlushPattern (
  IN       UINT32 Address,
  IN       UINT16 ClCount
  )
{
  UINTN Index;

  // ssd - theory: a tlb flush is needed to avoid problems with clflush
  __writemsr (0x20F, __readmsr (0x20F));
  for (Index = 0; Index < ClCount; Index++) {
    // mfence prevents speculative execution of the clflush
    _mm_mfence ();
    _mm_clflush_fs ((void *) (size_t) (Address + Index * 64));
  }
}
KOKKOS_FORCEINLINE_FUNCTION
void memory_fence()
{
#if defined( KOKKOS_ATOMICS_USE_CUDA )
  __threadfence();
#elif defined( KOKKOS_ATOMICS_USE_GCC ) || \
      ( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ATOMICS_USE_INTEL ) )
  __sync_synchronize();
#elif defined( KOKKOS_ATOMICS_USE_INTEL )
  _mm_mfence();
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
  #pragma omp flush
#else
  #error "Error: memory_fence() not defined"
#endif
}
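Kokkos' memory_fence() selects a platform-specific full fence at compile time. Since C++11 the same intent can be expressed portably, and on x86 compilers typically lower it to mfence or a locked instruction. A minimal sketch:

#include <atomic>

// Sketch only: a sequentially consistent fence, the portable counterpart of
// _mm_mfence() / __sync_synchronize() in the back-ends above.
inline void memory_fence_portable()
{
    std::atomic_thread_fence(std::memory_order_seq_cst);
}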
inline void operator()(const tbb::blocked_range<size_t> &range) const
{
    for (size_t colIndex = range.begin(); colIndex < range.end(); ++colIndex) {
        double *__restrict__ col = &A[colIndex * LDA];
        int i = 0;
        for (; i < permSize - 8; i += 2) {
            //_m_prefetchw(&col[p[8].a]);
            //_m_prefetchw(&col[p[8].b]);
            swap(col[perm[i + 0].a], col[perm[i + 0].b]);
            swap(col[perm[i + 1].a], col[perm[i + 1].b]);
        }
        for (; i < permSize; ++i) {
            const int rowIndex = perm[i].a;
            const int otherRow = perm[i].b;
            swap(col[rowIndex], col[otherRow]);
        }
    }
    _mm_mfence();
}
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any compute work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
void WorkOnCompute(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint32_t& curDrawBE)
{
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return;
    }

    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;

    for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
        if (pDC->isCompute == false) return;

        // check dependencies
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return;
        }

        SWR_ASSERT(pDC->pDispatch != nullptr);
        DispatchQueue& queue = *pDC->pDispatch;

        // Is there any work remaining?
        if (queue.getNumQueued() > 0)
        {
            void* pSpillFillBuffer = nullptr;
            void* pScratchSpace = nullptr;
            uint32_t threadGroupId = 0;
            while (queue.getWork(threadGroupId))
            {
                queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
                queue.finishedWork();
            }

            // Ensure all streaming writes are globally visible before moving onto the next draw
            _mm_mfence();
        }
    }
}
void Shuffle_SSSE3(void* dest, const void* source, size_t size, const u8 red, const u8 green, const u8 blue, const u8 alpha)
{
    static const unsigned int PSD = 64;

    assert(source != NULL && dest != NULL);
    assert(red > -1 && red < 4 && green > -1 && green < 4 && blue > -1 && blue < 4 && alpha > -1 && alpha < 4 && "Invalid mask");
    assert(size % STRIDE == 0);

    const __m128i* source128 = reinterpret_cast<const __m128i*>(source);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i reg0 = _mm_setzero_si128();
    __m128i reg1 = _mm_setzero_si128();
    __m128i reg2 = _mm_setzero_si128();
    __m128i reg3 = _mm_setzero_si128();

    const __m128i mask128 = _mm_set_epi8(alpha+12, blue+12, green+12, red+12,
                                         alpha+8,  blue+8,  green+8,  red+8,
                                         alpha+4,  blue+4,  green+4,  red+4,
                                         alpha,    blue,    green,    red);

    for (size_t k = 0, length = size/STRIDE; k < length; ++k)
    {
        // TODO: put prefetch between calculations?(R.N)
        _mm_prefetch(reinterpret_cast<const s8*>(source128 + PSD), _MM_HINT_NTA);

        // work on entire cacheline before next prefetch
        // TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N)
        // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/
        reg0 = _mm_load_si128(source128++);
        reg1 = _mm_load_si128(source128++);
        _mm_stream_si128(dest128++, _mm_shuffle_epi8(reg0, mask128));
        reg2 = _mm_load_si128(source128++);
        _mm_stream_si128(dest128++, _mm_shuffle_epi8(reg1, mask128));
        reg3 = _mm_load_si128(source128++);
        _mm_stream_si128(dest128++, _mm_shuffle_epi8(reg2, mask128));
        _mm_stream_si128(dest128++, _mm_shuffle_epi8(reg3, mask128));
    }
    _mm_mfence(); // ensure last WC buffers get flushed to memory
}
void SPUThread::dmaTransfer(U32 cmd, U32 eal, U32 lsa, U32 size)
{
    if (cmd & (MFC_BARRIER_ENABLE | MFC_FENCE_ENABLE)) {
#ifdef NUCLEUS_ARCH_X86
        _mm_mfence();
#endif
    }

    const auto& memory = parent->memory;

    switch (cmd & ~(MFC_BARRIER_ENABLE | MFC_FENCE_ENABLE)) {
    case MFC_PUT_CMD:
    case MFC_PUTR_CMD:
        memcpy(memory->ptr(eal), memory->ptr(lsa), size);
        break;
    case MFC_GET_CMD:
        memcpy(memory->ptr(lsa), memory->ptr(eal), size);
        break;
    default:
        assert_true("Unexpected");
    }
}
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
    if (!KNOB_SINGLE_THREADED)
    {
        // Inform threads to finish up
        std::unique_lock<std::mutex> lock(pContext->WaitLock);
        pPool->inThreadShutdown = true;
        _mm_mfence();
        pContext->FifosNotEmpty.notify_all();
        lock.unlock();

        // Wait for threads to finish and destroy them
        for (uint32_t t = 0; t < pPool->numThreads; ++t)
        {
            pPool->threads[t]->join();
            delete(pPool->threads[t]);
        }

        // Clean up data used by threads
        free(pPool->pThreadData);
    }
}
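DestroyThreadPool sets a plain shutdown flag and relies on _mm_mfence() plus the mutex and condition variable to make it visible to the workers. A minimal sketch of the same handshake expressed with std::atomic (this is not SWR's code, just an illustration of the idiom):

#include <atomic>
#include <condition_variable>
#include <mutex>

// Sketch only: publish a shutdown flag and wake all waiters.
struct shutdown_signal
{
    std::mutex              lock;
    std::condition_variable cv;
    std::atomic<bool>       quit{false};

    void request_shutdown()
    {
        {
            std::lock_guard<std::mutex> guard(lock);
            quit.store(true, std::memory_order_release); // visible without an explicit mfence
        }
        cv.notify_all();
    }

    void wait_for_shutdown()
    {
        std::unique_lock<std::mutex> guard(lock);
        cv.wait(guard, [this] { return quit.load(std::memory_order_acquire); });
    }
};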
int cellSyncMutexLock(mem_ptr_t<CellSyncMutex> mutex) { cellSync->Log("cellSyncMutexLock(mutex=0x%x)", mutex.GetAddr()); if (!mutex.IsGood()) { return CELL_SYNC_ERROR_NULL_POINTER; } if (mutex.GetAddr() % 4) { return CELL_SYNC_ERROR_ALIGN; } be_t<u16> old_order; while (true) { const u32 old_data = mutex->m_data(); CellSyncMutex new_mutex; new_mutex.m_data() = old_data; old_order = new_mutex.m_order; new_mutex.m_order++; if (InterlockedCompareExchange(&mutex->m_data(), new_mutex.m_data(), old_data) == old_data) break; } while (old_order != mutex->m_freed) { Sleep(1); if (Emu.IsStopped()) { ConLog.Warning("cellSyncMutexLock(mutex=0x%x) aborted", mutex.GetAddr()); break; } } _mm_mfence(); return CELL_OK; }
void f0() { signed char tmp_c; // unsigned char tmp_Uc; signed short tmp_s; #ifdef USE_ALL unsigned short tmp_Us; #endif signed int tmp_i; unsigned int tmp_Ui; signed long long tmp_LLi; unsigned long long tmp_ULLi; float tmp_f; double tmp_d; void* tmp_vp; const void* tmp_vCp; char* tmp_cp; const char* tmp_cCp; int* tmp_ip; float* tmp_fp; const float* tmp_fCp; double* tmp_dp; const double* tmp_dCp; long long* tmp_LLip; #define imm_i 32 #define imm_i_0_2 0 #define imm_i_0_4 3 #define imm_i_0_8 7 #define imm_i_0_16 15 // Check this. #define imm_i_0_256 0 V2i* tmp_V2ip; V1LLi* tmp_V1LLip; V2LLi* tmp_V2LLip; // 64-bit V8c tmp_V8c; V4s tmp_V4s; V2i tmp_V2i; V1LLi tmp_V1LLi; #ifdef USE_3DNOW V2f tmp_V2f; #endif // 128-bit V16c tmp_V16c; V8s tmp_V8s; V4i tmp_V4i; V2LLi tmp_V2LLi; V4f tmp_V4f; V2d tmp_V2d; V2d* tmp_V2dp; V4f* tmp_V4fp; const V2d* tmp_V2dCp; const V4f* tmp_V4fCp; // 256-bit V32c tmp_V32c; V4d tmp_V4d; V8f tmp_V8f; V4LLi tmp_V4LLi; V8i tmp_V8i; V4LLi* tmp_V4LLip; V4d* tmp_V4dp; V8f* tmp_V8fp; const V4d* tmp_V4dCp; const V8f* tmp_V8fCp; tmp_V2LLi = __builtin_ia32_undef128(); tmp_V4LLi = __builtin_ia32_undef256(); tmp_i = __builtin_ia32_comieq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comilt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comile(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comigt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comige(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comineq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomieq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomilt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomile(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomigt(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomige(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_ucomineq(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_comisdeq(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdlt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdle(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdgt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdge(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_comisdneq(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdeq(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdlt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdle(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdgt(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdge(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_ucomisdneq(tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 0); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 1); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 2); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 3); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 4); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 5); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 6); tmp_V4f = __builtin_ia32_cmpps(tmp_V4f, tmp_V4f, 7); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 0); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 1); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 2); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 3); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 4); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 5); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 6); tmp_V4f = __builtin_ia32_cmpss(tmp_V4f, tmp_V4f, 7); tmp_V4f = __builtin_ia32_minps(tmp_V4f, tmp_V4f); tmp_V4f = __builtin_ia32_maxps(tmp_V4f, tmp_V4f); tmp_V4f = __builtin_ia32_minss(tmp_V4f, tmp_V4f); tmp_V4f = __builtin_ia32_maxss(tmp_V4f, tmp_V4f); tmp_V8c = __builtin_ia32_paddsb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_paddsw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_psubsb(tmp_V8c, tmp_V8c); 
tmp_V4s = __builtin_ia32_psubsw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_paddusb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_paddusw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_psubusb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_psubusw(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_pmulhw(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_pmulhuw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_pcmpeqb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pcmpeqw(tmp_V4s, tmp_V4s); tmp_V2i = __builtin_ia32_pcmpeqd(tmp_V2i, tmp_V2i); tmp_V8c = __builtin_ia32_pcmpgtb(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pcmpgtw(tmp_V4s, tmp_V4s); tmp_V2i = __builtin_ia32_pcmpgtd(tmp_V2i, tmp_V2i); tmp_V8c = __builtin_ia32_pmaxub(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pmaxsw(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_pminub(tmp_V8c, tmp_V8c); tmp_V4s = __builtin_ia32_pminsw(tmp_V4s, tmp_V4s); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 0); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 1); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 2); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 3); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 4); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 5); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 6); tmp_V2d = __builtin_ia32_cmppd(tmp_V2d, tmp_V2d, 7); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 0); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 1); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 2); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 3); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 4); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 5); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 6); tmp_V2d = __builtin_ia32_cmpsd(tmp_V2d, tmp_V2d, 7); tmp_V2d = __builtin_ia32_minpd(tmp_V2d, tmp_V2d); tmp_V2d = __builtin_ia32_maxpd(tmp_V2d, tmp_V2d); tmp_V2d = __builtin_ia32_minsd(tmp_V2d, tmp_V2d); tmp_V2d = __builtin_ia32_maxsd(tmp_V2d, tmp_V2d); tmp_V16c = __builtin_ia32_paddsb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_paddsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_psubsb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_psubsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_paddusb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_paddusw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_psubusb128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_psubusw128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_pmulhw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pmaxub128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_pmaxsw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pminub128(tmp_V16c, tmp_V16c); tmp_V8s = __builtin_ia32_pminsw128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_packsswb128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_packssdw128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_packuswb128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_pmulhuw128(tmp_V8s, tmp_V8s); tmp_V4f = __builtin_ia32_addsubps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_addsubpd(tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_haddps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_haddpd(tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_hsubps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_hsubpd(tmp_V2d, tmp_V2d); tmp_V8s = __builtin_ia32_phaddw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_phaddw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_phaddd128(tmp_V4i, tmp_V4i); tmp_V2i = __builtin_ia32_phaddd(tmp_V2i, tmp_V2i); tmp_V8s = __builtin_ia32_phaddsw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_phaddsw(tmp_V4s, tmp_V4s); tmp_V8s = __builtin_ia32_phsubw128(tmp_V8s, tmp_V8s); 
tmp_V4s = __builtin_ia32_phsubw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_phsubd128(tmp_V4i, tmp_V4i); tmp_V2i = __builtin_ia32_phsubd(tmp_V2i, tmp_V2i); tmp_V8s = __builtin_ia32_phsubsw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_phsubsw(tmp_V4s, tmp_V4s); tmp_V16c = __builtin_ia32_pmaddubsw128(tmp_V16c, tmp_V16c); tmp_V8c = __builtin_ia32_pmaddubsw(tmp_V8c, tmp_V8c); tmp_V8s = __builtin_ia32_pmulhrsw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_pmulhrsw(tmp_V4s, tmp_V4s); tmp_V16c = __builtin_ia32_pshufb128(tmp_V16c, tmp_V16c); tmp_V8c = __builtin_ia32_pshufb(tmp_V8c, tmp_V8c); tmp_V16c = __builtin_ia32_psignb128(tmp_V16c, tmp_V16c); tmp_V8c = __builtin_ia32_psignb(tmp_V8c, tmp_V8c); tmp_V8s = __builtin_ia32_psignw128(tmp_V8s, tmp_V8s); tmp_V4s = __builtin_ia32_psignw(tmp_V4s, tmp_V4s); tmp_V4i = __builtin_ia32_psignd128(tmp_V4i, tmp_V4i); tmp_V2i = __builtin_ia32_psignd(tmp_V2i, tmp_V2i); tmp_V16c = __builtin_ia32_pabsb128(tmp_V16c); tmp_V8c = __builtin_ia32_pabsb(tmp_V8c); tmp_V8s = __builtin_ia32_pabsw128(tmp_V8s); tmp_V4s = __builtin_ia32_pabsw(tmp_V4s); tmp_V4i = __builtin_ia32_pabsd128(tmp_V4i); tmp_V2i = __builtin_ia32_pabsd(tmp_V2i); tmp_V4s = __builtin_ia32_psllw(tmp_V4s, tmp_V1LLi); tmp_V2i = __builtin_ia32_pslld(tmp_V2i, tmp_V1LLi); tmp_V1LLi = __builtin_ia32_psllq(tmp_V1LLi, tmp_V1LLi); tmp_V4s = __builtin_ia32_psrlw(tmp_V4s, tmp_V1LLi); tmp_V2i = __builtin_ia32_psrld(tmp_V2i, tmp_V1LLi); tmp_V1LLi = __builtin_ia32_psrlq(tmp_V1LLi, tmp_V1LLi); tmp_V4s = __builtin_ia32_psraw(tmp_V4s, tmp_V1LLi); tmp_V2i = __builtin_ia32_psrad(tmp_V2i, tmp_V1LLi); tmp_V2i = __builtin_ia32_pmaddwd(tmp_V4s, tmp_V4s); tmp_V8c = __builtin_ia32_packsswb(tmp_V4s, tmp_V4s); tmp_V4s = __builtin_ia32_packssdw(tmp_V2i, tmp_V2i); tmp_V8c = __builtin_ia32_packuswb(tmp_V4s, tmp_V4s); tmp_i = __builtin_ia32_vec_ext_v2si(tmp_V2i, 0); __builtin_ia32_incsspd(tmp_Ui); __builtin_ia32_incsspq(tmp_ULLi); tmp_Ui = __builtin_ia32_rdsspd(tmp_Ui); tmp_ULLi = __builtin_ia32_rdsspq(tmp_ULLi); __builtin_ia32_saveprevssp(); __builtin_ia32_rstorssp(tmp_vp); __builtin_ia32_wrssd(tmp_Ui, tmp_vp); __builtin_ia32_wrssq(tmp_ULLi, tmp_vp); __builtin_ia32_wrussd(tmp_Ui, tmp_vp); __builtin_ia32_wrussq(tmp_ULLi, tmp_vp); __builtin_ia32_setssbsy(); __builtin_ia32_clrssbsy(tmp_vp); (void) __builtin_ia32_ldmxcsr(tmp_Ui); (void) _mm_setcsr(tmp_Ui); tmp_Ui = __builtin_ia32_stmxcsr(); tmp_Ui = _mm_getcsr(); (void)__builtin_ia32_fxsave(tmp_vp); (void)__builtin_ia32_fxsave64(tmp_vp); (void)__builtin_ia32_fxrstor(tmp_vp); (void)__builtin_ia32_fxrstor64(tmp_vp); (void)__builtin_ia32_xsave(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsave64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstor(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstor64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaveopt(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaveopt64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstors(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xrstors64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsavec(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsavec64(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaves(tmp_vp, tmp_ULLi); (void)__builtin_ia32_xsaves64(tmp_vp, tmp_ULLi); (void) __builtin_ia32_monitorx(tmp_vp, tmp_Ui, tmp_Ui); (void) __builtin_ia32_mwaitx(tmp_Ui, tmp_Ui, tmp_Ui); (void) __builtin_ia32_clzero(tmp_vp); (void) __builtin_ia32_cldemote(tmp_vp); tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i); tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f); tmp_i = __builtin_ia32_cvtss2si(tmp_V4f); tmp_i = __builtin_ia32_cvttss2si(tmp_V4f); tmp_i = __builtin_ia32_rdtsc(); tmp_i = 
__rdtsc(); tmp_i = __builtin_ia32_rdtscp(&tmp_Ui); tmp_LLi = __builtin_ia32_rdpmc(tmp_i); __builtin_ia32_wbnoinvd(); #ifdef USE_64 tmp_LLi = __builtin_ia32_cvtss2si64(tmp_V4f); tmp_LLi = __builtin_ia32_cvttss2si64(tmp_V4f); #endif tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f); (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp); (void) __builtin_ia32_storehps(tmp_V2ip, tmp_V4f); (void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f); tmp_i = __builtin_ia32_movmskps(tmp_V4f); tmp_i = __builtin_ia32_pmovmskb(tmp_V8c); (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi); (void) __builtin_ia32_sfence(); (void) _mm_sfence(); tmp_V4s = __builtin_ia32_psadbw(tmp_V8c, tmp_V8c); tmp_V4f = __builtin_ia32_rcpps(tmp_V4f); tmp_V4f = __builtin_ia32_rcpss(tmp_V4f); tmp_V4f = __builtin_ia32_rsqrtps(tmp_V4f); tmp_V4f = __builtin_ia32_rsqrtss(tmp_V4f); tmp_V4f = __builtin_ia32_sqrtps(tmp_V4f); tmp_V4f = __builtin_ia32_sqrtss(tmp_V4f); (void) __builtin_ia32_maskmovdqu(tmp_V16c, tmp_V16c, tmp_cp); tmp_i = __builtin_ia32_movmskpd(tmp_V2d); tmp_i = __builtin_ia32_pmovmskb128(tmp_V16c); (void) __builtin_ia32_movnti(tmp_ip, tmp_i); #ifdef USE_64 (void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi); #endif tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c); tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d); tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d); tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d); tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d); tmp_V4f = __builtin_ia32_cvtpd2ps(tmp_V2d); tmp_V4i = __builtin_ia32_cvttpd2dq(tmp_V2d); tmp_V2i = __builtin_ia32_cvttpd2pi(tmp_V2d); tmp_V2d = __builtin_ia32_cvtpi2pd(tmp_V2i); tmp_i = __builtin_ia32_cvtsd2si(tmp_V2d); tmp_i = __builtin_ia32_cvttsd2si(tmp_V2d); tmp_V4f = __builtin_ia32_cvtsd2ss(tmp_V4f, tmp_V2d); #ifdef USE_64 tmp_LLi = __builtin_ia32_cvtsd2si64(tmp_V2d); tmp_LLi = __builtin_ia32_cvttsd2si64(tmp_V2d); #endif tmp_V4i = __builtin_ia32_cvtps2dq(tmp_V4f); tmp_V4i = __builtin_ia32_cvttps2dq(tmp_V4f); (void) __builtin_ia32_clflush(tmp_vCp); (void) _mm_clflush(tmp_vCp); (void) __builtin_ia32_lfence(); (void) _mm_lfence(); (void) __builtin_ia32_mfence(); (void) _mm_mfence(); (void) __builtin_ia32_pause(); (void) _mm_pause(); tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i); tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i); tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i); tmp_V4s = __builtin_ia32_psrawi(tmp_V4s, tmp_i); tmp_V2i = __builtin_ia32_psradi(tmp_V2i, tmp_i); tmp_V4s = __builtin_ia32_psrlwi(tmp_V4s, tmp_i); tmp_V2i = __builtin_ia32_psrldi(tmp_V2i, tmp_i); tmp_V1LLi = __builtin_ia32_psrlqi(tmp_V1LLi, tmp_i); tmp_V1LLi = __builtin_ia32_pmuludq(tmp_V2i, tmp_V2i); tmp_V2LLi = __builtin_ia32_pmuludq128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_psraw128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_psrad128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_psrlw128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_psrld128(tmp_V4i, tmp_V4i); tmp_V2LLi = __builtin_ia32_psrlq128(tmp_V2LLi, tmp_V2LLi); tmp_V8s = __builtin_ia32_psllw128(tmp_V8s, tmp_V8s); tmp_V4i = __builtin_ia32_pslld128(tmp_V4i, tmp_V4i); tmp_V2LLi = __builtin_ia32_psllq128(tmp_V2LLi, tmp_V2LLi); tmp_V8s = __builtin_ia32_psllwi128(tmp_V8s, tmp_i); tmp_V4i = __builtin_ia32_pslldi128(tmp_V4i, tmp_i); tmp_V2LLi = __builtin_ia32_psllqi128(tmp_V2LLi, tmp_i); tmp_V8s = __builtin_ia32_psrlwi128(tmp_V8s, tmp_i); tmp_V4i = __builtin_ia32_psrldi128(tmp_V4i, tmp_i); tmp_V2LLi = __builtin_ia32_psrlqi128(tmp_V2LLi, tmp_i); tmp_V8s = __builtin_ia32_psrawi128(tmp_V8s, tmp_i); tmp_V4i = __builtin_ia32_psradi128(tmp_V4i, tmp_i); tmp_V8s = 
__builtin_ia32_pmaddwd128(tmp_V8s, tmp_V8s); (void) __builtin_ia32_monitor(tmp_vp, tmp_Ui, tmp_Ui); (void) __builtin_ia32_mwait(tmp_Ui, tmp_Ui); tmp_V16c = __builtin_ia32_lddqu(tmp_cCp); tmp_V2LLi = __builtin_ia32_palignr128(tmp_V2LLi, tmp_V2LLi, imm_i); tmp_V1LLi = __builtin_ia32_palignr(tmp_V1LLi, tmp_V1LLi, imm_i); #ifdef USE_SSE4 tmp_V16c = __builtin_ia32_pblendvb128(tmp_V16c, tmp_V16c, tmp_V16c); tmp_V2d = __builtin_ia32_blendvpd(tmp_V2d, tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_blendvps(tmp_V4f, tmp_V4f, tmp_V4f); tmp_V8s = __builtin_ia32_packusdw128(tmp_V4i, tmp_V4i); tmp_V16c = __builtin_ia32_pmaxsb128(tmp_V16c, tmp_V16c); tmp_V4i = __builtin_ia32_pmaxsd128(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_pmaxud128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_pmaxuw128(tmp_V8s, tmp_V8s); tmp_V16c = __builtin_ia32_pminsb128(tmp_V16c, tmp_V16c); tmp_V4i = __builtin_ia32_pminsd128(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_pminud128(tmp_V4i, tmp_V4i); tmp_V8s = __builtin_ia32_pminuw128(tmp_V8s, tmp_V8s); tmp_V2LLi = __builtin_ia32_pmuldq128(tmp_V4i, tmp_V4i); tmp_V4f = __builtin_ia32_roundps(tmp_V4f, imm_i_0_16); tmp_V4f = __builtin_ia32_roundss(tmp_V4f, tmp_V4f, imm_i_0_16); tmp_V2d = __builtin_ia32_roundsd(tmp_V2d, tmp_V2d, imm_i_0_16); tmp_V2d = __builtin_ia32_roundpd(tmp_V2d, imm_i_0_16); tmp_V4f = __builtin_ia32_insertps128(tmp_V4f, tmp_V4f, imm_i_0_256); #endif tmp_V4d = __builtin_ia32_addsubpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_addsubps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_haddpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_hsubps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_hsubpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_haddps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_maxpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_maxps256(tmp_V8f, tmp_V8f); tmp_V4d = __builtin_ia32_minpd256(tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_minps256(tmp_V8f, tmp_V8f); tmp_V2d = __builtin_ia32_vpermilvarpd(tmp_V2d, tmp_V2LLi); tmp_V4f = __builtin_ia32_vpermilvarps(tmp_V4f, tmp_V4i); tmp_V4d = __builtin_ia32_vpermilvarpd256(tmp_V4d, tmp_V4LLi); tmp_V8f = __builtin_ia32_vpermilvarps256(tmp_V8f, tmp_V8i); tmp_V4d = __builtin_ia32_blendvpd256(tmp_V4d, tmp_V4d, tmp_V4d); tmp_V8f = __builtin_ia32_blendvps256(tmp_V8f, tmp_V8f, tmp_V8f); tmp_V8f = __builtin_ia32_dpps256(tmp_V8f, tmp_V8f, 0x7); tmp_V4d = __builtin_ia32_cmppd256(tmp_V4d, tmp_V4d, 0); tmp_V8f = __builtin_ia32_cmpps256(tmp_V8f, tmp_V8f, 0); tmp_V4f = __builtin_ia32_cvtpd2ps256(tmp_V4d); tmp_V8i = __builtin_ia32_cvtps2dq256(tmp_V8f); tmp_V4i = __builtin_ia32_cvttpd2dq256(tmp_V4d); tmp_V4i = __builtin_ia32_cvtpd2dq256(tmp_V4d); tmp_V8i = __builtin_ia32_cvttps2dq256(tmp_V8f); tmp_V4d = __builtin_ia32_vperm2f128_pd256(tmp_V4d, tmp_V4d, 0x7); tmp_V8f = __builtin_ia32_vperm2f128_ps256(tmp_V8f, tmp_V8f, 0x7); tmp_V8i = __builtin_ia32_vperm2f128_si256(tmp_V8i, tmp_V8i, 0x7); tmp_V4d = __builtin_ia32_sqrtpd256(tmp_V4d); tmp_V8f = __builtin_ia32_sqrtps256(tmp_V8f); tmp_V8f = __builtin_ia32_rsqrtps256(tmp_V8f); tmp_V8f = __builtin_ia32_rcpps256(tmp_V8f); tmp_V4d = __builtin_ia32_roundpd256(tmp_V4d, 0x1); tmp_V8f = __builtin_ia32_roundps256(tmp_V8f, 0x1); tmp_i = __builtin_ia32_vtestzpd(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_vtestcpd(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_vtestnzcpd(tmp_V2d, tmp_V2d); tmp_i = __builtin_ia32_vtestzps(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_vtestcps(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_vtestnzcps(tmp_V4f, tmp_V4f); tmp_i = __builtin_ia32_vtestzpd256(tmp_V4d, tmp_V4d); 
tmp_i = __builtin_ia32_vtestcpd256(tmp_V4d, tmp_V4d); tmp_i = __builtin_ia32_vtestnzcpd256(tmp_V4d, tmp_V4d); tmp_i = __builtin_ia32_vtestzps256(tmp_V8f, tmp_V8f); tmp_i = __builtin_ia32_vtestcps256(tmp_V8f, tmp_V8f); tmp_i = __builtin_ia32_vtestnzcps256(tmp_V8f, tmp_V8f); tmp_i = __builtin_ia32_ptestz256(tmp_V4LLi, tmp_V4LLi); tmp_i = __builtin_ia32_ptestc256(tmp_V4LLi, tmp_V4LLi); tmp_i = __builtin_ia32_ptestnzc256(tmp_V4LLi, tmp_V4LLi); tmp_i = __builtin_ia32_movmskpd256(tmp_V4d); tmp_i = __builtin_ia32_movmskps256(tmp_V8f); __builtin_ia32_vzeroall(); __builtin_ia32_vzeroupper(); tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp); tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi); tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i); tmp_V4d = __builtin_ia32_maskloadpd256(tmp_V4dCp, tmp_V4LLi); tmp_V8f = __builtin_ia32_maskloadps256(tmp_V8fCp, tmp_V8i); __builtin_ia32_maskstorepd(tmp_V2dp, tmp_V2LLi, tmp_V2d); __builtin_ia32_maskstoreps(tmp_V4fp, tmp_V4i, tmp_V4f); __builtin_ia32_maskstorepd256(tmp_V4dp, tmp_V4LLi, tmp_V4d); __builtin_ia32_maskstoreps256(tmp_V8fp, tmp_V8i, tmp_V8f); #ifdef USE_3DNOW tmp_V8c = __builtin_ia32_pavgusb(tmp_V8c, tmp_V8c); tmp_V2i = __builtin_ia32_pf2id(tmp_V2f); tmp_V2f = __builtin_ia32_pfacc(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfadd(tmp_V2f, tmp_V2f); tmp_V2i = __builtin_ia32_pfcmpeq(tmp_V2f, tmp_V2f); tmp_V2i = __builtin_ia32_pfcmpge(tmp_V2f, tmp_V2f); tmp_V2i = __builtin_ia32_pfcmpgt(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfmax(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfmin(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfmul(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfrcp(tmp_V2f); tmp_V2f = __builtin_ia32_pfrcpit1(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfrcpit2(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfrsqrt(tmp_V2f); tmp_V2f = __builtin_ia32_pfrsqit1(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfsub(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfsubr(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pi2fd(tmp_V2i); tmp_V4s = __builtin_ia32_pmulhrw(tmp_V4s, tmp_V4s); tmp_V2i = __builtin_ia32_pf2iw(tmp_V2f); tmp_V2f = __builtin_ia32_pfnacc(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pfpnacc(tmp_V2f, tmp_V2f); tmp_V2f = __builtin_ia32_pi2fw(tmp_V2i); tmp_V2f = __builtin_ia32_pswapdsf(tmp_V2f); tmp_V2i = __builtin_ia32_pswapdsi(tmp_V2i); tmp_V4i = __builtin_ia32_sha1rnds4(tmp_V4i, tmp_V4i, imm_i_0_4); tmp_V4i = __builtin_ia32_sha1nexte(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha1msg1(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha1msg2(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha256rnds2(tmp_V4i, tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha256msg1(tmp_V4i, tmp_V4i); tmp_V4i = __builtin_ia32_sha256msg2(tmp_V4i, tmp_V4i); #endif }
error_code sys_lwmutex_lock(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex, u64 timeout) { sysPrxForUser.trace("sys_lwmutex_lock(lwmutex=*0x%x, timeout=0x%llx)", lwmutex, timeout); if (g_cfg.core.hle_lwmutex) { return sys_mutex_lock(ppu, lwmutex->sleep_queue, timeout); } const be_t<u32> tid(ppu.id); // try to lock lightweight mutex const be_t<u32> old_owner = lwmutex->vars.owner.compare_and_swap(lwmutex_free, tid); if (old_owner == lwmutex_free) { // locking succeeded return CELL_OK; } if (old_owner == tid) { // recursive locking if ((lwmutex->attribute & SYS_SYNC_RECURSIVE) == 0) { // if not recursive return CELL_EDEADLK; } if (lwmutex->recursive_count == -1) { // if recursion limit reached return CELL_EKRESOURCE; } // recursive locking succeeded lwmutex->recursive_count++; _mm_mfence(); return CELL_OK; } if (old_owner == lwmutex_dead) { // invalid or deleted mutex return CELL_EINVAL; } for (u32 i = 0; i < 10; i++) { busy_wait(); if (lwmutex->vars.owner.load() == lwmutex_free) { if (lwmutex->vars.owner.compare_and_swap_test(lwmutex_free, tid)) { // locking succeeded return CELL_OK; } } } // atomically increment waiter value using 64 bit op lwmutex->all_info++; if (lwmutex->vars.owner.compare_and_swap_test(lwmutex_free, tid)) { // locking succeeded --lwmutex->all_info; return CELL_OK; } // lock using the syscall const error_code res = _sys_lwmutex_lock(ppu, lwmutex->sleep_queue, timeout); lwmutex->all_info--; if (res == CELL_OK) { // locking succeeded auto old = lwmutex->vars.owner.exchange(tid); if (old != lwmutex_reserved) { fmt::throw_exception("Locking failed (lwmutex=*0x%x, owner=0x%x)" HERE, lwmutex, old); } return CELL_OK; } if (res == CELL_EBUSY && lwmutex->attribute & SYS_SYNC_RETRY) { while (true) { for (u32 i = 0; i < 10; i++) { busy_wait(); if (lwmutex->vars.owner.load() == lwmutex_free) { if (lwmutex->vars.owner.compare_and_swap_test(lwmutex_free, tid)) { return CELL_OK; } } } lwmutex->all_info++; if (lwmutex->vars.owner.compare_and_swap_test(lwmutex_free, tid)) { lwmutex->all_info--; return CELL_OK; } const u64 time0 = timeout ? get_system_time() : 0; const error_code res_ = _sys_lwmutex_lock(ppu, lwmutex->sleep_queue, timeout); if (res_ == CELL_OK) { lwmutex->vars.owner.release(tid); } else if (timeout && res_ != CELL_ETIMEDOUT) { const u64 time_diff = get_system_time() - time0; if (timeout <= time_diff) { lwmutex->all_info--; return not_an_error(CELL_ETIMEDOUT); } timeout -= time_diff; } lwmutex->all_info--; if (res_ != CELL_EBUSY) { return res_; } } } return res; }
error_code sys_lwmutex_trylock(ppu_thread& ppu, vm::ptr<sys_lwmutex_t> lwmutex) { sysPrxForUser.trace("sys_lwmutex_trylock(lwmutex=*0x%x)", lwmutex); if (g_cfg.core.hle_lwmutex) { return sys_mutex_trylock(ppu, lwmutex->sleep_queue); } const be_t<u32> tid(ppu.id); // try to lock lightweight mutex const be_t<u32> old_owner = lwmutex->vars.owner.compare_and_swap(lwmutex_free, tid); if (old_owner == lwmutex_free) { // locking succeeded return CELL_OK; } if (old_owner == tid) { // recursive locking if ((lwmutex->attribute & SYS_SYNC_RECURSIVE) == 0) { // if not recursive return CELL_EDEADLK; } if (lwmutex->recursive_count == -1) { // if recursion limit reached return CELL_EKRESOURCE; } // recursive locking succeeded lwmutex->recursive_count++; _mm_mfence(); return CELL_OK; } if (old_owner == lwmutex_dead) { // invalid or deleted mutex return CELL_EINVAL; } if (old_owner == lwmutex_reserved) { // should be locked by the syscall const error_code res = _sys_lwmutex_trylock(lwmutex->sleep_queue); if (res == CELL_OK) { // locking succeeded auto old = lwmutex->vars.owner.exchange(tid); if (old != lwmutex_reserved) { fmt::throw_exception("Locking failed (lwmutex=*0x%x, owner=0x%x)" HERE, lwmutex, old); } } return res; } // locked by another thread return not_an_error(CELL_EBUSY); }
bool spu_thread::write_reg(const u32 addr, const u32 value) { auto try_start = [this]() { if (status.atomic_op([](u32& status) { if (status & SPU_STATUS_RUNNING) { return false; } status = SPU_STATUS_RUNNING; return true; })) { state -= cpu_flag::stop; thread_ctrl::notify(static_cast<named_thread<spu_thread>&>(*this)); } }; const u32 offset = addr - RAW_SPU_BASE_ADDR - index * RAW_SPU_OFFSET - RAW_SPU_PROB_OFFSET; switch (offset) { case MFC_LSA_offs: { if (value >= 0x40000) { break; } g_tls_mfc[index].lsa = value; return true; } case MFC_EAH_offs: { g_tls_mfc[index].eah = value; return true; } case MFC_EAL_offs: { g_tls_mfc[index].eal = value; return true; } case MFC_Size_Tag_offs: { g_tls_mfc[index].tag = value & 0x1f; g_tls_mfc[index].size = (value >> 16) & 0x7fff; return true; } case MFC_Class_CMD_offs: { g_tls_mfc[index].cmd = MFC(value & 0xff); switch (value & 0xff) { case MFC_SNDSIG_CMD: case MFC_SNDSIGB_CMD: case MFC_SNDSIGF_CMD: { g_tls_mfc[index].size = 4; // Fallthrough } case MFC_PUT_CMD: case MFC_PUTB_CMD: case MFC_PUTF_CMD: case MFC_PUTS_CMD: case MFC_PUTBS_CMD: case MFC_PUTFS_CMD: case MFC_GET_CMD: case MFC_GETB_CMD: case MFC_GETF_CMD: case MFC_GETS_CMD: case MFC_GETBS_CMD: case MFC_GETFS_CMD: { if (g_tls_mfc[index].size) { // Perform transfer immediately do_dma_transfer(g_tls_mfc[index]); } // .cmd should be zero, which is equal to MFC_PPU_DMA_CMD_ENQUEUE_SUCCESSFUL g_tls_mfc[index] = {}; if (value & MFC_START_MASK) { try_start(); } return true; } case MFC_BARRIER_CMD: case MFC_EIEIO_CMD: case MFC_SYNC_CMD: { g_tls_mfc[index] = {}; _mm_mfence(); return true; } } break; } case Prxy_QueryType_offs: { // TODO // 0 - no query requested; cancel previous request // 1 - set (interrupt) status upon completion of any enabled tag groups // 2 - set (interrupt) status upon completion of all enabled tag groups if (value > 2) { break; } if (value) { int_ctrl[2].set(SPU_INT2_STAT_DMA_TAG_GROUP_COMPLETION_INT); // TODO } return true; } case Prxy_QueryMask_offs: { mfc_prxy_mask = value; return true; } case SPU_In_MBox_offs: { ch_in_mbox.push(*this, value); return true; } case SPU_RunCntl_offs: { if (value == SPU_RUNCNTL_RUN_REQUEST) { try_start(); } else if (value == SPU_RUNCNTL_STOP_REQUEST) { status &= ~SPU_STATUS_RUNNING; state += cpu_flag::stop; } else { break; } run_ctrl = value; return true; } case SPU_NPC_offs: { if ((value & 2) || value >= 0x40000) { break; } npc = value; return true; } case SPU_RdSigNotify1_offs: { push_snr(0, value); return true; } case SPU_RdSigNotify2_offs: { push_snr(1, value); return true; } } LOG_ERROR(SPU, "RawSPUThread[%d]: Write32(0x%x, value=0x%x): unknown/illegal offset (0x%x)", index, addr, value, offset); return false; }
void spu_interpreter::DSYNC(SPUThread& CPU, spu_opcode_t op)
{
    _mm_mfence();
}
void CopyGPUFrame_SSE4_1(void *pSrc, void *pDest, void *pCacheBlock, UINT width, UINT height, UINT pitch) { #if QTAV_HAVE(SSE4_1) //assert(((intptr_t)pCacheBlock & 0x0f) == 0 && (dst_pitch & 0x0f) == 0); __m128i x0, x1, x2, x3; __m128i *pLoad; __m128i *pStore; __m128i *pCache; UINT x, y, yLoad, yStore; UINT rowsPerBlock; UINT width64; UINT extraPitch; rowsPerBlock = CACHED_BUFFER_SIZE / pitch; width64 = (width + 63) & ~0x03f; extraPitch = (pitch - width64) / 16; pLoad = (__m128i *)pSrc; pStore = (__m128i *)pDest; const bool src_unaligned = ((intptr_t)pSrc) & 0x0f; const bool dst_unaligned = ((intptr_t)pDest & 0x0f); //if (src_unaligned || dst_unaligned) // qDebug("===========unaligned: src %d, dst: %d, extraPitch: %d", src_unaligned, dst_unaligned, extraPitch); // COPY THROUGH 4KB CACHED BUFFER for (y = 0; y < height; y += rowsPerBlock) { // ROWS LEFT TO COPY AT END if (y + rowsPerBlock > height) rowsPerBlock = height - y; pCache = (__m128i *)pCacheBlock; _mm_mfence(); // LOAD ROWS OF PITCH WIDTH INTO CACHED BLOCK for (yLoad = 0; yLoad < rowsPerBlock; yLoad++) { // COPY A ROW, CACHE LINE AT A TIME for (x = 0; x < pitch; x +=64) { // movntdqa x0 = _mm_stream_load_si128(pLoad + 0); x1 = _mm_stream_load_si128(pLoad + 1); x2 = _mm_stream_load_si128(pLoad + 2); x3 = _mm_stream_load_si128(pLoad + 3); if (src_unaligned) { // movdqu _mm_storeu_si128(pCache +0, x0); _mm_storeu_si128(pCache +1, x1); _mm_storeu_si128(pCache +2, x2); _mm_storeu_si128(pCache +3, x3); } else { // movdqa _mm_store_si128(pCache +0, x0); _mm_store_si128(pCache +1, x1); _mm_store_si128(pCache +2, x2); _mm_store_si128(pCache +3, x3); } pCache += 4; pLoad += 4; } } _mm_mfence(); pCache = (__m128i *)pCacheBlock; // STORE ROWS OF FRAME WIDTH FROM CACHED BLOCK for (yStore = 0; yStore < rowsPerBlock; yStore++) { // copy a row, cache line at a time for (x = 0; x < width64; x += 64) { // movdqa x0 = _mm_load_si128(pCache); x1 = _mm_load_si128(pCache + 1); x2 = _mm_load_si128(pCache + 2); x3 = _mm_load_si128(pCache + 3); if (dst_unaligned) { // movdqu _mm_storeu_si128(pStore, x0); _mm_storeu_si128(pStore + 1, x1); _mm_storeu_si128(pStore + 2, x2); _mm_storeu_si128(pStore + 3, x3); } else { // movntdq _mm_stream_si128(pStore, x0); _mm_stream_si128(pStore + 1, x1); _mm_stream_si128(pStore + 2, x2); _mm_stream_si128(pStore + 3, x3); } pCache += 4; pStore += 4; } pCache += extraPitch; pStore += extraPitch; } } #else Q_UNUSED(pSrc); Q_UNUSED(pDest); Q_UNUSED(pCacheBlock); Q_UNUSED(width); Q_UNUSED(height); Q_UNUSED(pitch); #endif //QTAV_HAVE(SSE4_1) }
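CopyGPUFrame_SSE4_1 above brackets its _mm_stream_load_si128 (movntdqa) loops with _mm_mfence() while staging frames out of write-combined GPU memory through a small cached buffer. A reduced sketch of just the streaming-load stage (the names here are illustrative only, not from QtAV):

#include <emmintrin.h>  // _mm_mfence
#include <smmintrin.h>  // SSE4.1: _mm_stream_load_si128

// Sketch only: read 16-byte blocks from write-combined memory into a cached
// staging buffer, then fence before the staged copy is consumed.
static void stream_load_block(__m128i* cached_dst, const __m128i* wc_src, int n128)
{
    for (int i = 0; i < n128; ++i)
    {
        cached_dst[i] = _mm_stream_load_si128(const_cast<__m128i*>(wc_src) + i);
    }
    _mm_mfence(); // order the non-temporal loads against later accesses
}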