// Scans u1 for its lowest set bit; if one exists the result is 53 minus its
// index. Otherwise u0 is scanned and the result is 26 minus that index.
// The success flag of the u0 scan is ignored, so the result is only
// meaningful when at least one of the two words is non-zero.
int last_one01( unsigned int u0, unsigned int u1 )
{
    unsigned long bit;
    int base = 53;

    if ( !_BitScanForward( &bit, u1 ) )
    {
        // Nothing set in u1: fall back to u0 and its smaller base.
        base = 26;
        _BitScanForward( &bit, u0 );
    }

    return base - bit;
}
// Scans u2 for its lowest set bit; if one exists the result is 80 minus its
// index. Otherwise u1 is scanned and the result is 53 minus that index.
// The success flag of the u1 scan is ignored, so the result is only
// meaningful when at least one of the two words is non-zero.
int last_one12( unsigned int u1, unsigned u2 )
{
    unsigned long bit;
    int base = 80;

    if ( !_BitScanForward( &bit, u2 ) )
    {
        // Nothing set in u2: fall back to u1 and its smaller base.
        base = 53;
        _BitScanForward( &bit, u1 );
    }

    return base - bit;
}
// Scans u2, then u1, then u0 for a lowest set bit, stopping at the first word
// that has one. The result is base - index with base 80, 53 or 26 for u2, u1
// or u0 respectively. The success flag of the final u0 scan is ignored, so
// the result is only meaningful when at least one word is non-zero.
int last_one210( unsigned int u2, unsigned int u1, unsigned int u0 )
{
    unsigned long bit;
    int base;

    if ( _BitScanForward( &bit, u2 ) )
    {
        base = 80;
    }
    else if ( _BitScanForward( &bit, u1 ) )
    {
        base = 53;
    }
    else
    {
        _BitScanForward( &bit, u0 );
        base = 26;
    }

    return base - bit;
}
/* Count the trailing zero bits of x. Returns 32 when x is zero. */
static int __inline rd_ctz(u32 x)
{
    /* _BitScanForward stores the bit position through an `unsigned long *`,
     * so the receiving variable must have exactly that type (passing the
     * address of an int is a pointer-type mismatch). */
    unsigned long bit = 0;

    if (_BitScanForward(&bit, x))
        return (int)bit;

    return 32;
}
/******************************** Common functions ********************************/ static unsigned LZ4_NbCommonBytes (register size_t val) { if (LZ4_isLittleEndian()) { if (LZ4_64bits()) { # if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanForward64( &r, (U64)val ); return (int)(r>>3); # elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctzll((U64)val) >> 3); # else static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; # endif } else /* 32 bits */ { # if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r; _BitScanForward( &r, (U32)val ); return (int)(r>>3); # elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctz((U32)val) >> 3); # else static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; # endif } }
// Returns the index of the lowest set bit of elem, or 32 when elem is zero.
inline unsigned int CountTrailingZeros(unsigned int elem)
{
    unsigned long lowestSetBit;
    const unsigned char found = _BitScanForward(&lowestSetBit, elem);

    // The scanned index is only read when the intrinsic reported a hit.
    return found ? lowestSetBit : 32;
}
// Walks the buffer from its last 32-bit word towards the first, skipping
// words that are all ones. For the first word that is not all ones, the word
// is converted with aux::network_to_host, inverted, and the number of
// trailing zero bits of the inverted value is added to 32 times the number of
// skipped words. Returns num * 32 when every word is all ones, and -1 (after
// TORRENT_ASSERT_FAIL) when neither bit-scan primitive is available.
int count_trailing_ones_hw(span<std::uint32_t const> buf)
{
	int const word_count = int(buf.size());
	std::uint32_t const* const words = buf.data();

	TORRENT_ASSERT(word_count >= 0);
	TORRENT_ASSERT(words != nullptr);

	// `skipped` counts the all-ones words already passed at the tail.
	for (int skipped = 0; skipped < word_count; ++skipped)
	{
		std::uint32_t const word = words[word_count - 1 - skipped];
		if (word == 0xffffffff) continue;

#if TORRENT_HAS_BUILTIN_CTZ
		// word != 0xffffffff, so the inverted value is never zero here
		std::uint32_t const inverted = ~aux::network_to_host(word);
		return skipped * 32 + __builtin_ctz(inverted);
#elif defined _MSC_VER
		// word != 0xffffffff, so the inverted value is never zero here
		std::uint32_t const inverted = ~aux::network_to_host(word);
		DWORD bit;
		_BitScanForward(&bit, inverted);
		return skipped * 32 + bit;
#else
		TORRENT_ASSERT_FAIL();
		return -1;
#endif
	}

	return word_count * 32;
}
// Returns the index of the lowest set bit of `a`.
// None of the three implementations checks for a == 0; the portable fallback
// would not terminate for that input.
Size findFirstBit(Size a)
{
#if defined(__GNUC__)
#  if defined(__X64__)
	return __builtin_ctzl(a);
#  else
	return __builtin_ctz(a);
#  endif
#elif defined(_MSC_VER)
	unsigned long bit;
#  if defined(__X64__)
	_BitScanForward64(&bit, a);
#  else
	_BitScanForward(&bit, a);
#  endif
	return bit;
#else
	// Portable fallback: shift right until the lowest bit is set.
	Size shifts = 0;
	for (; (a & 1) == 0; a >>= 1)
		++shifts;
	return shifts;
#endif
}
/*-******************************************************** * Dictionary training functions **********************************************************/ static unsigned ZDICT_NbCommonBytes (register size_t val) { if (MEM_isLittleEndian()) { if (MEM_64bits()) { # if defined(_MSC_VER) && defined(_WIN64) unsigned long r = 0; _BitScanForward64( &r, (U64)val ); return (unsigned)(r>>3); # elif defined(__GNUC__) && (__GNUC__ >= 3) return (__builtin_ctzll((U64)val) >> 3); # else static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; # endif } else { /* 32 bits */ # if defined(_MSC_VER) unsigned long r=0; _BitScanForward( &r, (U32)val ); return (unsigned)(r>>3); # elif defined(__GNUC__) && (__GNUC__ >= 3) return (__builtin_ctz((U32)val) >> 3); # else static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; # endif } } else { /* Big Endian CPU */
// Returns 53 minus the index of the lowest set bit of u1.
// The scan's success flag is ignored, so u1 is expected to be non-zero.
int last_one1( unsigned int u1 )
{
    unsigned long bit;
    _BitScanForward( &bit, u1 );

    const int position = 53 - bit;
    return position;
}
// Returns 26 minus the index of the lowest set bit of u0.
// The scan's success flag is ignored, so u0 is expected to be non-zero.
int last_one0( unsigned int u0 )
{
    unsigned long bit;
    _BitScanForward( &bit, u0 );

    const int position = 26 - bit;
    return position;
}
// Find-first-set for a long: returns the 1-based position of the lowest set
// bit of value, or 0 when value has no bit set.
int ffsl(long value)
{
    unsigned long bit = 0;

    if (!_BitScanForward(&bit, value))
        return 0;

    // The intrinsic reports a 0-based position; this function is 1-based.
    return bit + 1;
}
// Generates the mip chain of this buffer with compute dispatches.
// Each loop iteration downsamples from mip level TopMip into up to four
// following levels, then advances TopMip by the number of levels written.
// Does nothing when the buffer has no mip levels (m_NumMipMaps == 0).
void ColorBuffer::GenerateMipMaps(CommandContext& BaseContext)
{
    if (m_NumMipMaps == 0)
        return;

    ComputeContext& Context = BaseContext.GetComputeContext();

    Context.SetRootSignature(Graphics::g_GenerateMipsRS);

    Context.TransitionResource(*this, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
    Context.SetDynamicDescriptor(1, 0, m_SRVHandle);

    for (uint32_t TopMip = 0; TopMip < m_NumMipMaps; )
    {
        uint32_t SrcWidth = m_Width >> TopMip;
        uint32_t SrcHeight = m_Height >> TopMip;
        uint32_t DstWidth = SrcWidth >> 1;
        uint32_t DstHeight = SrcHeight >> 1;

        // Determine if the first downsample is more than 2:1.  This happens whenever
        // the source width or height is odd.
        uint32_t NonPowerOfTwo = (SrcWidth & 1) | (SrcHeight & 1) << 1;
        if (m_Format == DXGI_FORMAT_R8G8B8A8_UNORM_SRGB)
            Context.SetPipelineState(Graphics::g_GenerateMipsGammaPSO[NonPowerOfTwo]);
        else
            Context.SetPipelineState(Graphics::g_GenerateMipsLinearPSO[NonPowerOfTwo]);

        // We can downsample up to four times, but if the ratio between levels is not
        // exactly 2:1, we have to shift our blend weights, which gets complicated or
        // expensive.  Maybe we can update the code later to compute sample weights for
        // each successive downsample.  We use _BitScanForward to count number of zeros
        // in the low bits.  Zeros indicate we can divide by two without truncating.
        //
        // The intrinsic writes through an `unsigned long *`, so scan into a variable
        // of that type instead of aliasing a uint32_t. When both destination
        // dimensions are zero the intrinsic leaves the index undefined; in that case
        // fall back to writing a single mip for this pass.
        unsigned long LowestSetBit = 0;
        uint32_t AdditionalMips = 0;
        if (_BitScanForward(&LowestSetBit, DstWidth | DstHeight))
            AdditionalMips = LowestSetBit > 3 ? 3 : static_cast<uint32_t>(LowestSetBit);

        uint32_t NumMips = 1 + AdditionalMips;
        if (TopMip + NumMips > m_NumMipMaps)
            NumMips = m_NumMipMaps - TopMip;

        // These are clamped to 1 after computing additional mips because clamped
        // dimensions should not limit us from downsampling multiple times.  (E.g.
        // 16x1 -> 8x1 -> 4x1 -> 2x1 -> 1x1.)
        if (DstWidth == 0)
            DstWidth = 1;
        if (DstHeight == 0)
            DstHeight = 1;

        Context.SetConstants(0, TopMip, NumMips, 1.0f / DstWidth, 1.0f / DstHeight);
        Context.SetDynamicDescriptors(2, 0, NumMips, m_UAVHandle + TopMip + 1);
        Context.Dispatch2D(DstWidth, DstHeight);

        Context.InsertUAVBarrier(*this);

        TopMip += NumMips;
    }

    Context.TransitionResource(*this, D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE |
        D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
}
// Returns the 1-based position of the lowest set bit of i, or 0 when no bit
// is set.
xint32 xi_arrays_bscan32(xint32 i)
{
    // Windows DWORD is always 32 bits.
    DWORD zeroBased = 0;
    const unsigned char found = _BitScanForward(&zeroBased, i);

    // The Windows intrinsic counts from 0, whereas the POSIX convention this
    // function follows counts from 1.
    return found ? zeroBased + 1 : 0;
}
// Calls SetTexture(slot, srv) once for every bit set in textureSlotMask,
// visiting the set bits from lowest to highest.
void StateManager::SetTextureByMask(u32 textureSlotMask, ID3D11ShaderResourceView* srv)
{
	for (u32 remaining = textureSlotMask; remaining != 0; )
	{
		unsigned long slot;
		_BitScanForward(&slot, remaining);

		SetTexture(slot, srv);

		// Drop the slot just handled so the next scan finds the next one.
		remaining &= ~(1 << slot);
	}
}
// Interprets pData as a STORE_TILES_DESC and calls ProcessStoreTileBE once
// for every bit set in its attachmentMask, lowest bit first, passing the bit
// index as the render target attachment.
void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
{
    STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;

    unsigned long attachment = 0;
    for (uint32_t pending = pDesc->attachmentMask;
         _BitScanForward(&attachment, pending);
         pending &= ~(1 << attachment))
    {
        ProcessStoreTileBE(pDC, workerId, macroTile, pDesc, (SWR_RENDERTARGET_ATTACHMENT)attachment);
    }
}
/* Count the trailing zero bits of val. Returns 32 when val is zero. */
CPU_DATA CPU_CntTrailZeros (CPU_DATA val)
{
    DWORD bit_pos;

    if (val != 0u) {
        /* val is known to be non-zero, so the scan always finds a bit. */
        _BitScanForward(&bit_pos, (DWORD)val);
        return ((CPU_DATA)bit_pos);
    }

    return (32u);
}
////////////////////////////////////////////////////////////////////////// // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector, // packing the active mask bits // ex. bitmask 0011 -> (0, 1, 0, 0) // bitmask 1000 -> (3, 0, 0, 0) // bitmask 1100 -> (2, 3, 0, 0) Value* PackMask(uint32_t bitmask) { std::vector<Constant*> indices(4, C(0)); DWORD index; uint32_t elem = 0; while (_BitScanForward(&index, bitmask)) { indices[elem++] = C((int)index); bitmask &= ~(1 << index); } return ConstantVector::get(indices); }
// Returns the number of trailing zero bits of word.
// word must be non-zero: the GCC builtin is undefined for 0, the MSVC branch
// asserts on it, and the portable fallback would not terminate.
static inline int count_trailing_zeros(word_t word)
{
#if defined(__GNUC__)
    return __builtin_ctzl(word);
#elif defined(_MSC_VER)
    unsigned long index = 0;
    // The scan must run outside assert(): assert() expands to nothing when
    // NDEBUG is defined, which previously removed the scan itself and left
    // `index` uninitialized in release builds.
#  if defined(_M_AMD64)
    const unsigned char found = _BitScanForward64(&index, word);
#  else
    const unsigned char found = _BitScanForward(&index, word);
#  endif
    assert(found != 0);
    (void)found; // only read by the assert; silences unused warnings under NDEBUG
    return static_cast<int>(index);
#else
    // Portable fallback so that every configuration returns a value.
    // NOTE(review): assumes word_t is an unsigned integer type - confirm.
    int count = 0;
    while ((word & 1) == 0)
    {
        word >>= 1;
        ++count;
    }
    return count;
#endif
}
// Returns the index of the lowest set bit of i, or -1 when i is zero.
//
// The -1 result for zero matches the original inline assembly kept below for
// reference (setnz/dec/or turns the result into all ones when bsf finds no
// bit). _BitScanForward on its own leaves the index undefined for a zero
// input, so its return value has to be checked.
int32_t BitScanF(uint32_t i)
{
	DWORD result;
	if (!_BitScanForward(&result, i))
		return -1;
	/*
	_asm
	{
		xor edx,edx
		bsf eax,[i]
		setnz dl
		dec edx
		or eax,edx
	}
	*/
	return result;
}
/// <summary> /// Single step exception handler /// </summary> /// <param name="excpt">Exception information</param> /// <returns>Exception disposition</returns> LONG NTAPI DetourBase::StepHandler( PEXCEPTION_POINTERS excpt ) { DWORD index = 0; int found = _BitScanForward( &index, static_cast<DWORD>(excpt->ContextRecord->Dr6) ); if (found != 0 && index < 4 && _breakpoints.count( excpt->ExceptionRecord->ExceptionAddress )) { DetourBase* pInst = _breakpoints[excpt->ExceptionRecord->ExceptionAddress]; // Disable breakpoint at current index BitTestAndResetT( (LONG_PTR*)&excpt->ContextRecord->Dr7, 2 * index ); ((_NT_TIB*)NtCurrentTeb())->ArbitraryUserPointer = (void*)pInst; excpt->ContextRecord->NIP = (uintptr_t)pInst->_internalHandler; return EXCEPTION_CONTINUE_EXECUTION; } return EXCEPTION_CONTINUE_SEARCH; }
//Return the number of trailing zeros. Deliberately undefined if value == 0 inline unsigned countTrailingUnsetBits(unsigned value) { dbgassertex(value != 0); #if defined(__GNUC__) return __builtin_ctz(value); #elif defined (_WIN32) unsigned long index; _BitScanForward(&index, value); return (unsigned)index; #else unsigned mask = 1U; unsigned i; for (i=0; i < sizeof(unsigned)*8; i++) { if (value & mask) return i; mask = mask << 1; } return i; #endif }
/*
 * Strips the trailing zero bits from *x in place and returns how many bits
 * were shifted out.
 *
 * *x must be non-zero: the builtin/intrinsics are undefined for 0 and the
 * portable loop would not terminate.
 *
 * NOTE(review): the return type is not visible in this excerpt; presumably
 * it is int, matching `result` - confirm against the full file.
 */
count_zeroes(size_t *x)
{
    int result;
#if defined(HAVE_BUILTIN_CTZL)
    result = __builtin_ctzl(*x);
    *x >>= result;
#elif defined(HAVE_BITSCANFORWARD64)
    /* The intrinsic writes through an `unsigned long *`; passing the
     * address of an int is a pointer-type mismatch. */
    unsigned long pos = 0;
    _BitScanForward64(&pos, *x);
    result = (int)pos;
    *x >>= result;
#elif defined(HAVE_BITSCANFORWARD)
    /* Same pointer-type requirement as the 64-bit intrinsic. */
    unsigned long pos = 0;
    _BitScanForward(&pos, *x);
    result = (int)pos;
    *x >>= result;
#else
    result = 0;
    while ((*x & 1) == 0) {
        ++result;
        *x >>= 1;
    }
#endif
    return result;
}
// Back-end clear work item. pUserData is interpreted as a CLEAR_DESC.
//
// With KNOB_FAST_CLEAR set, every hot tile selected by the attachment mask
// (color attachments, depth, stencil) receives the clear value and is put in
// the HOTTILE_CLEAR state. Otherwise ("legacy clear") the clear function
// looked up in gClearTilesTable for the hot tile format is called for each
// selected attachment.
void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
    SWR_CONTEXT* pContext = pDC->pContext;
    HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    if (KNOB_FAST_CLEAR)
    {
        CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData;
        SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
        uint32_t numSamples = GetNumSamples(sampleCount);

        SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.

        RDTSC_BEGIN(BEClear, pDC->drawId);

        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
        {
            unsigned long rt = 0;
            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
            while (_BitScanForward(&rt, mask))
            {
                // Unsigned literal: avoids shifting a signed 1 into the sign bit.
                mask &= ~(1u << rt);

                HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);

                // All we want to do here is to mark the hot tile as being in a "needs clear" state.
                pHotTile->clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
                pHotTile->clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
                pHotTile->clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
                pHotTile->clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);
                pHotTile->state = HOTTILE_CLEAR;
            }
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
        {
            HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
            pHotTile->clearData[0] = *(uint32_t*)&pClear->clearDepth;
            pHotTile->state = HOTTILE_CLEAR;
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
        {
            HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);

            pHotTile->clearData[0] = pClear->clearStencil;
            pHotTile->state = HOTTILE_CLEAR;
        }

        RDTSC_END(BEClear, 1);
    }
    else
    {
        // Legacy clear
        CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData;
        RDTSC_BEGIN(BEClear, pDC->drawId);

        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
        {
            uint32_t clearData[4];
            clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
            clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
            clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
            clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);

            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
            SWR_ASSERT(pfnClearTiles != nullptr);

            unsigned long rt = 0;
            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
            while (_BitScanForward(&rt, mask))
            {
                // Unsigned literal: avoids shifting a signed 1 into the sign bit.
                mask &= ~(1u << rt);

                pfnClearTiles(pDC, hWorkerPrivateData, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
            }
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
        {
            // Only element 0 carries a value; the rest is zeroed so the callee
            // is never handed indeterminate data.
            uint32_t clearData[4] = {};
            clearData[0] = *(uint32_t*)&pClear->clearDepth;
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC, hWorkerPrivateData, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
        {
            // Only element 0 carries a value; the rest is zeroed so the callee
            // is never handed indeterminate data.
            uint32_t clearData[4] = {};
            clearData[0] = pClear->clearStencil;
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
            // Same null check as the color and depth paths above.
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC, hWorkerPrivateData, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
        }

        RDTSC_END(BEClear, 1);
    }
}
// Returns the index of the lowest set bit of v.
// The scan's success flag is ignored, so v is expected to be non-zero.
inline bitcount_t trailingzeros(uint32_t v)
{
    unsigned long lowest_set;
    _BitScanForward(&lowest_set, v);
    return lowest_set;
}
// Finds the first occurrence of the wide string wcs2 inside wcs1.
//
// Returns a pointer to the start of the match inside wcs1, wcs1 itself when
// wcs2 is empty, or NULL when wcs2 does not occur in wcs1.
//
// Three implementations are selected at run time from __isa_available:
// one using the SSE4.2 string-compare intrinsics, one using SSE2 compares
// plus a bit scan, and a plain character loop. The vector paths only load a
// whole XMMWORD when XMM_PAGE_SAFE() allows it and otherwise fall back to
// comparing one character at a time.
wchar_t * __cdecl wcsstr (
        const wchar_t * wcs1,
        const wchar_t * wcs2
        )
{
    const wchar_t *stmp1, *stmp2;
    __m128i zero, pattern, characters1, characters2;

    // An empty search string matches everything.
    if (0 == *wcs2)
        return (wchar_t *)wcs1;

    if (__isa_available > __ISA_AVAILABLE_SSE2)
    {
        wchar_t c;
        unsigned i;

        // Load XMM with first characters of wcs2.
        if (XMM_PAGE_SAFE(wcs2))
        {
            pattern = _mm_loadu_si128((__m128i*)wcs2);
        }
        else
        {
            // Start from an explicit zero vector rather than XOR-ing the
            // still uninitialized variable with itself.
            pattern = _mm_setzero_si128();
            c = *(stmp2 = wcs2);
            for (i = 0; i < XMM_CHARS; ++i)
            {
                pattern = _mm_srli_si128(pattern, sizeof(wchar_t));
                pattern = _mm_insert_epi16(pattern, c, (XMM_CHARS-1));
                if (0 != c) c = *++stmp2;
            }
        }

        for(;;)
        {
            // Check for partial match, if none step forward and continue.
            if (XMM_PAGE_SAFE(wcs1))
            {
                characters1 = _mm_loadu_si128((__m128i*)wcs1);
                // If no potential match or end found, try next XMMWORD.
                if (_mm_cmpistra(pattern, characters1, f_srch_sub))
                {
                    wcs1 += XMM_CHARS;
                    continue;
                }
                // If end found there was no match.
                else if (!_mm_cmpistrc(pattern, characters1, f_srch_sub))
                {
                    return NULL;
                }

                // Get position of potential match.
                wcs1 += _mm_cmpistri(pattern, characters1, f_srch_sub);
            }
            else
            {
                // If end of string found there was no match.
                if (0 == *wcs1)
                {
                    return NULL;
                }

                // If current character doesn't match first character
                // of search string try next character.
                if (*wcs1 != *wcs2)
                {
                    ++wcs1;
                    continue;
                }
            }

            // Potential match, compare to check for full match.
            stmp1 = wcs1;
            stmp2 = wcs2;
            for (;;)
            {
                // If next XMMWORD is page-safe for each string
                // do a XMMWORD comparison.
                if (XMM_PAGE_SAFE(stmp1) && XMM_PAGE_SAFE(stmp2))
                {
                    characters1 = _mm_loadu_si128((__m128i*)stmp1);
                    characters2 = _mm_loadu_si128((__m128i*)stmp2);

                    // If unequal then no match found.
                    if (!_mm_cmpistro(characters2, characters1, f_srch_sub))
                    {
                        break;
                    }

                    // If end of search string then match found.
                    else if (_mm_cmpistrs(characters2, characters1, f_srch_sub))
                    {
                        return (wchar_t *)wcs1;
                    }

                    stmp1 += XMM_CHARS;
                    stmp2 += XMM_CHARS;
                    continue;
                }

                // Compare next character.
                else
                {
                    // If end of search string then match found.
                    if (0 == *stmp2)
                    {
                        return (wchar_t *)wcs1;
                    }

                    // If unequal then no match found.
                    if (*stmp1 != *stmp2)
                    {
                        break;
                    }

                    // Character matched - try next character.
                    ++stmp1;
                    ++stmp2;
                }
            }

            // Match not found at current position, try next.
            ++wcs1;
        }
    }
    else if (__isa_available == __ISA_AVAILABLE_SSE2)
    {
        // _BitScanForward stores its result through an `unsigned long *`,
        // so the offset must have exactly that type.
        unsigned long offset;
        unsigned mask;

        // Build search pattern and zero pattern. Search pattern is
        // XMMWORD with the initial character of the search string
        // in every position. Zero pattern has a zero termination
        // character in every position.

        pattern = _mm_cvtsi32_si128(wcs2[0]);
        pattern = _mm_shufflelo_epi16(pattern, 0);
        pattern = _mm_shuffle_epi32(pattern, 0);
        zero = _mm_setzero_si128();

        // Main loop for searching wcs1.
        for (;;)
        {
            // If XMM check is safe advance wcs1 to the next
            // possible match or end.
            if (XMM_PAGE_SAFE(wcs1))
            {
                characters1 = _mm_loadu_si128((__m128i*)wcs1);
                characters2 = _mm_cmpeq_epi16(characters1, zero);
                characters1 = _mm_cmpeq_epi16(characters1, pattern);
                characters1 = _mm_or_si128(characters1, characters2);
                mask = _mm_movemask_epi8(characters1);

                // If no character match or end found try next XMMWORD.
                if (0 == mask)
                {
                    wcs1 += XMM_CHARS;
                    continue;
                }

                // Advance wcs1 pointer to next possible match or end.
                // The mask has one bit per byte, hence the division.
                _BitScanForward(&offset, mask);
                wcs1 += (offset/sizeof(wchar_t));
            }

            // If at the end of wcs1, then no match found.
            if (0 == wcs1[0])
                return NULL;

            // If a first-character match is found compare
            // strings to look for match.
            if (wcs2[0] == wcs1[0])
            {
                stmp1 = wcs1;
                stmp2 = wcs2;
                for (;;)
                {
                    // If aligned as specified advance to next
                    // possible difference or wcs2 end.
                    if (XMM_PAGE_SAFE(stmp2) && XMM_PAGE_SAFE(stmp1))
                    {
                        characters1 = _mm_loadu_si128((__m128i*)stmp1);
                        characters2 = _mm_loadu_si128((__m128i*)stmp2);
                        characters1 = _mm_cmpeq_epi16(characters1, characters2);
                        characters2 = _mm_cmpeq_epi16(characters2, zero);
                        characters1 = _mm_cmpeq_epi16(characters1, zero);
                        characters1 = _mm_or_si128(characters1, characters2);
                        mask = _mm_movemask_epi8(characters1);

                        // If mask is zero there is no difference and
                        // wcs2 does not end in this XMMWORD. Continue
                        // with next XMMWORD.
                        if (0 == mask)
                        {
                            stmp1 += XMM_CHARS;
                            stmp2 += XMM_CHARS;
                            continue;
                        }

                        // Advance string pointers to next significant
                        // character.
                        _BitScanForward(&offset, mask);
                        stmp1 += (offset/sizeof(wchar_t));
                        stmp2 += (offset/sizeof(wchar_t));
                    }

                    // If we've reached the end of wcs2 then a match
                    // has been found.
                    if (0 == stmp2[0])
                        return (wchar_t *)wcs1;

                    // If we've reached a difference then no match
                    // was found.
                    if (stmp1[0] != stmp2[0])
                        break;

                    // Otherwise advance to next character and try
                    // again.
                    ++stmp1;
                    ++stmp2;
                }
            }

            // Current character wasn't a match, try next character.
            ++wcs1;
        }
    }
    else
    {
        const wchar_t *cp = wcs1;
        const wchar_t *s1, *s2;

        while (*cp)
        {
            s1 = cp;
            s2 = wcs2;

            while ( *s1 && *s2 && !(*s1-*s2) )
                s1++, s2++;

            if (!*s2)
                return (wchar_t *) cp;

            cp++;
        }

        return NULL;
    }
}
void GenerateMipsHelper::GenerateMips(RenderDeviceD3D12Impl *pRenderDeviceD3D12, TextureViewD3D12Impl *pTexView, CommandContext& Ctx) { auto &ComputeCtx = Ctx.AsComputeContext(); ComputeCtx.SetRootSignature(m_pGenerateMipsRS); auto *pTexture = pTexView->GetTexture(); auto *pTexD3D12 = ValidatedCast<TextureD3D12Impl>( pTexture ); auto &TexDesc = pTexture->GetDesc(); auto *pSRV = pTexture->GetDefaultView(TEXTURE_VIEW_SHADER_RESOURCE); auto *pSRVD3D12Impl = ValidatedCast<TextureViewD3D12Impl>(pSRV); auto SRVDescriptorHandle = pSRVD3D12Impl->GetCPUDescriptorHandle(); Ctx.TransitionResource(pTexD3D12, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); auto *pd3d12Device = pRenderDeviceD3D12->GetD3D12Device(); for (uint32_t TopMip = 0; TopMip < TexDesc.MipLevels-1; ) { uint32_t SrcWidth = TexDesc.Width >> TopMip; uint32_t SrcHeight = TexDesc.Height >> TopMip; uint32_t DstWidth = SrcWidth >> 1; uint32_t DstHeight = SrcHeight >> 1; // Determine if the first downsample is more than 2:1. This happens whenever // the source width or height is odd. uint32_t NonPowerOfTwo = (SrcWidth & 1) | (SrcHeight & 1) << 1; if (TexDesc.Format == TEX_FORMAT_RGBA8_UNORM_SRGB) ComputeCtx.SetPipelineState(m_pGenerateMipsGammaPSO[NonPowerOfTwo]); else ComputeCtx.SetPipelineState(m_pGenerateMipsLinearPSO[NonPowerOfTwo]); // We can downsample up to four times, but if the ratio between levels is not // exactly 2:1, we have to shift our blend weights, which gets complicated or // expensive. Maybe we can update the code later to compute sample weights for // each successive downsample. We use _BitScanForward to count number of zeros // in the low bits. Zeros indicate we can divide by two without truncating. uint32_t AdditionalMips; _BitScanForward((unsigned long*)&AdditionalMips, DstWidth | DstHeight); uint32_t NumMips = 1 + (AdditionalMips > 3 ? 
3 : AdditionalMips); if (TopMip + NumMips > TexDesc.MipLevels-1) NumMips = TexDesc.MipLevels-1 - TopMip; // These are clamped to 1 after computing additional mips because clamped // dimensions should not limit us from downsampling multiple times. (E.g. // 16x1 -> 8x1 -> 4x1 -> 2x1 -> 1x1.) if (DstWidth == 0) DstWidth = 1; if (DstHeight == 0) DstHeight = 1; D3D12_DESCRIPTOR_HEAP_TYPE HeapType = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; auto DescriptorAlloc = Ctx.AllocateDynamicGPUVisibleDescriptor(HeapType, 5); CommandContext::ShaderDescriptorHeaps Heaps(DescriptorAlloc.GetDescriptorHeap()); ComputeCtx.SetDescriptorHeaps(Heaps); Ctx.GetCommandList()->SetComputeRootDescriptorTable(1, DescriptorAlloc.GetGpuHandle(0)); Ctx.GetCommandList()->SetComputeRootDescriptorTable(2, DescriptorAlloc.GetGpuHandle(1)); struct RootCBData { Uint32 SrcMipLevel; // Texture level of source mip Uint32 NumMipLevels; // Number of OutMips to write: [1, 4] float TexelSize[2]; // 1.0 / OutMip1.Dimensions } CBData = {TopMip, NumMips, 1.0f / static_cast<float>(DstWidth), 1.0f / static_cast<float>(DstHeight)}; Ctx.GetCommandList()->SetComputeRoot32BitConstants(0, 4, &CBData, 0); // TODO: Shouldn't we transition top mip to shader resource state? D3D12_CPU_DESCRIPTOR_HANDLE DstDescriptorRange = DescriptorAlloc.GetCpuHandle(); UINT DstRangeSize = 1+NumMips; D3D12_CPU_DESCRIPTOR_HANDLE SrcDescriptorRanges[5] = {}; SrcDescriptorRanges[0] = SRVDescriptorHandle; UINT SrcRangeSizes[5] = {1,1,1,1,1}; for(Uint32 u=0; u < NumMips; ++u) SrcDescriptorRanges[1+u] = pTexD3D12->GetUAVDescriptorHandle(TopMip+u+1, 0); pd3d12Device->CopyDescriptors(1, &DstDescriptorRange, &DstRangeSize, 1+NumMips, SrcDescriptorRanges, SrcRangeSizes, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); ComputeCtx.Dispatch((DstWidth+7)/8, (DstHeight+7)/8); Ctx.InsertUAVBarrier(*pTexD3D12, *pTexD3D12); TopMip += NumMips; } }
// Stores the index of the lowest set bit of mask in *index.
// *index is left undefined by the intrinsic when mask is zero.
//
// size_t is 64 bits wide on 64-bit Windows while _BitScanForward only looks
// at a 32-bit value, so bits 32..63 of the mask were silently dropped there;
// the 64-bit intrinsic is used on those targets instead.
void inline BSF( unsigned long* index, size_t& mask )
{
#if defined(_WIN64)
    _BitScanForward64( index, mask );
#else
    _BitScanForward( index, mask );
#endif
}
// Finds the first occurrence of the wide character ch in str.
//
// Returns a pointer to the matching character, or NULL when the terminating
// zero is reached first. Searching for ch == 0 returns a pointer to the
// terminator itself.
//
// Without SSE2 support a plain character loop is used. Otherwise whole
// XMMWORDs are compared whenever XMM_PAGE_SAFE() allows loading one, and
// single characters are compared when it does not.
wchar_t * __cdecl wcschr (
        const wchar_t * str,
        wchar_t ch
        )
{
    if (__isa_available < __ISA_AVAILABLE_SSE2)
    {
        while (*str && *str != ch)
            str++;

        // If the character is a match return pointer, otherwise
        // it must be the terminating zero and return NULL.
        return (*str == ch) ? (wchar_t *)str : NULL;
    }
    else
    {
        __m128i match, characters, temp, zero;
        unsigned mask;
        unsigned long offset;

        // Build match pattern with target character in every position.
        match = _mm_cvtsi32_si128(ch);
        match = _mm_shufflelo_epi16(match, 0);
        match = _mm_shuffle_epi32(match, 0);

        // Explicit zero vector, built once, instead of XOR-ing the
        // uninitialized temporary with itself on every iteration.
        zero = _mm_setzero_si128();

        for (;;)
        {
            // If the next XMMWORD does not overlap a page boundary check
            // it for match of character or zero.
            if (XMM_PAGE_SAFE(str))
            {
                // Check for match with either the search or zero character.
                // There may be more than one match, but only the first is
                // significant.
                characters = _mm_loadu_si128((__m128i*)str);
                temp = _mm_cmpeq_epi16(zero, characters);
                characters = _mm_cmpeq_epi16(characters, match);
                temp = _mm_or_si128(temp, characters);
                mask = _mm_movemask_epi8(temp);

                // If one or more matches was found, get the position of
                // the first one. If that character is the same as the
                // search character return the pointer to it, otherwise
                // it must be the terminating zero so return NULL.
                if (mask != 0)
                {
                    // The mask has one bit per byte, so offset is a byte
                    // offset and is added to the address, not the pointer.
                    _BitScanForward(&offset, mask);
                    str = (wchar_t *)(offset + (intptr_t)str);
                    return (*str == ch) ? (wchar_t *)str : NULL;
                }

                // No match found in this XMMWORD so skip to next.
                str += XMM_CHARS;
            }
            else
            {
                // If it is not safe to check an entire XMMWORD, check
                // a single character and try again.
                if (*str == ch) return (wchar_t *)str;
                if (*str == 0) return NULL;

                // No match so skip to next character.
                ++str;
            }
        }
    }
}
void StateManager::Apply() { if (!m_blendStates.empty()) { if (m_currentBlendState != m_blendStates.top().get()) { m_currentBlendState = (ID3D11BlendState*)m_blendStates.top().get(); D3D::context->OMSetBlendState(m_currentBlendState, nullptr, 0xFFFFFFFF); } } else ERROR_LOG(VIDEO, "Tried to apply without blend state!"); if (!m_depthStates.empty()) { if (m_currentDepthState != m_depthStates.top().get()) { m_currentDepthState = (ID3D11DepthStencilState*)m_depthStates.top().get(); D3D::context->OMSetDepthStencilState(m_currentDepthState, 0); } } else ERROR_LOG(VIDEO, "Tried to apply without depth state!"); if (!m_rasterizerStates.empty()) { if (m_currentRasterizerState != m_rasterizerStates.top().get()) { m_currentRasterizerState = (ID3D11RasterizerState*)m_rasterizerStates.top().get(); D3D::context->RSSetState(m_currentRasterizerState); } } else ERROR_LOG(VIDEO, "Tried to apply without rasterizer state!"); if (!m_dirtyFlags) { return; } if (m_dirtyFlags & DirtyFlag_Constants) { if (use_partial_buffer_update) { if (m_dirtyFlags & DirtyFlag_PixelConstants) { if (m_pending.pixelConstantsSize[0] == 0 && m_pending.pixelConstantsSize[1] == 0) { D3D::context->PSSetConstantBuffers(0, m_pending.pixelConstants[1] ? 
2 : 1, m_pending.pixelConstants); } else { D3D::context1->PSSetConstantBuffers1(0, 1, m_pending.pixelConstants, m_pending.pixelConstantsOffset, m_pending.pixelConstantsSize); } m_current.pixelConstants[0] = m_pending.pixelConstants[0]; m_current.pixelConstantsOffset[0] = m_pending.pixelConstantsOffset[0]; m_current.pixelConstantsSize[0] = m_pending.pixelConstantsSize[0]; m_current.pixelConstants[1] = m_pending.pixelConstants[1]; m_current.pixelConstantsOffset[1] = m_pending.pixelConstantsOffset[1]; m_current.pixelConstantsSize[1] = m_pending.pixelConstantsSize[1]; } if (m_dirtyFlags & DirtyFlag_VertexConstants) { if (m_pending.vertexConstantsSize == 0) { D3D::context1->VSSetConstantBuffers(0, 1, &m_pending.vertexConstants); } else { D3D::context1->VSSetConstantBuffers1(0, 1, &m_pending.vertexConstants, &m_pending.vertexConstantsOffset, &m_pending.vertexConstantsSize); } m_current.vertexConstants = m_pending.vertexConstants; m_current.vertexConstantsOffset = m_pending.vertexConstantsOffset; m_current.vertexConstantsSize = m_pending.vertexConstantsSize; } if (m_dirtyFlags & DirtyFlag_GeometryConstants) { if (m_pending.geometryConstantsSize == 0) { D3D::context->GSSetConstantBuffers(0, 1, &m_pending.geometryConstants); } else { D3D::context1->GSSetConstantBuffers1(0, 1, &m_pending.geometryConstants, &m_pending.geometryConstantsOffset, &m_pending.geometryConstantsSize); } m_current.geometryConstants = m_pending.geometryConstants; m_current.geometryConstantsOffset = m_pending.geometryConstantsOffset; m_current.geometryConstantsSize = m_pending.geometryConstantsSize; } if (m_dirtyFlags & DirtyFlag_HullDomainConstants) { if (m_pending.hulldomainConstantsSize == 0) { D3D::context->HSSetConstantBuffers(0, 1, &m_pending.hulldomainConstants); D3D::context->DSSetConstantBuffers(0, 1, &m_pending.hulldomainConstants); } else { D3D::context1->HSSetConstantBuffers1(0, 1, &m_pending.hulldomainConstants, &m_pending.hulldomainConstantsOffset, &m_pending.hulldomainConstantsSize); 
D3D::context1->DSSetConstantBuffers1(0, 1, &m_pending.hulldomainConstants, &m_pending.hulldomainConstantsOffset, &m_pending.hulldomainConstantsSize); } m_current.hulldomainConstants = m_pending.hulldomainConstants; m_current.hulldomainConstantsOffset = m_pending.hulldomainConstantsOffset; m_current.hulldomainConstantsSize = m_pending.hulldomainConstantsSize; } } else { if (m_dirtyFlags & DirtyFlag_PixelConstants) { D3D::context->PSSetConstantBuffers(0, m_pending.pixelConstants[1] ? 2 : 1, m_pending.pixelConstants); m_current.pixelConstants[0] = m_pending.pixelConstants[0]; m_current.pixelConstants[1] = m_pending.pixelConstants[1]; } if (m_dirtyFlags & DirtyFlag_VertexConstants) { D3D::context->VSSetConstantBuffers(0, 1, &m_pending.vertexConstants); m_current.vertexConstants = m_pending.vertexConstants; } if (m_dirtyFlags & DirtyFlag_GeometryConstants) { D3D::context->GSSetConstantBuffers(0, 1, &m_pending.geometryConstants); m_current.geometryConstants = m_pending.geometryConstants; } if (m_dirtyFlags & DirtyFlag_HullDomainConstants) { if (g_ActiveConfig.backend_info.bSupportsTessellation) { D3D::context->HSSetConstantBuffers(0, 1, &m_pending.hulldomainConstants); D3D::context->DSSetConstantBuffers(0, 1, &m_pending.hulldomainConstants); } m_current.hulldomainConstants = m_pending.hulldomainConstants; } } } if (m_dirtyFlags & (DirtyFlag_Buffers | DirtyFlag_InputAssembler)) { if (m_dirtyFlags & DirtyFlag_VertexBuffer) { D3D::context->IASetVertexBuffers(0, 1, &m_pending.vertexBuffer, &m_pending.vertexBufferStride, &m_pending.vertexBufferOffset); m_current.vertexBuffer = m_pending.vertexBuffer; m_current.vertexBufferStride = m_pending.vertexBufferStride; m_current.vertexBufferOffset = m_pending.vertexBufferOffset; } if (m_dirtyFlags & DirtyFlag_IndexBuffer) { D3D::context->IASetIndexBuffer(m_pending.indexBuffer, DXGI_FORMAT_R16_UINT, 0); m_current.indexBuffer = m_pending.indexBuffer; } if (m_current.topology != m_pending.topology) { 
D3D::context->IASetPrimitiveTopology(m_pending.topology); m_current.topology = m_pending.topology; } if (m_current.inputLayout != m_pending.inputLayout) { D3D::context->IASetInputLayout(m_pending.inputLayout); m_current.inputLayout = m_pending.inputLayout; } } u32 dirty_elements = m_dirtyFlags & DirtyFlag_Textures; if (dirty_elements) { while (dirty_elements) { unsigned long index; _BitScanForward(&index, dirty_elements); D3D::context->PSSetShaderResources(index, 1, &m_pending.textures[index]); D3D::context->DSSetShaderResources(index, 1, &m_pending.textures[index]); m_current.textures[index] = m_pending.textures[index]; dirty_elements &= ~(1 << index); } } dirty_elements = (m_dirtyFlags & DirtyFlag_Samplers) >> 8; if (dirty_elements) { while (dirty_elements) { unsigned long index; _BitScanForward(&index, dirty_elements); D3D::context->PSSetSamplers(index, 1, &m_pending.samplers[index]); D3D::context->DSSetSamplers(index, 1, &m_pending.samplers[index]); m_current.samplers[index] = m_pending.samplers[index]; dirty_elements &= ~(1 << index); } } if (m_dirtyFlags & DirtyFlag_Shaders) { if (m_current.pixelShader != m_pending.pixelShader) { D3D::context->PSSetShader(m_pending.pixelShader, nullptr, 0); m_current.pixelShader = m_pending.pixelShader; } if (m_current.vertexShader != m_pending.vertexShader) { D3D::context->VSSetShader(m_pending.vertexShader, nullptr, 0); m_current.vertexShader = m_pending.vertexShader; } if (m_current.geometryShader != m_pending.geometryShader) { D3D::context->GSSetShader(m_pending.geometryShader, nullptr, 0); m_current.geometryShader = m_pending.geometryShader; } if (g_ActiveConfig.backend_info.bSupportsTessellation) { if (m_current.hullShader != m_pending.hullShader) { D3D::context->HSSetShader(m_pending.hullShader, nullptr, 0); m_current.hullShader = m_pending.hullShader; } if (m_current.domainShader != m_pending.domainShader) { D3D::context->DSSetShader(m_pending.domainShader, nullptr, 0); m_current.domainShader = m_pending.domainShader; 
} } } m_dirtyFlags = 0; }