Example #1
0
/*
 * Position of the lowest set bit across the pair (u0, u1), scanning u1 first.
 *
 * Returns 53 minus the bit index found in u1 when u1 is non-zero; otherwise
 * 26 minus the bit index found in u0.  When both words are zero the scan
 * never writes the index, so the result is indeterminate.
 */
int
last_one01( unsigned int u0, unsigned int u1 )
{
    unsigned long bit;

    if ( !_BitScanForward( &bit, u1 ) ) {
        /* Nothing set in u1: fall back to u0. */
        _BitScanForward( &bit, u0 );
        return 26 - bit;
    }
    return 53 - bit;
}
Example #2
0
/*
 * Position of the lowest set bit across the pair (u1, u2), scanning u2 first.
 *
 * Returns 80 minus the bit index found in u2 when u2 is non-zero; otherwise
 * 53 minus the bit index found in u1.  When both words are zero the scan
 * never writes the index, so the result is indeterminate.
 */
int
last_one12( unsigned int u1, unsigned u2 )
{
    unsigned long bit;

    if ( _BitScanForward( &bit, u2 ) == 0 ) {
        /* Nothing set in u2: fall back to u1. */
        _BitScanForward( &bit, u1 );
        return 53 - bit;
    }
    return 80 - bit;
}
Example #3
0
/*
 * Position of the lowest set bit across the triple (u2, u1, u0).
 *
 * The words are scanned in the order u2, u1, u0; the first non-zero word
 * decides the result, which is base - bit_index with base 80, 53 and 26
 * respectively.  When all three words are zero the scan never writes the
 * index, so the result is indeterminate.
 */
int
last_one210( unsigned int u2, unsigned int u1, unsigned int u0 )
{
    unsigned long bit;
    int base;

    if ( _BitScanForward( &bit, u2 ) ) {
        base = 80;
    } else if ( _BitScanForward( &bit, u1 ) ) {
        base = 53;
    } else {
        _BitScanForward( &bit, u0 );
        base = 26;
    }
    return base - bit;
}
Example #4
0
/*
 * Count trailing zero bits of x.
 *
 * Returns the index of the lowest set bit, or 32 when x is zero.
 * The scan result is held in an `unsigned long` because that is the type
 * _BitScanForward writes through; passing the address of an `int` is a
 * pointer-type mismatch.
 */
static int __inline rd_ctz(u32 x) {
	unsigned long idx = 0;
	if (_BitScanForward(&idx, x))
		return (int)idx;
	return 32;
}
Example #5
0
File: lz4.c Project: BobWay/rippled
/********************************
   Common functions
********************************/
static unsigned LZ4_NbCommonBytes (register size_t val)
{
    if (LZ4_isLittleEndian())
    {
        if (LZ4_64bits())
        {
#       if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
            unsigned long r = 0;
            _BitScanForward64( &r, (U64)val );
            return (int)(r>>3);
#       elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
            return (__builtin_ctzll((U64)val) >> 3);
#       else
            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
#       endif
        }
        else /* 32 bits */
        {
#       if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
            unsigned long r;
            _BitScanForward( &r, (U32)val );
            return (int)(r>>3);
#       elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
            return (__builtin_ctz((U32)val) >> 3);
#       else
            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
#       endif
        }
    }
Example #6
0
// Number of zero bits below the lowest set bit of elem; 32 when elem is zero.
inline unsigned int CountTrailingZeros(unsigned int elem)
{
	unsigned long lowestSet;
	if ( !_BitScanForward(&lowestSet, elem) )
	{
		// No bit set at all: every one of the 32 bits is a trailing zero.
		return 32;
	}
	return lowestSet;
}
Example #7
0
	// Counts the consecutive one-bits at the end of buf, walking the 32-bit
	// words from the last towards the first. Words equal to 0xffffffff
	// contribute 32 bits each; the first other word is converted with
	// aux::network_to_host(), inverted, and its trailing zeros are added.
	// Returns num_words * 32 when every word is all-ones. On builds with
	// neither a ctz builtin nor MSVC it asserts and returns -1.
	int count_trailing_ones_hw(span<std::uint32_t const> buf)
	{
		std::uint32_t const* words = buf.data();
		auto const word_count = int(buf.size());

		TORRENT_ASSERT(word_count >= 0);
		TORRENT_ASSERT(words != nullptr);

		// `skipped` is the number of all-ones words already passed at the tail
		for (int skipped = 0; skipped < word_count; ++skipped)
		{
			std::uint32_t const word = words[word_count - 1 - skipped];
			if (word == 0xffffffff) continue;

#if TORRENT_HAS_BUILTIN_CTZ
			std::uint32_t const inverted = ~aux::network_to_host(word);
			return skipped * 32 + __builtin_ctz(inverted);
#elif defined _MSC_VER
			std::uint32_t const inverted = ~aux::network_to_host(word);
			DWORD first_zero;
			_BitScanForward(&first_zero, inverted);
			return skipped * 32 + first_zero;
#else
			TORRENT_ASSERT_FAIL();
			return -1;
#endif
		}

		return word_count * 32;
	}
Example #8
0
// Returns the zero-based index of the lowest set bit of `a`.
//
// `a` must be non-zero: the compiler builtins/intrinsics used below leave the
// result unspecified for 0. The portable fallback returns the bit width of
// Size in that case instead of looping forever.
Size findFirstBit(Size a) {
#ifdef __GNUC__
#ifdef __X64__
    // `long` is only 32 bits wide on LLP64 targets (e.g. MinGW-w64), so the
    // `long long` flavour is used to make sure all 64 bits are examined.
    return __builtin_ctzll(a);
#else
    return __builtin_ctz(a);
#endif
#elif defined(_MSC_VER)
    unsigned long pos;
#ifdef __X64__
	_BitScanForward64(&pos, a);
#else
    _BitScanForward(&pos, a);
#endif
	return pos;
#else
	//Very naive implementation, bounded so that a == 0 terminates.
	const Size bits = sizeof(Size) * 8;
	Size c = 0;
	while(c < bits && !(a & 1)) {
		a >>= 1;
		c++;
	}
	return c;
#endif
}
Example #9
0
/*-********************************************************
*  Dictionary training functions
**********************************************************/
static unsigned ZDICT_NbCommonBytes (register size_t val)
{
    if (MEM_isLittleEndian()) {
        if (MEM_64bits()) {
#       if defined(_MSC_VER) && defined(_WIN64)
            unsigned long r = 0;
            _BitScanForward64( &r, (U64)val );
            return (unsigned)(r>>3);
#       elif defined(__GNUC__) && (__GNUC__ >= 3)
            return (__builtin_ctzll((U64)val) >> 3);
#       else
            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
#       endif
        } else { /* 32 bits */
#       if defined(_MSC_VER)
            unsigned long r=0;
            _BitScanForward( &r, (U32)val );
            return (unsigned)(r>>3);
#       elif defined(__GNUC__) && (__GNUC__ >= 3)
            return (__builtin_ctz((U32)val) >> 3);
#       else
            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
#       endif
        }
    } else {  /* Big Endian CPU */
Example #10
0
/*
 * Returns 53 minus the index of the lowest set bit of u1.
 * The scan writes nothing when u1 is zero, so the result is then
 * indeterminate.
 */
int
last_one1( unsigned int u1 )
{
    unsigned long bit;

    (void)_BitScanForward( &bit, u1 );
    return (int)( 53 - bit );
}
Example #11
0
/*
 * Returns 26 minus the index of the lowest set bit of u0.
 * The scan writes nothing when u0 is zero, so the result is then
 * indeterminate.
 */
int
last_one0( unsigned int u0 )
{
    unsigned long bit;

    (void)_BitScanForward( &bit, u0 );
    return (int)( 26 - bit );
}
Example #12
0
/*
 * Find-first-set for a long: returns the 1-based position of the least
 * significant set bit of value, or 0 when value has no bit set.
 */
int ffsl(long value)
{
	unsigned long bitPos = 0;

	if (!_BitScanForward(&bitPos, value))
		return 0;

	/* The intrinsic is 0-based, the return value is 1-based. */
	return bitPos + 1;
}
Example #13
0
// Fills the mip levels below the top level with compute dispatches.
//
// Does nothing when m_NumMipMaps is 0. Otherwise the resource is moved to the
// unordered-access state, the mip chain is produced in batches of up to four
// levels per dispatch, and the resource is finally transitioned to the pixel /
// non-pixel shader resource states.
void ColorBuffer::GenerateMipMaps(CommandContext& BaseContext)
{
	if (m_NumMipMaps == 0)
		return;

	ComputeContext& Context = BaseContext.GetComputeContext();

	Context.SetRootSignature(Graphics::g_GenerateMipsRS);

	Context.TransitionResource(*this, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
	Context.SetDynamicDescriptor(1, 0, m_SRVHandle);

	for (uint32_t TopMip = 0; TopMip < m_NumMipMaps; )
	{
		uint32_t SrcWidth = m_Width >> TopMip;
		uint32_t SrcHeight = m_Height >> TopMip;
		uint32_t DstWidth = SrcWidth >> 1;
		uint32_t DstHeight = SrcHeight >> 1;

		// Determine if the first downsample is more than 2:1.  This happens whenever
		// the source width or height is odd.
		uint32_t NonPowerOfTwo = (SrcWidth & 1) | (SrcHeight & 1) << 1;
		if (m_Format == DXGI_FORMAT_R8G8B8A8_UNORM_SRGB)
			Context.SetPipelineState(Graphics::g_GenerateMipsGammaPSO[NonPowerOfTwo]);
		else
			Context.SetPipelineState(Graphics::g_GenerateMipsLinearPSO[NonPowerOfTwo]);

		// We can downsample up to four times, but if the ratio between levels is not
		// exactly 2:1, we have to shift our blend weights, which gets complicated or
		// expensive.  Maybe we can update the code later to compute sample weights for
		// each successive downsample.  We use _BitScanForward to count number of zeros
		// in the low bits.  Zeros indicate we can divide by two without truncating.
		//
		// The intrinsic writes through an `unsigned long*` and leaves the index
		// undefined when the mask is zero, so a correctly typed local is used and
		// the "no bit set" case falls back to zero additional mips.
		unsigned long TrailingZeros = 0;
		if (!_BitScanForward(&TrailingZeros, DstWidth | DstHeight))
			TrailingZeros = 0;
		uint32_t AdditionalMips = static_cast<uint32_t>(TrailingZeros);
		uint32_t NumMips = 1 + (AdditionalMips > 3 ? 3 : AdditionalMips);
		if (TopMip + NumMips > m_NumMipMaps)
			NumMips = m_NumMipMaps - TopMip;

		// These are clamped to 1 after computing additional mips because clamped
		// dimensions should not limit us from downsampling multiple times.  (E.g.
		// 16x1 -> 8x1 -> 4x1 -> 2x1 -> 1x1.)
		if (DstWidth == 0)
			DstWidth = 1;
		if (DstHeight == 0)
			DstHeight = 1;

		Context.SetConstants(0, TopMip, NumMips, 1.0f / DstWidth, 1.0f / DstHeight);
		Context.SetDynamicDescriptors(2, 0, NumMips, m_UAVHandle + TopMip + 1);
		Context.Dispatch2D(DstWidth, DstHeight);

		Context.InsertUAVBarrier(*this);

		TopMip += NumMips;
	}

	Context.TransitionResource(*this, D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE |
		D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
}
Example #14
0
// Returns the 1-based position of the lowest set bit of i, or 0 when no bit
// is set.
xint32 xi_arrays_bscan32(xint32 i) {
	DWORD pos = 0; // windows DWORD is always 32 bits.
	if (!_BitScanForward(&pos, i))
		return 0;
	// The windows index counts from 0, the posix convention counts from 1.
	return pos+1;
}
Example #15
0
	// Calls SetTexture(slot, srv) for every slot whose bit is set in
	// textureSlotMask, visiting slots from the lowest bit upwards.
	void StateManager::SetTextureByMask(u32 textureSlotMask, ID3D11ShaderResourceView* srv)
	{
		while (textureSlotMask)
		{
			unsigned long index;
			_BitScanForward(&index, textureSlotMask);
			SetTexture(index, srv);
			// Unsigned literal: `1 << 31` would shift into the sign bit of int.
			textureSlotMask &= ~(1u << index);
		}
	}
Example #16
0
// Calls ProcessStoreTileBE once for every attachment whose bit is set in the
// descriptor's attachmentMask, lowest bit first. pData is interpreted as a
// STORE_TILES_DESC.
void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
{
    STORE_TILES_DESC *pStoreDesc = (STORE_TILES_DESC*)pData;
    uint32_t remaining = pStoreDesc->attachmentMask;

    for (unsigned long attachment = 0; _BitScanForward(&attachment, remaining); )
    {
        // Retire the bit first so the next scan moves on to the following one.
        remaining &= ~(1 << attachment);
        ProcessStoreTileBE(pDC, workerId, macroTile, pStoreDesc, (SWR_RENDERTARGET_ATTACHMENT)attachment);
    }
}
Example #17
0
/*
 * Count trailing zero bits of val.
 *
 * Returns 32 when val is 0; otherwise the index of the lowest set bit of
 * val after conversion to DWORD.
 */
CPU_DATA  CPU_CntTrailZeros (CPU_DATA  val)
{
    DWORD  bit_pos;


    if (val != 0u) {
        _BitScanForward(&bit_pos, (DWORD)val);
        return ((CPU_DATA)bit_pos);
    }

    return (32u);
}
Example #18
0
    //////////////////////////////////////////////////////////////////////////
    // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
    //        packing the active mask bits
    //        ex. bitmask 0011 -> (0, 1, 0, 0)
    //            bitmask 1000 -> (3, 0, 0, 0)
    //            bitmask 1100 -> (2, 3, 0, 0)
    // @param bitmask - mask whose set-bit positions are packed, lowest first.
    //        Only the first four set bits are used; unused lanes stay 0.
    Value* PackMask(uint32_t bitmask)
    {
        std::vector<Constant*> indices(4, C(0));
        DWORD index;
        uint32_t elem = 0;
        // Stop once all four lanes are filled so a mask with more than four
        // set bits cannot write past the end of `indices`.
        while (elem < indices.size() && _BitScanForward(&index, bitmask))
        {
            indices[elem++] = C((int)index);
            // Unsigned literal: `1 << 31` would shift into the sign bit of int.
            bitmask &= ~(1u << index);
        }

        return ConstantVector::get(indices);
    }
Example #19
0
  // Returns the index of the lowest set bit of `word`.
  //
  // `word` must be non-zero. The scan is performed outside of assert() so
  // that it still happens when NDEBUG strips the assertion, and compilers
  // other than GCC/Clang/MSVC get a portable loop instead of falling off
  // the end of the function.
  static inline int count_trailing_zeros(word_t word) {
#if defined(__GNUC__)
    return __builtin_ctzl(word);
#elif defined(_MSC_VER)
    unsigned long index = 0;
#  if defined(_M_AMD64)
    const unsigned char found = _BitScanForward64(&index, word);
#  else
    const unsigned char found = _BitScanForward(&index, word);
#  endif
    assert(found != 0);
    (void)found;  // unused when NDEBUG removes the assert
    return static_cast<int>(index);
#else
    assert(word != 0);
    const int bits = static_cast<int>(sizeof(word_t) * 8);
    int index = 0;
    // Bounded so that a zero word terminates even in release builds.
    while (index < bits && (word & 1) == 0) {
      word >>= 1;
      ++index;
    }
    return index;
#endif
  }
Example #20
0
// Returns the zero-based index of the lowest set bit of i, or -1 when i is 0.
//
// The -1 result for zero matches the retired inline assembly kept below for
// reference. The intrinsic leaves its output undefined for a zero mask, so
// its return value is checked. Non-MSVC builds use an equivalent loop.
int32_t BitScanF(uint32_t i)
{
#if defined(_MSC_VER)
  unsigned long result;
  if (!_BitScanForward(&result, i))
    return -1;
  return static_cast<int32_t>(result);
#else
  if (i == 0)
    return -1;
  int32_t result = 0;
  while (((i >> result) & 1u) == 0)
    ++result;
  return result;
#endif

  /*	_asm
	{
		xor		edx,edx
		bsf		eax,[i]
		setnz	dl
		dec		edx
		or		eax,edx
	}
*/
}
Example #21
0
/// <summary>
/// Single step exception handler.
/// Looks at the lowest set bit of Dr6; when it is one of the first four bits
/// and the faulting address is a registered breakpoint, the matching Dr7
/// enable bit is cleared and execution is redirected to the owning detour's
/// internal handler.
/// </summary>
/// <param name="excpt">Exception information</param>
/// <returns>
/// EXCEPTION_CONTINUE_EXECUTION when the exception was redirected,
/// EXCEPTION_CONTINUE_SEARCH otherwise
/// </returns>
LONG NTAPI DetourBase::StepHandler( PEXCEPTION_POINTERS excpt )
{
    DWORD slot = 0;
    const int hit = _BitScanForward( &slot, static_cast<DWORD>(excpt->ContextRecord->Dr6) );

    // Not ours: no status bit set, bit outside the first four, or unknown address
    if (hit == 0 || slot >= 4 || !_breakpoints.count( excpt->ExceptionRecord->ExceptionAddress ))
        return EXCEPTION_CONTINUE_SEARCH;

    DetourBase* pDetour = _breakpoints[excpt->ExceptionRecord->ExceptionAddress];

    // Disable breakpoint at current index
    BitTestAndResetT( (LONG_PTR*)&excpt->ContextRecord->Dr7, 2 * slot );

    // Hand the instance over through the TIB and jump to its handler
    ((_NT_TIB*)NtCurrentTeb())->ArbitraryUserPointer = (void*)pDetour;
    excpt->ContextRecord->NIP = (uintptr_t)pDetour->_internalHandler;

    return EXCEPTION_CONTINUE_EXECUTION;
}
Example #22
0
//Count the zero bits below the lowest set bit of value.
//Deliberately undefined if value == 0 (asserted in debug builds).
inline unsigned countTrailingUnsetBits(unsigned value)
{
    dbgassertex(value != 0);
#if defined(__GNUC__)
    return __builtin_ctz(value);
#elif defined (_WIN32)
    unsigned long lowest;
    _BitScanForward(&lowest, value);
    return (unsigned)lowest;
#else
    //Portable fallback: probe each bit in turn, starting from bit 0
    const unsigned bitWidth = sizeof(unsigned)*8;
    unsigned pos = 0;
    while (pos < bitWidth && !(value & (1U << pos)))
        ++pos;
    return pos;
#endif
}
/*
 * Counts the trailing zero bits of *x, shifts *x right by that count (so its
 * lowest set bit ends up at bit 0) and returns the count.
 *
 * The implementation is selected by the HAVE_* configuration macros; the
 * last branch is a plain shift loop.
 *
 * NOTE(review): the line carrying the return type is not visible in this
 * excerpt - confirm the full declaration against the original file.
 * NOTE(review): *x == 0 looks unsupported - the shift loop below would never
 * terminate for it; confirm callers never pass zero.
 */
count_zeroes(size_t *x)
{
  int result;
#if defined(HAVE_BUILTIN_CTZL)
  result = __builtin_ctzl(*x);
  *x >>= result;
#elif defined(HAVE_BITSCANFORWARD64)
  /* NOTE(review): result is an int, while the intrinsic is documented to
     take an unsigned long* - confirm this compiles without a warning. */
  _BitScanForward64(&result, *x);
  *x >>= result;
#elif defined(HAVE_BITSCANFORWARD)
  /* NOTE(review): same pointer-type question as above; also presumably only
     the low 32 bits of *x are scanned here - confirm. */
  _BitScanForward(&result, *x);
  *x >>= result;
#else
  /* Portable fallback: shift one bit at a time until the low bit is set. */
  result = 0;
  while ((*x & 1) == 0) {
    ++result;
    *x >>= 1;
  }
#endif
  return result;
}
Example #24
0
// Back-end clear handler for one macro tile.
//
// pUserData is interpreted as a CLEAR_DESC. Two strategies exist, selected by
// KNOB_FAST_CLEAR:
//  - fast clear: the hot tile of every attachment named in attachmentMask is
//    fetched, its clearData is filled in and its state is set to
//    HOTTILE_CLEAR;
//  - legacy clear: the clear-tiles function registered in gClearTilesTable
//    for the colour / depth / stencil hot tile format is called directly.
// Colour attachments are visited lowest bit first.
void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
    SWR_CONTEXT* pContext           = pDC->pContext;
    HANDLE       hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    if (KNOB_FAST_CLEAR)
    {
        CLEAR_DESC*           pClear      = (CLEAR_DESC*)pUserData;
        SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
        uint32_t              numSamples  = GetNumSamples(sampleCount);

        SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.

        RDTSC_BEGIN(BEClear, pDC->drawId);

        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
        {
            unsigned long rt   = 0;
            uint32_t      mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
            while (_BitScanForward(&rt, mask))
            {
                mask &= ~(1 << rt);

                HOTTILE* pHotTile =
                    pContext->pHotTileMgr->GetHotTile(pContext,
                                                      pDC,
                                                      hWorkerPrivateData,
                                                      macroTile,
                                                      (SWR_RENDERTARGET_ATTACHMENT)rt,
                                                      true,
                                                      numSamples,
                                                      pClear->renderTargetArrayIndex);

                // All we want to do here is to mark the hot tile as being in a "needs clear" state.
                // The clear colour components are copied bit-for-bit as 32-bit words.
                pHotTile->clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
                pHotTile->clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
                pHotTile->clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
                pHotTile->clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);
                pHotTile->state        = HOTTILE_CLEAR;
            }
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
        {
            HOTTILE* pHotTile      = pContext->pHotTileMgr->GetHotTile(pContext,
                                                                  pDC,
                                                                  hWorkerPrivateData,
                                                                  macroTile,
                                                                  SWR_ATTACHMENT_DEPTH,
                                                                  true,
                                                                  numSamples,
                                                                  pClear->renderTargetArrayIndex);
            pHotTile->clearData[0] = *(uint32_t*)&pClear->clearDepth;
            pHotTile->state        = HOTTILE_CLEAR;
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
        {
            HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext,
                                                                  pDC,
                                                                  hWorkerPrivateData,
                                                                  macroTile,
                                                                  SWR_ATTACHMENT_STENCIL,
                                                                  true,
                                                                  numSamples,
                                                                  pClear->renderTargetArrayIndex);

            pHotTile->clearData[0] = pClear->clearStencil;
            pHotTile->state        = HOTTILE_CLEAR;
        }

        RDTSC_END(BEClear, 1);
    }
    else
    {
        // Legacy clear
        CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData;
        RDTSC_BEGIN(BEClear, pDC->drawId);

        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
        {
            uint32_t clearData[4];
            clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
            clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
            clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
            clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);

            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
            SWR_ASSERT(pfnClearTiles != nullptr);

            unsigned long rt   = 0;
            uint32_t      mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
            while (_BitScanForward(&rt, mask))
            {
                mask &= ~(1 << rt);

                pfnClearTiles(pDC,
                              hWorkerPrivateData,
                              (SWR_RENDERTARGET_ATTACHMENT)rt,
                              macroTile,
                              pClear->renderTargetArrayIndex,
                              clearData,
                              pClear->rect);
            }
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
        {
            // Zero-initialised so the three unused words are not passed
            // to the clear function with indeterminate values.
            uint32_t clearData[4]         = {};
            clearData[0]                  = *(uint32_t*)&pClear->clearDepth;
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC,
                          hWorkerPrivateData,
                          SWR_ATTACHMENT_DEPTH,
                          macroTile,
                          pClear->renderTargetArrayIndex,
                          clearData,
                          pClear->rect);
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
        {
            // Zero-initialised for the same reason as the depth path.
            uint32_t clearData[4]         = {};
            clearData[0]                  = pClear->clearStencil;
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
            // Same null check as the colour and depth paths.
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC,
                          hWorkerPrivateData,
                          SWR_ATTACHMENT_STENCIL,
                          macroTile,
                          pClear->renderTargetArrayIndex,
                          clearData,
                          pClear->rect);
        }

        RDTSC_END(BEClear, 1);
    }
}
Example #25
0
// Index of the lowest set bit of v, i.e. the number of trailing zero bits.
// The scan writes nothing when v is zero, so the result is then
// indeterminate.
inline bitcount_t trailingzeros(uint32_t v)
{
    unsigned long lowest_set_bit;
    (void)_BitScanForward(&lowest_set_bit, v);
    return lowest_set_bit;
}
Example #26
0
/*
 * wcsstr - find the first occurrence of the wide string wcs2 inside wcs1.
 *
 * Returns a pointer to the start of the first match in wcs1, wcs1 itself
 * when wcs2 is empty, or NULL when wcs2 does not occur in wcs1.
 *
 * Three implementations are selected at run time through __isa_available:
 *   - above __ISA_AVAILABLE_SSE2: packed string-compare intrinsics
 *     (_mm_cmpistr*) driven by f_srch_sub;
 *   - exactly __ISA_AVAILABLE_SSE2: 16-bit compare + movemask, with
 *     _BitScanForward locating the first interesting character;
 *   - otherwise: a plain character-by-character search.
 *
 * Every 16-byte load is guarded by XMM_PAGE_SAFE; when the guard fails the
 * code falls back to single-character handling for that position
 * (presumably to avoid reading across a page boundary - confirm against the
 * macro's definition).
 */
wchar_t * __cdecl wcsstr (
        const wchar_t * wcs1,
        const wchar_t * wcs2
        )
{
    const wchar_t *stmp1, *stmp2;
    __m128i zero, pattern, characters1, characters2;

    // An empty search string matches everything.
    if (0 == *wcs2)
        return (wchar_t *)wcs1;

    if (__isa_available > __ISA_AVAILABLE_SSE2)
    {
        wchar_t c;
        unsigned i;

        // Load XMM with first characters of wcs2.
        if (XMM_PAGE_SAFE(wcs2))
        {
            pattern = _mm_loadu_si128((__m128i*)wcs2);
        }
        else
        {
            // Build the pattern one character at a time: each step shifts
            // the register down by one wchar_t and inserts the next
            // character at the top; once the terminator is reached it keeps
            // inserting zeros.
            // NOTE(review): pattern has not been assigned on this path
            // before it is XOR-ed with itself to clear it.
            pattern = _mm_xor_si128(pattern, pattern);
            c = *(stmp2 = wcs2);
            for (i = 0; i < XMM_CHARS; ++i)
            {
                pattern = _mm_srli_si128(pattern, sizeof(wchar_t));
                pattern = _mm_insert_epi16(pattern, c, (XMM_CHARS-1));
                if (0 != c) c = *++stmp2;
            }
        }

        for(;;)
        {
            // Check for partial match, if none step forward and continue.
            if (XMM_PAGE_SAFE(wcs1))
            {
                characters1 = _mm_loadu_si128((__m128i*)wcs1);
                // If no potential match or end found, try next XMMWORD.
                if (_mm_cmpistra(pattern, characters1, f_srch_sub))
                {
                    wcs1 += XMM_CHARS;
                    continue;
                }
                // If end found there was no match.
                else if (!_mm_cmpistrc(pattern, characters1, f_srch_sub))
                {
                    return NULL;
                }

                // Get position of potential match.
                wcs1 += _mm_cmpistri(pattern, characters1, f_srch_sub);
            }
            else
            {
              // If end of string found there was no match.
              if (0 == *wcs1)
              {
                  return NULL;
              }

              // If current character doesn't match first character
              // of search string try next character.
              if (*wcs1 != *wcs2)
              {
                  ++wcs1;
                  continue;
              }
            }

            // Potential match, compare to check for full match.
            stmp1 = wcs1;
            stmp2 = wcs2;
            for (;;)
            {
                // If next XMMWORD is page-safe for each string
                // do a XMMWORD comparison.
                if (XMM_PAGE_SAFE(stmp1) && XMM_PAGE_SAFE(stmp2))
                {
                    characters1 = _mm_loadu_si128((__m128i*)stmp1);
                    characters2 = _mm_loadu_si128((__m128i*)stmp2);

                    // If unequal then no match found.
                    if (!_mm_cmpistro(characters2, characters1, f_srch_sub))
                    {
                        break;
                    }

                    // If end of search string then match found.
                    else if (_mm_cmpistrs(characters2, characters1, f_srch_sub))
                    {
                        return (wchar_t *)wcs1;
                    }

                    stmp1 += XMM_CHARS;
                    stmp2 += XMM_CHARS;
                    continue;
                }

                // Compare next character.
                else
                {
                    // If end of search string then match found.
                    if (0 == *stmp2)
                    {
                        return (wchar_t *)wcs1;
                    }

                    // If unequal then no match found.
                    if (*stmp1 != *stmp2)
                    {
                        break;
                    }

                    // Character matched - try next character.
                    ++stmp1;
                    ++stmp2;
                }
            }

            // Match not found at current position, try next.
            ++wcs1;
        }
    }
    else if (__isa_available == __ISA_AVAILABLE_SSE2)
    {
        // NOTE(review): offset is declared as plain `unsigned` but its
        // address is handed to _BitScanForward below, which is documented
        // to take an unsigned long* - confirm the types agree.
        unsigned offset, mask;

        // Build search pattern and zero pattern. Search pattern is
        // XMMWORD with the initial character of the search string
        // in every position. Zero pattern has a zero termination
        // character in every position.

        pattern = _mm_cvtsi32_si128(wcs2[0]);
        pattern = _mm_shufflelo_epi16(pattern, 0);
        pattern = _mm_shuffle_epi32(pattern, 0);
        // NOTE(review): zero has not been assigned before it is XOR-ed
        // with itself to clear it.
        zero = _mm_xor_si128(zero, zero);

        // Main loop for searching wcs1.

        for (;;)
        {
            // If XMM check is safe advance wcs1 to the next
            // possible match or end.

            if (XMM_PAGE_SAFE(wcs1))
            {
                characters1 = _mm_loadu_si128((__m128i*)wcs1);
                characters2 = _mm_cmpeq_epi16(characters1, zero);
                characters1 = _mm_cmpeq_epi16(characters1, pattern);
                characters1 = _mm_or_si128(characters1, characters2);
                mask = _mm_movemask_epi8(characters1);

                // If no character match or end found try next XMMWORD.

                if (0 == mask)
                {
                    wcs1 += XMM_CHARS;
                    continue;
                }

                // Advance wcs1 pointer to next possible match or end.
                // The mask carries one bit per byte, hence the division by
                // sizeof(wchar_t) to turn the bit index into a character
                // count.

                _BitScanForward(&offset, mask);
                wcs1 += (offset/sizeof(wchar_t));
            }

            // If at the end of wcs1, then no match found.

            if (0 == wcs1[0]) return NULL;

            // If a first-character match is found compare
            // strings to look for match.

            if (wcs2[0] == wcs1[0])
            {
                stmp1 = wcs1;
                stmp2 = wcs2;
                for (;;)
                {
                    // If aligned as specified advance to next
                    // possible difference or wcs2 end.

                    if (XMM_PAGE_SAFE(stmp2) && XMM_PAGE_SAFE(stmp1))
                    {
                        // After these steps a lane of characters1 is set
                        // where the two strings differ or where stmp2 holds
                        // a terminator.
                        characters1 = _mm_loadu_si128((__m128i*)stmp1);
                        characters2 = _mm_loadu_si128((__m128i*)stmp2);
                        characters1 = _mm_cmpeq_epi16(characters1, characters2);
                        characters2 = _mm_cmpeq_epi16(characters2, zero);
                        characters1 = _mm_cmpeq_epi16(characters1, zero);
                        characters1 = _mm_or_si128(characters1, characters2);
                        mask = _mm_movemask_epi8(characters1);

                        // If mask is zero there is no difference and
                        // wcs2 does not end in this XMMWORD. Continue
                        // with next XMMWORD.

                        if (0 == mask)
                        {
                            stmp1 += XMM_CHARS;
                            stmp2 += XMM_CHARS;
                            continue;
                        }

                        // Advance string pointers to next significant
                        // character.

                        _BitScanForward(&offset, mask);
                        stmp1 += (offset/sizeof(wchar_t));
                        stmp2 += (offset/sizeof(wchar_t));
                    }

                    // If we've reached the end of wcs2 then a match
                    // has been found.

                    if (0 == stmp2[0]) return (wchar_t *)wcs1;

                    // If we've reached a difference then no match
                    // was found.

                    if (stmp1[0] != stmp2[0]) break;

                    // Otherwise advance to next character and try
                    // again.

                    ++stmp1;
                    ++stmp2;
                }
            }

            // Current character wasn't a match, try next character.

            ++wcs1;
        }
    }
    else
    {
        // Scalar fallback: try every start position in turn.
        const wchar_t *cp = wcs1;
        const wchar_t *s1, *s2;

        while (*cp)
        {
            s1 = cp;
            s2 = wcs2;

            // Walk both strings while they agree and neither has ended.
            while ( *s1 && *s2 && !(*s1-*s2) )
                s1++, s2++;

            // Reaching the end of wcs2 means the whole pattern matched.
            if (!*s2)
                return (wchar_t *) cp;

            cp++;
        }

        return NULL;
    }
}
// Generates the mip levels below level 0 of the texture referenced by
// pTexView with compute dispatches recorded into Ctx.
//
// The texture is transitioned to the unordered-access state and the chain is
// produced in batches of up to four levels per dispatch. For every batch one
// SRV (the texture's default shader-resource view) and NumMips UAV
// descriptors are copied into a freshly allocated dynamic GPU-visible range.
void GenerateMipsHelper::GenerateMips(RenderDeviceD3D12Impl *pRenderDeviceD3D12, TextureViewD3D12Impl *pTexView, CommandContext& Ctx)
{
    auto &ComputeCtx = Ctx.AsComputeContext();
    ComputeCtx.SetRootSignature(m_pGenerateMipsRS);
    auto *pTexture = pTexView->GetTexture();
    auto *pTexD3D12 = ValidatedCast<TextureD3D12Impl>( pTexture );
    auto &TexDesc = pTexture->GetDesc();
    auto *pSRV = pTexture->GetDefaultView(TEXTURE_VIEW_SHADER_RESOURCE);
    auto *pSRVD3D12Impl = ValidatedCast<TextureViewD3D12Impl>(pSRV);
    auto SRVDescriptorHandle = pSRVD3D12Impl->GetCPUDescriptorHandle();

    Ctx.TransitionResource(pTexD3D12, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
    auto *pd3d12Device = pRenderDeviceD3D12->GetD3D12Device();

    for (uint32_t TopMip = 0; TopMip < TexDesc.MipLevels-1; )
    {
        uint32_t SrcWidth = TexDesc.Width >> TopMip;
        uint32_t SrcHeight = TexDesc.Height >> TopMip;
        uint32_t DstWidth = SrcWidth >> 1;
        uint32_t DstHeight = SrcHeight >> 1;

        // Determine if the first downsample is more than 2:1.  This happens whenever
        // the source width or height is odd.
        uint32_t NonPowerOfTwo = (SrcWidth & 1) | (SrcHeight & 1) << 1;
        if (TexDesc.Format == TEX_FORMAT_RGBA8_UNORM_SRGB)
            ComputeCtx.SetPipelineState(m_pGenerateMipsGammaPSO[NonPowerOfTwo]);
        else
            ComputeCtx.SetPipelineState(m_pGenerateMipsLinearPSO[NonPowerOfTwo]);

        // We can downsample up to four times, but if the ratio between levels is not
        // exactly 2:1, we have to shift our blend weights, which gets complicated or
        // expensive.  Maybe we can update the code later to compute sample weights for
        // each successive downsample.  We use _BitScanForward to count number of zeros
        // in the low bits.  Zeros indicate we can divide by two without truncating.
        //
        // The intrinsic writes through an `unsigned long*` and leaves the index
        // undefined when the mask is zero, so a correctly typed local is used and
        // the "no bit set" case falls back to zero additional mips.
        unsigned long TrailingZeros = 0;
        if (!_BitScanForward(&TrailingZeros, DstWidth | DstHeight))
            TrailingZeros = 0;
        uint32_t AdditionalMips = static_cast<uint32_t>(TrailingZeros);
        uint32_t NumMips = 1 + (AdditionalMips > 3 ? 3 : AdditionalMips);
        if (TopMip + NumMips > TexDesc.MipLevels-1)
            NumMips = TexDesc.MipLevels-1 - TopMip;

        // These are clamped to 1 after computing additional mips because clamped
        // dimensions should not limit us from downsampling multiple times.  (E.g.
        // 16x1 -> 8x1 -> 4x1 -> 2x1 -> 1x1.)
        if (DstWidth == 0)
            DstWidth = 1;
        if (DstHeight == 0)
            DstHeight = 1;

        // One SRV plus up to four UAVs per batch.
        D3D12_DESCRIPTOR_HEAP_TYPE HeapType = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
        auto DescriptorAlloc = Ctx.AllocateDynamicGPUVisibleDescriptor(HeapType, 5);
        CommandContext::ShaderDescriptorHeaps Heaps(DescriptorAlloc.GetDescriptorHeap());
        ComputeCtx.SetDescriptorHeaps(Heaps);
        Ctx.GetCommandList()->SetComputeRootDescriptorTable(1, DescriptorAlloc.GetGpuHandle(0));
        Ctx.GetCommandList()->SetComputeRootDescriptorTable(2, DescriptorAlloc.GetGpuHandle(1));
        struct RootCBData
        {
            Uint32 SrcMipLevel;	    // Texture level of source mip
            Uint32 NumMipLevels;	// Number of OutMips to write: [1, 4]
            float TexelSize[2];	    // 1.0 / OutMip1.Dimensions
        } CBData = {TopMip, NumMips, 1.0f / static_cast<float>(DstWidth), 1.0f / static_cast<float>(DstHeight)};
        Ctx.GetCommandList()->SetComputeRoot32BitConstants(0, 4, &CBData, 0);

        // TODO: Shouldn't we transition top mip to shader resource state?
        // Slot 0 of the destination range receives the SRV, slots 1..NumMips the
        // UAVs of the levels written by this batch.
        D3D12_CPU_DESCRIPTOR_HANDLE DstDescriptorRange = DescriptorAlloc.GetCpuHandle();
        UINT DstRangeSize = 1+NumMips;
        D3D12_CPU_DESCRIPTOR_HANDLE SrcDescriptorRanges[5] = {};
        SrcDescriptorRanges[0] = SRVDescriptorHandle;
        UINT SrcRangeSizes[5] = {1,1,1,1,1};
        for(Uint32 u=0; u < NumMips; ++u)
            SrcDescriptorRanges[1+u] = pTexD3D12->GetUAVDescriptorHandle(TopMip+u+1, 0);

        pd3d12Device->CopyDescriptors(1, &DstDescriptorRange, &DstRangeSize, 1+NumMips, SrcDescriptorRanges, SrcRangeSizes, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);

        // Thread groups cover 8x8 destination texels; round up.
        ComputeCtx.Dispatch((DstWidth+7)/8, (DstHeight+7)/8);

        Ctx.InsertUAVBarrier(*pTexD3D12, *pTexD3D12);

        TopMip += NumMips;
    }
}
Example #28
0
// Stores in *index the zero-based position of the least significant set bit
// of `mask`.
//
// index - receives the bit position; must not be null.
// mask  - value to scan. It is only read, never modified, despite being
//         taken by non-const reference.
//
// The intrinsic's "found a bit" return value is discarded, so callers must
// make sure `mask` is non-zero; per the intrinsic's documentation *index is
// undefined otherwise.
void inline BSF( unsigned long* index, size_t& mask )
{
#if defined(_WIN64)
	// size_t is 64 bits wide on Win64, while _BitScanForward only takes a
	// 32-bit mask. Scanning with the 64-bit intrinsic keeps bits 32..63
	// from being silently truncated away.
	_BitScanForward64( index, mask );
#else
	_BitScanForward( index, static_cast<unsigned long>( mask ) );
#endif
}
Example #29
0
// wcschr - find the first occurrence of a wide character in a string.
//
// str - null-terminated wide string to search.
// ch  - character to look for. Searching for the terminating zero itself
//       returns a pointer to the terminator.
//
// Returns a pointer to the first element equal to ch, or NULL when the
// terminator is reached first.
//
// When SSE2 is not available the string is walked one character at a time.
// Otherwise it is scanned one XMMWORD at a time, using 16-bit lane compares,
// and falls back to single-character steps whenever XMM_PAGE_SAFE reports
// that a full XMMWORD load is not safe at the current position.
wchar_t * __cdecl wcschr (
        const wchar_t * str,
        wchar_t ch
        )
{
    if (__isa_available < __ISA_AVAILABLE_SSE2)
    {
        while (*str && *str != ch)
                str++;

        // If the character is a match return pointer, otherwise
        // it must be the terminating zero and return NULL.
        return (*str == ch) ? (wchar_t *)str : NULL;
    }
    else
    {
        __m128i match, characters, temp;
        unsigned mask;
        unsigned long offset;

        // Build match pattern with target character in every position:
        // place ch in the low lane, replicate it across the low four
        // 16-bit lanes, then replicate that dword across the register.

        match = _mm_cvtsi32_si128(ch);
        match = _mm_shufflelo_epi16(match, 0);
        match = _mm_shuffle_epi32(match, 0);

        for (;;)
        {
            // If the next XMMWORD does not overlap a page boundary check
            // it for match of character or zero.

            if (XMM_PAGE_SAFE(str))
            {
                // Check for match with either the search or zero character.
                // There may be more than one match, but only the first is
                // significant.

                characters = _mm_loadu_si128((__m128i*)str);

                // Compare against an explicit zero vector. (XOR-ing a
                // variable with itself before it has been assigned would
                // read an uninitialized value.)
                temp = _mm_cmpeq_epi16(_mm_setzero_si128(), characters);
                characters = _mm_cmpeq_epi16(characters, match);
                temp = _mm_or_si128(temp, characters);
                mask = _mm_movemask_epi8(temp);

                // If one or more matches was found, get the position of
                // the first one. If that character is the same as the
                // search character return the pointer to it, otherwise
                // it must be the terminating zero so return NULL.

                if (mask != 0)
                {
                    // The mask carries one bit per byte, so the index of
                    // its lowest set bit is a byte offset; it is therefore
                    // added to the address as an integer rather than to
                    // the wchar_t pointer.
                    _BitScanForward(&offset, mask);
                    str = (wchar_t *)(offset + (intptr_t)str);
                    return (*str == ch) ? (wchar_t *)str : NULL;
                }

                // No match found in this XMMWORD so skip to next.

                str += XMM_CHARS;
            }
            else
            {
                // If it is not safe to check an entire XMMWORD, check
                // a single character and try again.

                if (*str == ch) return (wchar_t *)str;
                if (*str == 0) return NULL;

                // No match so skip to next character.

                ++str;
            }
        }
    }
}
Example #30
0
	void StateManager::Apply()
	{
		if (!m_blendStates.empty())
		{
			if (m_currentBlendState != m_blendStates.top().get())
			{
				m_currentBlendState = (ID3D11BlendState*)m_blendStates.top().get();
				D3D::context->OMSetBlendState(m_currentBlendState, nullptr, 0xFFFFFFFF);
			}
		}
		else ERROR_LOG(VIDEO, "Tried to apply without blend state!");

		if (!m_depthStates.empty())
		{
			if (m_currentDepthState != m_depthStates.top().get())
			{
				m_currentDepthState = (ID3D11DepthStencilState*)m_depthStates.top().get();
				D3D::context->OMSetDepthStencilState(m_currentDepthState, 0);
			}
		}
		else ERROR_LOG(VIDEO, "Tried to apply without depth state!");

		if (!m_rasterizerStates.empty())
		{
			if (m_currentRasterizerState != m_rasterizerStates.top().get())
			{
				m_currentRasterizerState = (ID3D11RasterizerState*)m_rasterizerStates.top().get();
				D3D::context->RSSetState(m_currentRasterizerState);
			}
		}
		else ERROR_LOG(VIDEO, "Tried to apply without rasterizer state!");

		if (!m_dirtyFlags)
		{
			return;
		}		

		if (m_dirtyFlags & DirtyFlag_Constants)
		{
			if (use_partial_buffer_update)
			{
				if (m_dirtyFlags & DirtyFlag_PixelConstants)
				{
					if (m_pending.pixelConstantsSize[0] == 0 && m_pending.pixelConstantsSize[1] == 0)
					{
						D3D::context->PSSetConstantBuffers(0,  m_pending.pixelConstants[1] ? 2 : 1, m_pending.pixelConstants);
					}
					else
					{
						D3D::context1->PSSetConstantBuffers1(0, 1, m_pending.pixelConstants, m_pending.pixelConstantsOffset, m_pending.pixelConstantsSize);
					}
					m_current.pixelConstants[0] = m_pending.pixelConstants[0];
					m_current.pixelConstantsOffset[0] = m_pending.pixelConstantsOffset[0];
					m_current.pixelConstantsSize[0] = m_pending.pixelConstantsSize[0];
					m_current.pixelConstants[1] = m_pending.pixelConstants[1];
					m_current.pixelConstantsOffset[1] = m_pending.pixelConstantsOffset[1];
					m_current.pixelConstantsSize[1] = m_pending.pixelConstantsSize[1];
				}
				if (m_dirtyFlags & DirtyFlag_VertexConstants)
				{
					if (m_pending.vertexConstantsSize == 0)
					{
						D3D::context1->VSSetConstantBuffers(0, 1, &m_pending.vertexConstants);
					}
					else
					{
						D3D::context1->VSSetConstantBuffers1(0, 1, &m_pending.vertexConstants, &m_pending.vertexConstantsOffset, &m_pending.vertexConstantsSize);
					}
					m_current.vertexConstants = m_pending.vertexConstants;
					m_current.vertexConstantsOffset = m_pending.vertexConstantsOffset;
					m_current.vertexConstantsSize = m_pending.vertexConstantsSize;
				}
				if (m_dirtyFlags & DirtyFlag_GeometryConstants)
				{
					if (m_pending.geometryConstantsSize == 0)
					{
						D3D::context->GSSetConstantBuffers(0, 1, &m_pending.geometryConstants);
					}
					else
					{
						D3D::context1->GSSetConstantBuffers1(0, 1, &m_pending.geometryConstants, &m_pending.geometryConstantsOffset, &m_pending.geometryConstantsSize);
					}
					m_current.geometryConstants = m_pending.geometryConstants;
					m_current.geometryConstantsOffset = m_pending.geometryConstantsOffset;
					m_current.geometryConstantsSize = m_pending.geometryConstantsSize;
				}
				if (m_dirtyFlags & DirtyFlag_HullDomainConstants)
				{
					if (m_pending.hulldomainConstantsSize == 0)
					{
						D3D::context->HSSetConstantBuffers(0, 1, &m_pending.hulldomainConstants);
						D3D::context->DSSetConstantBuffers(0, 1, &m_pending.hulldomainConstants);
					}
					else
					{
						D3D::context1->HSSetConstantBuffers1(0, 1, &m_pending.hulldomainConstants, &m_pending.hulldomainConstantsOffset, &m_pending.hulldomainConstantsSize);
						D3D::context1->DSSetConstantBuffers1(0, 1, &m_pending.hulldomainConstants, &m_pending.hulldomainConstantsOffset, &m_pending.hulldomainConstantsSize);
					}
					m_current.hulldomainConstants = m_pending.hulldomainConstants;
					m_current.hulldomainConstantsOffset = m_pending.hulldomainConstantsOffset;
					m_current.hulldomainConstantsSize = m_pending.hulldomainConstantsSize;
				}

			}
			else
			{
				if (m_dirtyFlags & DirtyFlag_PixelConstants)
				{
					D3D::context->PSSetConstantBuffers(0,  m_pending.pixelConstants[1] ? 2 : 1, m_pending.pixelConstants);
					m_current.pixelConstants[0] = m_pending.pixelConstants[0];
					m_current.pixelConstants[1] = m_pending.pixelConstants[1];
				}
				if (m_dirtyFlags & DirtyFlag_VertexConstants)
				{
					D3D::context->VSSetConstantBuffers(0, 1, &m_pending.vertexConstants);
					m_current.vertexConstants = m_pending.vertexConstants;
				}
				if (m_dirtyFlags & DirtyFlag_GeometryConstants)
				{
					D3D::context->GSSetConstantBuffers(0, 1, &m_pending.geometryConstants);
					m_current.geometryConstants = m_pending.geometryConstants;
				}
				if (m_dirtyFlags & DirtyFlag_HullDomainConstants)
				{
					if (g_ActiveConfig.backend_info.bSupportsTessellation)
					{
						D3D::context->HSSetConstantBuffers(0, 1, &m_pending.hulldomainConstants);
						D3D::context->DSSetConstantBuffers(0, 1, &m_pending.hulldomainConstants);
					}
					m_current.hulldomainConstants = m_pending.hulldomainConstants;
				}
			}
		}

		if (m_dirtyFlags & (DirtyFlag_Buffers | DirtyFlag_InputAssembler))
		{
			if (m_dirtyFlags & DirtyFlag_VertexBuffer)
			{
				D3D::context->IASetVertexBuffers(0, 1, &m_pending.vertexBuffer, &m_pending.vertexBufferStride, &m_pending.vertexBufferOffset);
				m_current.vertexBuffer = m_pending.vertexBuffer;
				m_current.vertexBufferStride = m_pending.vertexBufferStride;
				m_current.vertexBufferOffset = m_pending.vertexBufferOffset;
			}

			if (m_dirtyFlags & DirtyFlag_IndexBuffer)
			{
				D3D::context->IASetIndexBuffer(m_pending.indexBuffer, DXGI_FORMAT_R16_UINT, 0);
				m_current.indexBuffer = m_pending.indexBuffer;
			}

			if (m_current.topology != m_pending.topology)
			{
				D3D::context->IASetPrimitiveTopology(m_pending.topology);
				m_current.topology = m_pending.topology;
			}

			if (m_current.inputLayout != m_pending.inputLayout)
			{
				D3D::context->IASetInputLayout(m_pending.inputLayout);
				m_current.inputLayout = m_pending.inputLayout;
			}
		}
		u32 dirty_elements = m_dirtyFlags & DirtyFlag_Textures;
		if (dirty_elements)
		{
			while (dirty_elements)
			{
				unsigned long index;
				_BitScanForward(&index, dirty_elements);
				D3D::context->PSSetShaderResources(index, 1, &m_pending.textures[index]);
				D3D::context->DSSetShaderResources(index, 1, &m_pending.textures[index]);
				m_current.textures[index] = m_pending.textures[index];
				dirty_elements &= ~(1 << index);
			}
		}
		dirty_elements = (m_dirtyFlags & DirtyFlag_Samplers) >> 8;
		if (dirty_elements)
		{
			while (dirty_elements)
			{
				unsigned long index;
				_BitScanForward(&index, dirty_elements);
				D3D::context->PSSetSamplers(index, 1, &m_pending.samplers[index]);
				D3D::context->DSSetSamplers(index, 1, &m_pending.samplers[index]);
				m_current.samplers[index] = m_pending.samplers[index];
				dirty_elements &= ~(1 << index);
			}
		}

		if (m_dirtyFlags & DirtyFlag_Shaders)
		{
			if (m_current.pixelShader != m_pending.pixelShader)
			{
				D3D::context->PSSetShader(m_pending.pixelShader, nullptr, 0);
				m_current.pixelShader = m_pending.pixelShader;
			}

			if (m_current.vertexShader != m_pending.vertexShader)
			{
				D3D::context->VSSetShader(m_pending.vertexShader, nullptr, 0);
				m_current.vertexShader = m_pending.vertexShader;
			}

			if (m_current.geometryShader != m_pending.geometryShader)
			{
				D3D::context->GSSetShader(m_pending.geometryShader, nullptr, 0);
				m_current.geometryShader = m_pending.geometryShader;
			}
			if (g_ActiveConfig.backend_info.bSupportsTessellation)
			{
				if (m_current.hullShader != m_pending.hullShader)
				{
					D3D::context->HSSetShader(m_pending.hullShader, nullptr, 0);
					m_current.hullShader = m_pending.hullShader;
				}

				if (m_current.domainShader != m_pending.domainShader)
				{
					D3D::context->DSSetShader(m_pending.domainShader, nullptr, 0);
					m_current.domainShader = m_pending.domainShader;
				}
			}
		}

		m_dirtyFlags = 0;
	}