Ejemplo n.º 1
0
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    {
        char threadName[64];
        sprintf_s(threadName,
#if defined(_WIN32)
                  "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
#else
                  // linux pthread name limited to 16 chars (including \0)
                  "w%03d-n%d-c%03d-t%d",
#endif
            workerId, pThreadData->numaId, pThreadData->coreId, pThreadData->htId);
        SetCurrentThreadName(threadName);
    }

    RDTSC_INIT(threadId);

    // Only need offset numa index from base for correct masking
    uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find its already
    // locked then we'll add it to this list so that we don't try and lock it again.
    TileSet lockedTiles;

    // each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. the data associated
    // with a draw is guaranteed to be active as long as a worker hasn't signaled that he 
    // has moved on to the next draw when he determines there is no more work to do. The api
    // thread will not increment the head of the dc ring until all workers have moved past the
    // current head.
    // the logic to determine what to work on is:
    // 1- try to work on the FE any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process in parallel.  Eventually
    //    we'll need dependency tracking to force serialization on FEs.  The worker will try
    //    to pick an FE by atomically incrementing a counter in the swr context.  he'll keep
    //    trying until he reaches the tail.
    // 2- BE work must be done in strict order. we accomplish this today by pulling work off
    //    the oldest draw (ie the head) of the dcRing. the worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of completed
    //    work items. If they are equal, then there is no more work to do for this draw, and
    //    the worker can safely increment its oldestDraw counter and move on to the next draw.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    uint32_t curDrawBE = 0;
    uint32_t curDrawFE = 0;

    bool bShutdown = false;

    while (true)
    {
        if (bShutdown && !threadHasWork(curDrawBE))
        {
            break;
        }

        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();
        }

        if (IsBEThread)
        {
            RDTSC_BEGIN(WorkerWorkOnFifoBE, 0);
            bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            RDTSC_END(WorkerWorkOnFifoBE, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            if (!IsBEThread)
            {
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}
Ejemplo n.º 2
0
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any BE work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
///                      own set and each time it fails to lock a macrotile, because its already locked,
///                      then it will add that tile to the lockedTiles set. As a worker begins to work
///                      on future draws the lockedTiles ensure that it doesn't work on tiles that may
///                      still have work pending in a previous draw. Additionally, the lockedTiles is
///                      hueristic that can steer a worker back to the same macrotile that it had been
///                      working on in a previous draw.
/// @returns        true if worker thread should shutdown
bool WorkOnFifoBE(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint32_t &curDrawBE,
    TileSet& lockedTiles,
    uint32_t numaNode,
    uint32_t numaMask)
{
    bool bShutdown = false;

    // Find the first incomplete draw that has pending work. If no such draw is found then
    // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return false;
    }

    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;

    // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
    lockedTiles.clear();

    // Try to work on each draw in order of the available draws in flight.
    //   1. If we're on curDrawBE, we can work on any macrotile that is available.
    //   2. If we're trying to work on draws after curDrawBE, we are restricted to 
    //      working on those macrotiles that are known to be complete in the prior draw to
    //      maintain order. The locked tiles provides the history to ensures this.
    for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];

        if (pDC->isCompute) return false; // We don't look at compute work.

        // First wait for FE to be finished with this draw. This keeps threading model simple
        // but if there are lots of bubbles between draws then serializing FE and BE may
        // need to be revisited.
        if (!pDC->doneFE) return false;
        
        // If this draw is dependent on a previous draw then we need to bail.
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return false;
        }

        // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
        auto &macroTiles = pDC->pTileMgr->getDirtyTiles();

        for (auto tile : macroTiles)
        {
            uint32_t tileID = tile->mId;

            // Only work on tiles for this numa node
            uint32_t x, y;
            pDC->pTileMgr->getTileIndices(tileID, x, y);
            if (((x ^ y) & numaMask) != numaNode)
            {
                continue;
            }

            if (!tile->getNumQueued())
            {
                continue;
            }

            // can only work on this draw if it's not in use by other threads
            if (lockedTiles.find(tileID) != lockedTiles.end())
            {
                continue;
            }

            if (tile->tryLock())
            {
                BE_WORK *pWork;

                RDTSC_BEGIN(WorkerFoundWork, pDC->drawId);

                uint32_t numWorkItems = tile->getNumQueued();
                SWR_ASSERT(numWorkItems);

                pWork = tile->peek();
                SWR_ASSERT(pWork);
                if (pWork->type == DRAW)
                {
                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
                }
                else if (pWork->type == SHUTDOWN)
                {
                    bShutdown = true;
                }

                while ((pWork = tile->peek()) != nullptr)
                {
                    pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
                    tile->dequeue();
                }
                RDTSC_END(WorkerFoundWork, numWorkItems);

                _ReadWriteBarrier();

                pDC->pTileMgr->markTileComplete(tileID);

                // Optimization: If the draw is complete and we're the last one to have worked on it then
                // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
                if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
                {
                    // We can increment the current BE and safely move to next draw since we know this draw is complete.
                    curDrawBE++;
                    CompleteDrawContextInl(pContext, workerId, pDC);

                    lastRetiredDraw++;

                    lockedTiles.clear();
                    break;
                }

                if (bShutdown)
                {
                    break;
                }
            }
            else
            {
                // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
                lockedTiles.insert(tileID);
            }
        }
    }

    return bShutdown;
}
Ejemplo n.º 3
0
//////////////////////////////////////////////////////////////////////////
/// @brief InitializeHotTiles
/// for draw calls, we initialize the active hot tiles and perform deferred
/// load on them if tile is in invalid state. we do this in the outer thread
/// loop instead of inside the draw routine itself mainly for performance,
/// to avoid unnecessary setup every triangle
/// @todo support deferred clear
/// @param pCreateInfo - pointer to creation info.
void HotTileMgr::InitializeHotTiles(SWR_CONTEXT*  pContext,
                                    DRAW_CONTEXT* pDC,
                                    uint32_t      workerId,
                                    uint32_t      macroID)
{
    const API_STATE& state    = GetApiState(pDC);
    HANDLE hWorkerPrivateData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    uint32_t x, y;
    MacroTileMgr::getTileIndices(macroID, x, y);
    x *= KNOB_MACROTILE_X_DIM;
    y *= KNOB_MACROTILE_Y_DIM;

    uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);

    // check RT if enabled
    unsigned long rtSlot                 = 0;
    uint32_t      colorHottileEnableMask = state.colorHottileEnable;
    while (_BitScanForward(&rtSlot, colorHottileEnableMask))
    {
        HOTTILE* pHotTile =
            GetHotTile(pContext,
                       pDC,
                       hWorkerPrivateData,
                       macroID,
                       (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
                       true,
                       numSamples);

        if (pHotTile->state == HOTTILE_INVALID)
        {
            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
            // invalid hottile before draw requires a load from surface before we can draw to it
            pContext->pfnLoadTile(GetPrivateState(pDC),
                                  hWorkerPrivateData,
                                  KNOB_COLOR_HOT_TILE_FORMAT,
                                  (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
                                  x,
                                  y,
                                  pHotTile->renderTargetArrayIndex,
                                  pHotTile->pBuffer);
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_END(BELoadTiles, 0);
        }
        else if (pHotTile->state == HOTTILE_CLEAR)
        {
            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
            // Clear the tile.
            ClearColorHotTile(pHotTile);
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_END(BELoadTiles, 0);
        }
        colorHottileEnableMask &= ~(1 << rtSlot);
    }

    // check depth if enabled
    if (state.depthHottileEnable)
    {
        HOTTILE* pHotTile = GetHotTile(
            pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
        if (pHotTile->state == HOTTILE_INVALID)
        {
            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
            // invalid hottile before draw requires a load from surface before we can draw to it
            pContext->pfnLoadTile(GetPrivateState(pDC),
                                  hWorkerPrivateData,
                                  KNOB_DEPTH_HOT_TILE_FORMAT,
                                  SWR_ATTACHMENT_DEPTH,
                                  x,
                                  y,
                                  pHotTile->renderTargetArrayIndex,
                                  pHotTile->pBuffer);
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_END(BELoadTiles, 0);
        }
        else if (pHotTile->state == HOTTILE_CLEAR)
        {
            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
            // Clear the tile.
            ClearDepthHotTile(pHotTile);
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_END(BELoadTiles, 0);
        }
    }

    // check stencil if enabled
    if (state.stencilHottileEnable)
    {
        HOTTILE* pHotTile = GetHotTile(
            pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
        if (pHotTile->state == HOTTILE_INVALID)
        {
            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
            // invalid hottile before draw requires a load from surface before we can draw to it
            pContext->pfnLoadTile(GetPrivateState(pDC),
                                  hWorkerPrivateData,
                                  KNOB_STENCIL_HOT_TILE_FORMAT,
                                  SWR_ATTACHMENT_STENCIL,
                                  x,
                                  y,
                                  pHotTile->renderTargetArrayIndex,
                                  pHotTile->pBuffer);
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_END(BELoadTiles, 0);
        }
        else if (pHotTile->state == HOTTILE_CLEAR)
        {
            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
            // Clear the tile.
            ClearStencilHotTile(pHotTile);
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_END(BELoadTiles, 0);
        }
    }
}
Ejemplo n.º 4
0
void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
    SWR_CONTEXT* pContext           = pDC->pContext;
    HANDLE       hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    if (KNOB_FAST_CLEAR)
    {
        CLEAR_DESC*           pClear      = (CLEAR_DESC*)pUserData;
        SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
        uint32_t              numSamples  = GetNumSamples(sampleCount);

        SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.

        RDTSC_BEGIN(BEClear, pDC->drawId);

        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
        {
            unsigned long rt   = 0;
            uint32_t      mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
            while (_BitScanForward(&rt, mask))
            {
                mask &= ~(1 << rt);

                HOTTILE* pHotTile =
                    pContext->pHotTileMgr->GetHotTile(pContext,
                                                      pDC,
                                                      hWorkerPrivateData,
                                                      macroTile,
                                                      (SWR_RENDERTARGET_ATTACHMENT)rt,
                                                      true,
                                                      numSamples,
                                                      pClear->renderTargetArrayIndex);

                // All we want to do here is to mark the hot tile as being in a "needs clear" state.
                pHotTile->clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
                pHotTile->clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
                pHotTile->clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
                pHotTile->clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);
                pHotTile->state        = HOTTILE_CLEAR;
            }
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
        {
            HOTTILE* pHotTile      = pContext->pHotTileMgr->GetHotTile(pContext,
                                                                  pDC,
                                                                  hWorkerPrivateData,
                                                                  macroTile,
                                                                  SWR_ATTACHMENT_DEPTH,
                                                                  true,
                                                                  numSamples,
                                                                  pClear->renderTargetArrayIndex);
            pHotTile->clearData[0] = *(uint32_t*)&pClear->clearDepth;
            pHotTile->state        = HOTTILE_CLEAR;
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
        {
            HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext,
                                                                  pDC,
                                                                  hWorkerPrivateData,
                                                                  macroTile,
                                                                  SWR_ATTACHMENT_STENCIL,
                                                                  true,
                                                                  numSamples,
                                                                  pClear->renderTargetArrayIndex);

            pHotTile->clearData[0] = pClear->clearStencil;
            pHotTile->state        = HOTTILE_CLEAR;
        }

        RDTSC_END(BEClear, 1);
    }
    else
    {
        // Legacy clear
        CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData;
        RDTSC_BEGIN(BEClear, pDC->drawId);

        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
        {
            uint32_t clearData[4];
            clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
            clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
            clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
            clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);

            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
            SWR_ASSERT(pfnClearTiles != nullptr);

            unsigned long rt   = 0;
            uint32_t      mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
            while (_BitScanForward(&rt, mask))
            {
                mask &= ~(1 << rt);

                pfnClearTiles(pDC,
                              hWorkerPrivateData,
                              (SWR_RENDERTARGET_ATTACHMENT)rt,
                              macroTile,
                              pClear->renderTargetArrayIndex,
                              clearData,
                              pClear->rect);
            }
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
        {
            uint32_t clearData[4];
            clearData[0]                  = *(uint32_t*)&pClear->clearDepth;
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC,
                          hWorkerPrivateData,
                          SWR_ATTACHMENT_DEPTH,
                          macroTile,
                          pClear->renderTargetArrayIndex,
                          clearData,
                          pClear->rect);
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
        {
            uint32_t clearData[4];
            clearData[0]                  = pClear->clearStencil;
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];

            pfnClearTiles(pDC,
                          hWorkerPrivateData,
                          SWR_ATTACHMENT_STENCIL,
                          macroTile,
                          pClear->renderTargetArrayIndex,
                          clearData,
                          pClear->rect);
        }

        RDTSC_END(BEClear, 1);
    }
}
Ejemplo n.º 5
0
void BackendSingleSample(DRAW_CONTEXT*        pDC,
                         uint32_t             workerId,
                         uint32_t             x,
                         uint32_t             y,
                         SWR_TRIANGLE_DESC&   work,
                         RenderOutputBuffers& renderBuffers)
{
    RDTSC_BEGIN(BESingleSampleBackend, pDC->drawId);
    RDTSC_BEGIN(BESetup, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE& state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    SWR_PS_CONTEXT             psContext;
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    SetupPixelShaderContext<T>(&psContext, samplePos, work);

    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(psContext.pColorBuffer,
                       &pDepthBuffer,
                       &pStencilBuffer,
                       state.colorHottileEnable,
                       renderBuffers);

    RDTSC_END(BESetup, 1);

    psContext.vY.UL     = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));

    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        psContext.vX.UL     = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
#if USE_8x2_TILE_BACKEND
            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
            simdmask coverageMask = work.coverageMask[0] & MASK;

            if (coverageMask)
            {
                if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                {
                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
                                  "Unsupported depth hot tile format");

                    const simdscalar z =
                        _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer));

                    const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                    const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                    coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                }

                if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
                {
                    const uint64_t* pCoverageMask =
                        (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
                            ? &work.innerCoverageMask
                            : &work.coverageMask[0];

                    generateInputCoverage<T, T::InputCoverage>(
                        pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
                }

                RDTSC_BEGIN(BEBarycentric, pDC->drawId);

                CalcPixelBarycentrics(coeffs, psContext);

                CalcCentroid<T, true>(
                    &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);

                // interpolate and quantize z
                psContext.vZ = vplaneps(
                    coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);

                RDTSC_END(BEBarycentric, 1);

                // interpolate user clip distance if available
                if (state.backendState.clipDistanceMask)
                {
                    coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
                                                         work.pUserClipBuffer,
                                                         psContext.vI.center,
                                                         psContext.vJ.center);
                }

                simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
                simdscalar depthPassMask   = vCoverageMask;
                simdscalar stencilPassMask = vCoverageMask;

                // Early-Z?
                if (T::bCanEarlyZ)
                {
                    RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId);
                    depthPassMask = DepthStencilTest(&state,
                                                     work.triFlags.frontFacing,
                                                     work.triFlags.viewportIndex,
                                                     psContext.vZ,
                                                     pDepthBuffer,
                                                     vCoverageMask,
                                                     pStencilBuffer,
                                                     &stencilPassMask);
                    AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
                                                               _simd_movemask_ps(stencilPassMask),
                                                               _simd_movemask_ps(vCoverageMask)));
                    RDTSC_END(BEEarlyDepthTest, 0);

                    // early-exit if no pixels passed depth or earlyZ is forced on
                    if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
                    {
                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                          &state.depthStencilState,
                                          work.triFlags.frontFacing,
                                          psContext.vZ,
                                          pDepthBuffer,
                                          depthPassMask,
                                          vCoverageMask,
                                          pStencilBuffer,
                                          stencilPassMask);

                        if (!_simd_movemask_ps(depthPassMask))
                        {
                            goto Endtile;
                        }
                    }
                }

                psContext.sampleIndex = 0;
                psContext.activeMask  = _simd_castps_si(vCoverageMask);

                // execute pixel shader
                RDTSC_BEGIN(BEPixelShader, pDC->drawId);
                state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
                RDTSC_END(BEPixelShader, 0);

                // update stats
                UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
                AR_EVENT(PSStats(psContext.stats.numInstExecuted));

                vCoverageMask = _simd_castsi_ps(psContext.activeMask);

                // late-Z
                if (!T::bCanEarlyZ)
                {
                    RDTSC_BEGIN(BELateDepthTest, pDC->drawId);
                    depthPassMask = DepthStencilTest(&state,
                                                     work.triFlags.frontFacing,
                                                     work.triFlags.viewportIndex,
                                                     psContext.vZ,
                                                     pDepthBuffer,
                                                     vCoverageMask,
                                                     pStencilBuffer,
                                                     &stencilPassMask);
                    AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
                                                              _simd_movemask_ps(stencilPassMask),
                                                              _simd_movemask_ps(vCoverageMask)));
                    RDTSC_END(BELateDepthTest, 0);

                    if (!_simd_movemask_ps(depthPassMask))
                    {
                        // need to call depth/stencil write for stencil write
                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                          &state.depthStencilState,
                                          work.triFlags.frontFacing,
                                          psContext.vZ,
                                          pDepthBuffer,
                                          depthPassMask,
                                          vCoverageMask,
                                          pStencilBuffer,
                                          stencilPassMask);
                        goto Endtile;
                    }
                }
                else
                {
                    // for early z, consolidate discards from shader
                    // into depthPassMask
                    depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
                }

                uint32_t statMask  = _simd_movemask_ps(depthPassMask);
                uint32_t statCount = _mm_popcnt_u32(statMask);
                UPDATE_STAT_BE(DepthPassCount, statCount);

                // output merger
                RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
#if USE_8x2_TILE_BACKEND
                OutputMerger8x2(pDC,
                                psContext,
                                psContext.pColorBuffer,
                                0,
                                &state.blendState,
                                state.pfnBlendFunc,
                                vCoverageMask,
                                depthPassMask,
                                state.psState.renderTargetMask,
                                useAlternateOffset,
                                workerId);
#else
                OutputMerger4x2(pDC,
                                psContext,
                                psContext.pColorBuffer,
                                0,
                                &state.blendState,
                                state.pfnBlendFunc,
                                vCoverageMask,
                                depthPassMask,
                                state.psState.renderTargetMask,
                                workerId,
                                workerId);
#endif

                // do final depth write after all pixel kills
                if (!state.psState.forceEarlyZ)
                {
                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                      &state.depthStencilState,
                                      work.triFlags.frontFacing,
                                      psContext.vZ,
                                      pDepthBuffer,
                                      depthPassMask,
                                      vCoverageMask,
                                      pStencilBuffer,
                                      stencilPassMask);
                }
                RDTSC_END(BEOutputMerger, 0);
            }

        Endtile:
            RDTSC_BEGIN(BEEndTile, pDC->drawId);

            work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
            {
                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }

#if USE_8x2_TILE_BACKEND
            if (useAlternateOffset)
            {
                DWORD    rt;
                uint32_t rtMask = state.colorHottileEnable;
                while (_BitScanForward(&rt, rtMask))
                {
                    rtMask &= ~(1 << rt);
                    psContext.pColorBuffer[rt] +=
                        (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                }
            }
#else
            DWORD rt;
            uint32_t rtMask = state.colorHottileEnable;
            while (_BitScanForward(&rt, rtMask))
            {
                rtMask &= ~(1 << rt);
                psContext.pColorBuffer[rt] +=
                    (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
            }
#endif
            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
            pStencilBuffer +=
                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;

            RDTSC_END(BEEndTile, 0);

            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL, dx);
            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
        }

        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL, dy);
        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
    }