//////////////////////////////////////////////////////////////////////////
/// @brief Store a hot tile back to its render target surface.
/// @param pDC - pointer to draw context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param macroTile - macrotile index.
/// @param pDesc - pointer to the store-tiles descriptor.
/// @param attachment - render target attachment to store.
void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc, SWR_RENDERTARGET_ATTACHMENT attachment)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(BEStoreTiles, pDC->drawId);

    SWR_FORMAT srcFormat;
    switch (attachment)
    {
    case SWR_ATTACHMENT_COLOR0:
    case SWR_ATTACHMENT_COLOR1:
    case SWR_ATTACHMENT_COLOR2:
    case SWR_ATTACHMENT_COLOR3:
    case SWR_ATTACHMENT_COLOR4:
    case SWR_ATTACHMENT_COLOR5:
    case SWR_ATTACHMENT_COLOR6:
    case SWR_ATTACHMENT_COLOR7:
        srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
        break;
    case SWR_ATTACHMENT_DEPTH:
        srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT;
        break;
    case SWR_ATTACHMENT_STENCIL:
        srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT;
        break;
    default:
        SWR_INVALID("Unknown attachment: %d", attachment);
        srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
        break;
    }

    uint32_t x, y;
    MacroTileMgr::getTileIndices(macroTile, x, y);

    // Only need to store the hottile if it's been rendered to...
    HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
    if (pHotTile)
    {
        // Clear if a clear is pending (i.e., not rendered to), then mark as dirty for store.
        if (pHotTile->state == HOTTILE_CLEAR)
        {
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
        }

        if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
        {
            int32_t destX = KNOB_MACROTILE_X_DIM * x;
            int32_t destY = KNOB_MACROTILE_Y_DIM * y;

            pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat, attachment,
                destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
        }

        if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
        {
            // A RESOLVED tile is not re-marked DIRTY by a store request.
            if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY &&
                  pHotTile->state == HOTTILE_RESOLVED))
            {
                pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
            }
        }
    }
    AR_END(BEStoreTiles, 1);
}
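//////////////////////////////////////////////////////////////////////////
/// Illustrative sketch (not part of the driver, guarded out with #if 0):
/// the store path above is a small state machine over HOTTILE_CLEAR,
/// HOTTILE_DIRTY, and HOTTILE_RESOLVED. The standalone program below mirrors
/// just the transition rules visible in ProcessStoreTileBE, using the
/// hypothetical names TileState and storeTile.
#if 0
#include <cstdio>

enum TileState { TS_INVALID, TS_CLEAR, TS_DIRTY, TS_RESOLVED };

// Mirrors ProcessStoreTileBE: emulates the clear-then-store-then-retag
// sequence and returns the resulting state. 'post' plays the role of
// pDesc->postStoreTileState.
TileState storeTile(TileState cur, TileState post)
{
    if (cur == TS_CLEAR) {
        std::printf("clear pending -> run pfnClearTiles\n");
    }
    if (cur == TS_DIRTY || post == TS_DIRTY) {
        std::printf("store tile -> run pfnStoreTile\n");
    }
    if (cur == TS_DIRTY || cur == TS_RESOLVED) {
        // A resolved tile is not dragged back to dirty by a store request.
        if (!(post == TS_DIRTY && cur == TS_RESOLVED)) {
            cur = post;
        }
    }
    return cur;
}

int main()
{
    TileState s = storeTile(TS_DIRTY, TS_RESOLVED); // dirty tile: store, then mark resolved
    s = storeTile(s, TS_DIRTY);                     // resolved tile: store again, stays resolved
    return s == TS_RESOLVED ? 0 : 1;
}
#endif // illustrative sketch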
//////////////////////////////////////////////////////////////////////////
/// @brief Clip a SIMD batch of point primitives.
void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
                uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEClipPoints, pDC->drawId);

    Clipper<1> clipper(workerId, pDC);
    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);

    AR_END(FEClipPoints, 1);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Process compute work.
/// @param pDC - pointer to draw context (dispatch).
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param threadGroupId - the linear index for the thread group within the dispatch.
/// @param pSpillFillBuffer - lazily allocated spill/fill memory for this worker.
/// @param pScratchSpace - lazily allocated scratch memory for this worker.
void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(BEDispatch, pDC->drawId);

    const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
    SWR_ASSERT(pTaskData != nullptr);

    // Ensure spill/fill memory has been allocated.
    size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
    if (spillFillSize && pSpillFillBuffer == nullptr)
    {
        pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
    }

    size_t scratchSpaceSize = pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances;
    if (scratchSpaceSize && pScratchSpace == nullptr)
    {
        pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD_BYTES);
    }

    const API_STATE& state = GetApiState(pDC);

    SWR_CS_CONTEXT csContext{ 0 };
    csContext.tileCounter = threadGroupId;
    csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
    csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
    csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
    csContext.pTGSM = pContext->ppScratch[workerId];
    csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
    csContext.pScratchSpace = (uint8_t*)pScratchSpace;
    csContext.scratchSpacePerSimd = pDC->pState->state.scratchSpaceSize;

    state.pfnCsFunc(GetPrivateState(pDC), &csContext);

    UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);

    AR_END(BEDispatch, 1);
}
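//////////////////////////////////////////////////////////////////////////
/// Illustrative sketch (not part of the driver, guarded out with #if 0):
/// the compute shader receives only the linear group index
/// (csContext.tileCounter) plus dispatchDims. Assuming a row-major
/// linearization with x varying fastest, which is an assumption and not
/// necessarily what SWR's shader codegen emits, recovering (x, y, z) group
/// coordinates looks like this; decodeGroupId is a hypothetical helper.
#if 0
#include <cassert>
#include <cstdint>

struct GroupId { uint32_t x, y, z; };

// Invert a row-major linear thread-group index into (x, y, z) coordinates
// given the dispatch dimensions, the way a generated CS prolog might.
GroupId decodeGroupId(uint32_t linear, const uint32_t dims[3])
{
    GroupId id;
    id.x = linear % dims[0];
    id.y = (linear / dims[0]) % dims[1];
    id.z = linear / (dims[0] * dims[1]);
    return id;
}

int main()
{
    const uint32_t dims[3] = { 4, 3, 2 };
    GroupId id = decodeGroupId(23, dims);     // last group of the dispatch
    assert(id.x == 3 && id.y == 2 && id.z == 1);
    return 0;
}
#endif // illustrative sketch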
//////////////////////////////////////////////////////////////////////////
/// @brief Worker thread main loop: alternates between FE and BE work.
template <bool IsFEThread, bool IsBEThread>
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    RDTSC_INIT(threadId);

    uint32_t numaNode = pThreadData->numaId;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
    // locked, we'll add it to this list so that we don't try to lock it again.
    TileSet lockedTiles;

    // Each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. The data associated with a draw is guaranteed to be active as
    // long as a worker hasn't signaled that it has moved on to the next draw when it
    // determines there is no more work to do. The API thread will not increment the head
    // of the DC ring until all workers have moved past the current head.
    //
    // The logic to determine what to work on is:
    // 1- Try to work on the FE of any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process it in parallel. Eventually
    //    we'll need dependency tracking to force serialization on FEs. The worker will try
    //    to pick an FE by atomically incrementing a counter in the SWR context. It will keep
    //    trying until it reaches the tail.
    // 2- BE work must be done in strict order. We accomplish this today by pulling work off
    //    the oldest draw (i.e. the head) of the dcRing. The worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of completed
    //    work items. If they are equal, then there is no more work to do for this draw, and
    //    the worker can safely increment its oldestDraw counter and move on to the next draw.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    uint32_t curDrawBE = 0;
    uint32_t curDrawFE = 0;

    bool bShutdown = false;

    while (true)
    {
        if (bShutdown && !threadHasWork(curDrawBE))
        {
            break;
        }

        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            AR_BEGIN(WorkerWaitForThreadEvent, 0);

            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();

            AR_END(WorkerWaitForThreadEvent, 0);
        }

        if (IsBEThread)
        {
            AR_BEGIN(WorkerWorkOnFifoBE, 0);
            bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            AR_END(WorkerWorkOnFifoBE, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            if (!IsBEThread)
            {
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}
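//////////////////////////////////////////////////////////////////////////
/// Illustrative sketch (not part of the driver, guarded out with #if 0):
/// the idle path above is the classic spin-then-sleep pattern. Spin briefly
/// with _mm_pause(), then re-check the predicate under the lock before
/// blocking on the condition variable, so a wakeup that lands between the
/// spin and the wait cannot be lost. The names below (g_head, kSpinCount,
/// waitForWork) are hypothetical.
#if 0
#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <immintrin.h>
#include <mutex>

std::atomic<uint32_t> g_head{ 0 };
std::mutex g_waitLock;
std::condition_variable g_fifosNotEmpty;

void waitForWork(uint32_t curDraw)
{
    constexpr uint32_t kSpinCount = 5000; // analog of KNOB_WORKER_SPIN_LOOP_COUNT
    auto hasWork = [&] { return curDraw != g_head.load(std::memory_order_acquire); };

    // Phase 1: bounded spin; cheap if work arrives quickly.
    for (uint32_t i = 0; i < kSpinCount && !hasWork(); ++i)
    {
        _mm_pause();
    }

    // Phase 2: block. The predicate is re-evaluated under the mutex, which
    // closes the race with a producer that enqueues and notifies in between.
    if (!hasWork())
    {
        std::unique_lock<std::mutex> lock(g_waitLock);
        g_fifosNotEmpty.wait(lock, hasWork);
    }
}
#endif // illustrative sketch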
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any BE work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
///                      own set, and each time it fails to lock a macrotile because it's already locked,
///                      it adds that tile to the lockedTiles set. As a worker begins to work on future
///                      draws, lockedTiles ensures that it doesn't work on tiles that may still have
///                      work pending in a previous draw. Additionally, lockedTiles is a heuristic
///                      that can steer a worker back to the same macrotile that it had been
///                      working on in a previous draw.
/// @returns true if worker thread should shutdown
bool WorkOnFifoBE(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint32_t &curDrawBE,
    TileSet& lockedTiles,
    uint32_t numaNode,
    uint32_t numaMask)
{
    bool bShutdown = false;

    // Find the first incomplete draw that has pending work. If no such draw is found then
    // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
    {
        return false;
    }

    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;

    // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
    lockedTiles.clear();

    // Try to work on each draw in order of the available draws in flight.
    //   1. If we're on curDrawBE, we can work on any macrotile that is available.
    //   2. If we're trying to work on draws after curDrawBE, we are restricted to
    //      working on those macrotiles that are known to be complete in the prior draw to
    //      maintain order. The locked tiles provide the history to ensure this.
    for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];

        if (pDC->isCompute) return false; // We don't look at compute work.

        // First wait for FE to be finished with this draw. This keeps the threading model
        // simple, but if there are lots of bubbles between draws then serializing FE and BE
        // may need to be revisited.
        if (!pDC->doneFE) return false;

        // If this draw is dependent on a previous draw then we need to bail.
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return false;
        }

        // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
        auto& macroTiles = pDC->pTileMgr->getDirtyTiles();

        for (auto tile : macroTiles)
        {
            uint32_t tileID = tile->mId;

            // Only work on tiles for this numa node
            uint32_t x, y;
            pDC->pTileMgr->getTileIndices(tileID, x, y);
            if (((x ^ y) & numaMask) != numaNode)
            {
                continue;
            }

            if (!tile->getNumQueued())
            {
                continue;
            }

            // can only work on this draw if it's not in use by other threads
            if (lockedTiles.find(tileID) != lockedTiles.end())
            {
                continue;
            }

            if (tile->tryLock())
            {
                BE_WORK *pWork;

                AR_BEGIN(WorkerFoundWork, pDC->drawId);

                uint32_t numWorkItems = tile->getNumQueued();
                SWR_ASSERT(numWorkItems);

                pWork = tile->peek();
                SWR_ASSERT(pWork);
                if (pWork->type == DRAW)
                {
                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
                }
                else if (pWork->type == SHUTDOWN)
                {
                    bShutdown = true;
                }

                while ((pWork = tile->peek()) != nullptr)
                {
                    pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);

                    // Dequeue after execution so the tile's queue count only
                    // drops once the work item has actually completed.
                    tile->dequeue();
                }
                AR_END(WorkerFoundWork, numWorkItems);

                _ReadWriteBarrier();

                pDC->pTileMgr->markTileComplete(tileID);

                // Optimization: If the draw is complete and we're the last one to have worked on it
                // then we can reset the locked list, as we know that all draws before the next are
                // guaranteed to be complete.
                if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
                {
                    // We can increment the current BE and safely move to next draw since we know this draw is complete.
                    curDrawBE++;
                    CompleteDrawContextInl(pContext, pDC);

                    lastRetiredDraw++;

                    lockedTiles.clear();
                    break;
                }

                if (bShutdown)
                {
                    break;
                }
            }
            else
            {
                // This tile is already locked. Add it to our locked tiles set so we don't try locking it again.
                lockedTiles.insert(tileID);
            }
        }
    }

    return bShutdown;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Null pixel shader backend: depth/stencil processing only, no color writes.
void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(BENullBackend, pDC->drawId);
    ///@todo: handle center multisample pattern

    AR_BEGIN(BESetup, pDC->drawId);

    const API_STATE &state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(NULL, &pDepthBuffer, &pStencilBuffer, 0, renderBuffers);

    SWR_PS_CONTEXT psContext;
    // SetupPixelShaderContext(&psContext, ...) is skipped; no pixel shader runs here.

    AR_END(BESetup, 0);

    simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));

    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
            // iterate over active samples
            unsigned long sample = 0;
            uint32_t sampleMask = state.blendState.sampleMask;
            while (_BitScanForward(&sample, sampleMask))
            {
                sampleMask &= ~(1 << sample);

                simdmask coverageMask = work.coverageMask[sample] & MASK;

                if (coverageMask)
                {
                    // offset depth/stencil buffers to the current sample
                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);

                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                    {
                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");

                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));

                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                    }

                    AR_BEGIN(BEBarycentric, pDC->drawId);

                    // calculate per sample positions
                    psContext.vX.sample = _simd_add_ps(vXSamplePosUL, samplePos.vX(sample));
                    psContext.vY.sample = _simd_add_ps(vYSamplePosUL, samplePos.vY(sample));

                    CalcSampleBarycentrics(coeffs, psContext);

                    // interpolate and quantize z
                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);

                    AR_END(BEBarycentric, 0);

                    // interpolate user clip distance if available
                    if (state.backendState.clipDistanceMask)
                    {
                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
                    }

                    simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
                    simdscalar stencilPassMask = vCoverageMask;

                    AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
                    simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                        psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                    AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing,
                        psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
                    AR_END(BEEarlyDepthTest, 0);

                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
                    uint32_t statCount = _mm_popcnt_u32(statMask);
                    UPDATE_STAT_BE(DepthPassCount, statCount);
                }

            Endtile:
                ATTR_UNUSED;
                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }

            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;

            vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
        }

        vYSamplePosUL = _simd_add_ps(vYSamplePosUL, dy);
    }

    AR_END(BENullBackend, 0);
}
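//////////////////////////////////////////////////////////////////////////
/// Illustrative sketch (not part of the driver, guarded out with #if 0):
/// the sample loop above walks the set bits of blendState.sampleMask with
/// the _BitScanForward intrinsic. A portable equivalent using GCC/Clang's
/// __builtin_ctz, with a hypothetical visitSamples helper, looks like this:
#if 0
#include <cstdint>
#include <cstdio>

// Invoke fn(sampleIndex) for each bit set in mask, lowest bit first,
// which is the same order _BitScanForward produces.
template <typename Fn>
void visitSamples(uint32_t mask, Fn fn)
{
    while (mask)
    {
        unsigned long sample = __builtin_ctz(mask); // index of lowest set bit
        mask &= mask - 1;                           // clear that bit
        fn(sample);
    }
}

int main()
{
    // sampleMask 0b1011 -> samples 0, 1, 3
    visitSamples(0xBu, [](unsigned long s) { std::printf("sample %lu\n", s); });
    return 0;
}
#endif // illustrative sketch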
//////////////////////////////////////////////////////////////////////////
/// @brief Per-sample-rate pixel shader backend.
template <typename T>
void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(BESampleRateBackend, pDC->drawId);
    AR_BEGIN(BESetup, pDC->drawId);

    const API_STATE &state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    SWR_PS_CONTEXT psContext;
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    SetupPixelShaderContext<T>(&psContext, samplePos, work);

    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);

    AR_END(BESetup, 0);

    psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));

    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
#if USE_8x2_TILE_BACKEND
            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
            {
                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];

                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
            }

            AR_BEGIN(BEBarycentric, pDC->drawId);

            CalcPixelBarycentrics(coeffs, psContext);

            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);

            AR_END(BEBarycentric, 0);

            for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
            {
                simdmask coverageMask = work.coverageMask[sample] & MASK;

                if (coverageMask)
                {
                    // offset depth/stencil buffers to the current sample
                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);

                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                    {
                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");

                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));

                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                    }

                    AR_BEGIN(BEBarycentric, pDC->drawId);

                    // calculate per sample positions
                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));

                    CalcSampleBarycentrics(coeffs, psContext);

                    // interpolate and quantize z
                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);

                    AR_END(BEBarycentric, 0);

                    // interpolate user clip distance if available
                    if (state.backendState.clipDistanceMask)
                    {
                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
                    }

                    simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
                    simdscalar depthPassMask = vCoverageMask;
                    simdscalar stencilPassMask = vCoverageMask;

                    // Early-Z?
                    if (T::bCanEarlyZ)
                    {
                        AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                            psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                        AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
                        AR_END(BEEarlyDepthTest, 0);

                        // Write depth now if earlyZ is forced on or if nothing survived
                        // the test; in the latter case skip to the next sample.
                        if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
                        {
                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing,
                                psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);

                            if (!_simd_movemask_ps(depthPassMask))
                            {
                                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
                                continue;
                            }
                        }
                    }

                    psContext.sampleIndex = sample;
                    psContext.activeMask = _simd_castps_si(vCoverageMask);

                    // execute pixel shader
                    AR_BEGIN(BEPixelShader, pDC->drawId);
                    UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
                    state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
                    AR_END(BEPixelShader, 0);

                    vCoverageMask = _simd_castsi_ps(psContext.activeMask);

                    // late-Z
                    if (!T::bCanEarlyZ)
                    {
                        AR_BEGIN(BELateDepthTest, pDC->drawId);
                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                            psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                        AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
                        AR_END(BELateDepthTest, 0);

                        if (!_simd_movemask_ps(depthPassMask))
                        {
                            // need to call depth/stencil write for stencil write
                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing,
                                psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);

                            work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
                            continue;
                        }
                    }

                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
                    uint32_t statCount = _mm_popcnt_u32(statMask);
                    UPDATE_STAT_BE(DepthPassCount, statCount);

                    // output merger
                    AR_BEGIN(BEOutputMerger, pDC->drawId);
#if USE_8x2_TILE_BACKEND
                    OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
#else
                    OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask);
#endif

                    // do final depth write after all pixel kills
                    if (!state.psState.forceEarlyZ)
                    {
                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing,
                            psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
                    }
                    AR_END(BEOutputMerger, 0);
                }