Example #1
0
//////////////////////////////////////////////////////////////////////////
/// @brief Finalizes front-end (FE) processing for a draw context.
///        Forwards the FE statistics and any dirty stream-output write
///        offsets to the client callbacks (when they are registered), then
///        flags the DC as FE-complete and decrements the context's count of
///        draws with outstanding FE work.
/// @param pContext - SWR context that owns the callbacks and the FE counter.
/// @param workerId - ID of the calling worker. Not referenced directly in this
///                   body; NOTE(review): presumably consumed by AR_EVENT - confirm.
/// @param pDC - draw context whose FE work has finished.
INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    const bool reportStatsFE =
        (pContext->pfnUpdateStatsFE != nullptr) && GetApiState(pDC).enableStatsFE;

    if (reportStatsFE)
    {
        SWR_STATS_FE& feStats = pDC->dynState.statsFE;

        // Emit the archrast events first, then hand the same counters to the client.
        AR_EVENT(FrontendStatsEvent(pDC->drawId,
                                    feStats.IaVertices,
                                    feStats.IaPrimitives,
                                    feStats.VsInvocations,
                                    feStats.HsInvocations,
                                    feStats.DsInvocations,
                                    feStats.GsInvocations,
                                    feStats.GsPrimitives,
                                    feStats.CInvocations,
                                    feStats.CPrimitives,
                                    feStats.SoPrimStorageNeeded[0],
                                    feStats.SoPrimStorageNeeded[1],
                                    feStats.SoPrimStorageNeeded[2],
                                    feStats.SoPrimStorageNeeded[3],
                                    feStats.SoNumPrimsWritten[0],
                                    feStats.SoNumPrimsWritten[1],
                                    feStats.SoNumPrimsWritten[2],
                                    feStats.SoNumPrimsWritten[3]));
        AR_EVENT(FrontendDrawEndEvent(pDC->drawId));

        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &feStats);
    }

    if (pContext->pfnUpdateSoWriteOffset != nullptr)
    {
        for (uint32_t soSlot = 0; soSlot < MAX_SO_BUFFERS; ++soSlot)
        {
            // Only buffers that are both dirty and write-enabled are reported.
            if (!pDC->dynState.SoWriteOffsetDirty[soSlot])
            {
                continue;
            }
            if (!pDC->pState->state.soBuffer[soSlot].soWriteEnable)
            {
                continue;
            }

            pContext->pfnUpdateSoWriteOffset(
                GetPrivateState(pDC), soSlot, pDC->dynState.SoWriteOffset[soSlot]);
        }
    }

    // Fence first so that streaming writes are globally visible before the
    // FE-done flag is published.
    _mm_mfence();
    pDC->doneFE = true;

    InterlockedDecrement(&pContext->drawsOutstandingFE);
}
Example #2
0
//////////////////////////////////////////////////////////////////////////
/// @brief Wraps up front-end (FE) work for a draw context: forwards the FE
///        stats and any dirty stream-output write offsets to the registered
///        client callbacks, marks the DC as FE-done, and decrements the
///        context's count of draws with outstanding FE work.
/// @param pContext - SWR context holding the callbacks and the FE counter.
/// @param pDC - draw context whose FE work has finished.
INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
{
    _ReadWriteBarrier();

    const bool wantStatsFE =
        (pContext->pfnUpdateStatsFE != nullptr) && GetApiState(pDC).enableStats;
    if (wantStatsFE)
    {
        pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &pDC->dynState.statsFE);
    }

    if (pContext->pfnUpdateSoWriteOffset != nullptr)
    {
        for (uint32_t soSlot = 0; soSlot < MAX_SO_BUFFERS; ++soSlot)
        {
            // Report a slot only when its offset is dirty and SO writes are enabled for it.
            const bool needsPublish = (pDC->dynState.SoWriteOffsetDirty[soSlot]) &&
                                      (pDC->pState->state.soBuffer[soSlot].soWriteEnable);
            if (needsPublish)
            {
                pContext->pfnUpdateSoWriteOffset(
                    GetPrivateState(pDC), soSlot, pDC->dynState.SoWriteOffset[soSlot]);
            }
        }
    }

    // NOTE(review): the only barrier in this function is the compiler barrier at
    // the top; nothing fences the stores above before doneFE is published.
    // Confirm that this ordering is sufficient for the consumers of doneFE.
    pDC->doneFE = true;

    InterlockedDecrement((volatile LONG*)&pContext->drawsOutstandingFE);
}
Example #3
0
//////////////////////////////////////////////////////////////////////////
/// @brief Back-end store of one macrotile's hot tile for a single attachment.
///        Looks up the hot tile without creating or loading it; if one exists,
///        performs any pending clear, stores it through the client's store-tile
///        callback when it is dirty (or when the descriptor's post-store state
///        is dirty), and finally applies the post-store tile state.
/// @param pDC - pointer to draw context.
/// @param workerId - ID of the calling worker. Not referenced directly in this
///                   body; NOTE(review): presumably consumed by AR_BEGIN/AR_END - confirm.
/// @param macroTile - macrotile ID, decoded into x/y tile indices.
/// @param pDesc - store-tiles descriptor (clear rect and post-store tile state).
/// @param attachment - render target attachment to store.
void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc, 
    SWR_RENDERTARGET_ATTACHMENT attachment)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(BEStoreTiles, pDC->drawId);

    // Hot tile format for this attachment; color is also the fallback for an
    // unrecognized attachment (after reporting it).
    SWR_FORMAT srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
    switch (attachment)
    {
    case SWR_ATTACHMENT_COLOR0:
    case SWR_ATTACHMENT_COLOR1:
    case SWR_ATTACHMENT_COLOR2:
    case SWR_ATTACHMENT_COLOR3:
    case SWR_ATTACHMENT_COLOR4:
    case SWR_ATTACHMENT_COLOR5:
    case SWR_ATTACHMENT_COLOR6:
    case SWR_ATTACHMENT_COLOR7:
        break;
    case SWR_ATTACHMENT_DEPTH:
        srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT;
        break;
    case SWR_ATTACHMENT_STENCIL:
        srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT;
        break;
    default:
        SWR_INVALID("Unknown attachment: %d", attachment);
        break;
    }

    uint32_t tileX, tileY;
    MacroTileMgr::getTileIndices(macroTile, tileX, tileY);

    // A missing hot tile means there is nothing to store for this macrotile.
    HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
    if (pHotTile != nullptr)
    {
        // Resolve a pending clear first.
        if (HOTTILE_CLEAR == pHotTile->state)
        {
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
        }

        const bool postStoreIsDirty = (pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY);

        if (pHotTile->state == HOTTILE_DIRTY || postStoreIsDirty)
        {
            const int32_t destX = KNOB_MACROTILE_X_DIM * tileX;
            const int32_t destY = KNOB_MACROTILE_Y_DIM * tileY;

            pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
                attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
        }

        // Apply the post-store state: always for a dirty tile; for a resolved
        // tile only when the requested post-store state is not dirty.
        if (pHotTile->state == HOTTILE_DIRTY)
        {
            pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
        }
        else if (pHotTile->state == HOTTILE_RESOLVED && !postStoreIsDirty)
        {
            pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
        }
    }
    AR_END(BEStoreTiles, 1);
}
Example #4
0
//////////////////////////////////////////////////////////////////////////
/// @brief Reports accumulated stats to the client.
///        Sums the per-worker DepthPassCount, PsInvocations and CsInvocations
///        counters of this DC and passes the totals to the client's
///        update-stats callback. Does nothing when no callback is registered
///        or stats are disabled in the API state.
/// @param pContext - SWR context providing the callback and the worker count.
/// @param pDC - draw context whose per-worker stats are summed.
INLINE void UpdateClientStats(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
{
    const bool reportingEnabled =
        (pContext->pfnUpdateStats != nullptr) && !(GetApiState(pDC).enableStats == false);
    if (!reportingEnabled)
    {
        return;
    }

    SWR_STATS totals{ 0 };

    // Fold every worker's counters into a single total for the client.
    const uint32_t numWorkers = pContext->NumWorkerThreads;
    for (uint32_t worker = 0; worker < numWorkers; ++worker)
    {
        const auto& workerStats = pDC->dynState.pStats[worker];

        totals.DepthPassCount += workerStats.DepthPassCount;
        totals.PsInvocations += workerStats.PsInvocations;
        totals.CsInvocations += workerStats.CsInvocations;
    }

    pContext->pfnUpdateStats(GetPrivateState(pDC), &totals);
}
Example #5
0
//////////////////////////////////////////////////////////////////////////
/// @brief Runs the compute shader for one thread group of a dispatch.
/// @param pDC - pointer to draw context (dispatch).
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param threadGroupId - the linear index for the thread group within the dispatch.
/// @param pSpillFillBuffer - in/out spill-fill buffer; allocated from the DC's
///        arena here when it is still null and the state requests a non-zero size.
/// @param pScratchSpace - in/out scratch buffer; allocated from the DC's arena
///        here when it is still null and the state requests a non-zero size.
void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(BEDispatch, pDC->drawId);

    const COMPUTE_DESC* pDispatchDesc = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
    SWR_ASSERT(pDispatchDesc != nullptr);

    const auto& pipeState = pDC->pState->state;

    // Allocate the spill-fill buffer on first use.
    const size_t spillFillBytes = pipeState.totalSpillFillSize;
    if ((spillFillBytes != 0) && (pSpillFillBuffer == nullptr))
    {
        pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillBytes, KNOB_SIMD_BYTES);
    }

    // Allocate the scratch space on first use (per-instance size times instance count).
    const size_t scratchBytes = pipeState.scratchSpaceSize * pipeState.scratchSpaceNumInstances;
    if ((scratchBytes != 0) && (pScratchSpace == nullptr))
    {
        pScratchSpace = pDC->pArena->AllocAlignedSync(scratchBytes, KNOB_SIMD_BYTES);
    }

    const API_STATE& state = GetApiState(pDC);

    // Describe this thread group to the compute shader.
    SWR_CS_CONTEXT csContext{ 0 };
    csContext.tileCounter         = threadGroupId;
    csContext.pTGSM               = pContext->ppScratch[workerId];
    csContext.pSpillFillBuffer    = (uint8_t*)pSpillFillBuffer;
    csContext.pScratchSpace       = (uint8_t*)pScratchSpace;
    csContext.scratchSpacePerSimd = pipeState.scratchSpaceSize;
    csContext.dispatchDims[0]     = pDispatchDesc->threadGroupCountX;
    csContext.dispatchDims[1]     = pDispatchDesc->threadGroupCountY;
    csContext.dispatchDims[2]     = pDispatchDesc->threadGroupCountZ;

    state.pfnCsFunc(GetPrivateState(pDC), &csContext);

    UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);

    AR_END(BEDispatch, 1);
}
Example #6
0
//////////////////////////////////////////////////////////////////////////
/// @brief Returns the hot tile for a macrotile/attachment pair.
///        - No backing buffer yet: allocates one when @p create is set
///          (state becomes HOTTILE_INVALID), otherwise returns NULL.
///        - Existing buffer too small for @p numSamples: frees it and
///          allocates a larger one (state becomes HOTTILE_INVALID).
///        - Existing tile holds a different render target array index: clears
///          it if a clear is pending, stores it out if dirty, then loads the
///          requested slice and marks the tile HOTTILE_DIRTY.
/// @param pContext - SWR context (NUMA info and load/store tile callbacks).
/// @param pDC - draw context; supplies the private state for the callbacks.
/// @param hWorkerPrivateData - worker private data forwarded to the callbacks.
/// @param macroID - macrotile ID, decoded into x/y hot tile indices.
/// @param attachment - render target attachment to fetch.
/// @param create - allocate the tile if it has no buffer yet.
/// @param numSamples - number of samples the tile must be able to hold.
/// @param renderTargetArrayIndex - array slice requested by the caller.
/// @return pointer to the hot tile, or NULL when it does not exist and
///         @p create is false.
HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT*                pContext,
                                DRAW_CONTEXT*               pDC,
                                HANDLE                      hWorkerPrivateData,
                                uint32_t                    macroID,
                                SWR_RENDERTARGET_ATTACHMENT attachment,
                                bool                        create,
                                uint32_t                    numSamples,
                                uint32_t                    renderTargetArrayIndex)
{
    uint32_t x, y;
    MacroTileMgr::getTileIndices(macroID, x, y);

    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);

    HOTTILE& hotTile = mHotTiles[x][y].Attachment[attachment];

    // Allocates storage large enough for numSamples samples of this attachment
    // on the NUMA node derived from the tile coordinates.
    const auto allocateStorage = [&]() -> uint8_t* {
        const uint32_t size     = numSamples * mHotTileSize[attachment];
        const uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
        return (uint8_t*)AllocHotTileMem(
            size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
    };

    // First touch of this tile: create it on request, otherwise report "no tile".
    if (hotTile.pBuffer == NULL)
    {
        if (!create)
        {
            return NULL;
        }

        hotTile.pBuffer                = allocateStorage();
        hotTile.state                  = HOTTILE_INVALID;
        hotTile.numSamples             = numSamples;
        hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
        return &hotTile;
    }

    // Existing tile is too small for the requested sample count: replace it.
    if (numSamples > hotTile.numSamples)
    {
        // tile should be either uninitialized or resolved if we're deleting and switching to a
        // new sample count
        SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || (hotTile.state == HOTTILE_RESOLVED) ||
                   (hotTile.state == HOTTILE_CLEAR));
        FreeHotTileMem(hotTile.pBuffer);

        hotTile.pBuffer    = allocateStorage();
        hotTile.state      = HOTTILE_INVALID;
        hotTile.numSamples = numSamples;
    }

    // A different array slice is resident: write the current one back (if it
    // has content) and bring in the requested slice.
    if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
    {
        // Color is also the fallback format for an unrecognized attachment.
        SWR_FORMAT format = KNOB_COLOR_HOT_TILE_FORMAT;
        switch (attachment)
        {
        case SWR_ATTACHMENT_COLOR0:
        case SWR_ATTACHMENT_COLOR1:
        case SWR_ATTACHMENT_COLOR2:
        case SWR_ATTACHMENT_COLOR3:
        case SWR_ATTACHMENT_COLOR4:
        case SWR_ATTACHMENT_COLOR5:
        case SWR_ATTACHMENT_COLOR6:
        case SWR_ATTACHMENT_COLOR7:
            break;
        case SWR_ATTACHMENT_DEPTH:
            format = KNOB_DEPTH_HOT_TILE_FORMAT;
            break;
        case SWR_ATTACHMENT_STENCIL:
            format = KNOB_STENCIL_HOT_TILE_FORMAT;
            break;
        default:
            SWR_INVALID("Unknown attachment: %d", attachment);
            break;
        }

        const auto surfaceX = x * KNOB_MACROTILE_X_DIM;
        const auto surfaceY = y * KNOB_MACROTILE_Y_DIM;

        // A pending clear has to be materialized before the tile can be stored.
        if (hotTile.state == HOTTILE_CLEAR)
        {
            switch (attachment)
            {
            case SWR_ATTACHMENT_STENCIL:
                ClearStencilHotTile(&hotTile);
                break;
            case SWR_ATTACHMENT_DEPTH:
                ClearDepthHotTile(&hotTile);
                break;
            default:
                ClearColorHotTile(&hotTile);
                break;
            }

            hotTile.state = HOTTILE_DIRTY;
        }

        if (hotTile.state == HOTTILE_DIRTY)
        {
            pContext->pfnStoreTile(GetPrivateState(pDC),
                                   hWorkerPrivateData,
                                   format,
                                   attachment,
                                   surfaceX,
                                   surfaceY,
                                   hotTile.renderTargetArrayIndex,
                                   hotTile.pBuffer);
        }

        pContext->pfnLoadTile(GetPrivateState(pDC),
                              hWorkerPrivateData,
                              format,
                              attachment,
                              surfaceX,
                              surfaceY,
                              renderTargetArrayIndex,
                              hotTile.pBuffer);

        hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
        hotTile.state                  = HOTTILE_DIRTY;
    }

    return &hotTile;
}
Example #7
0
//////////////////////////////////////////////////////////////////////////
/// @brief InitializeHotTiles
/// Prepares every enabled hot tile (color render targets, depth, stencil) of
/// a macrotile before drawing: a tile in the invalid state is loaded from the
/// surface, a tile with a pending clear is cleared, and in both cases the tile
/// ends up dirty. This is done in the outer thread loop rather than inside the
/// draw routine itself, mainly for performance, to avoid repeating the setup
/// for every triangle.
/// @todo support deferred clear
/// @param pContext - SWR context providing the load-tile callback.
/// @param pDC - draw context; supplies the API state and draw ID.
/// @param workerId - worker whose private data is forwarded to the callbacks.
/// @param macroID - macrotile ID to initialize.
void HotTileMgr::InitializeHotTiles(SWR_CONTEXT*  pContext,
                                    DRAW_CONTEXT* pDC,
                                    uint32_t      workerId,
                                    uint32_t      macroID)
{
    const API_STATE& state    = GetApiState(pDC);
    HANDLE hWorkerPrivateData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    // Macrotile indices scaled to the macrotile's pixel origin.
    uint32_t originX, originY;
    MacroTileMgr::getTileIndices(macroID, originX, originY);
    originX *= KNOB_MACROTILE_X_DIM;
    originY *= KNOB_MACROTILE_Y_DIM;

    const uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);

    // Fetches (creating if needed) the hot tile for one attachment and makes it
    // drawable: load when invalid, clear when a clear is pending.
    const auto prepareHotTile = [&](SWR_RENDERTARGET_ATTACHMENT attachment, SWR_FORMAT format) {
        HOTTILE* pHotTile = GetHotTile(
            pContext, pDC, hWorkerPrivateData, macroID, attachment, true, numSamples);

        switch (pHotTile->state)
        {
        case HOTTILE_INVALID:
        {
            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
            // invalid hottile before draw requires a load from surface before we can draw to it
            pContext->pfnLoadTile(GetPrivateState(pDC),
                                  hWorkerPrivateData,
                                  format,
                                  attachment,
                                  originX,
                                  originY,
                                  pHotTile->renderTargetArrayIndex,
                                  pHotTile->pBuffer);
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_END(BELoadTiles, 0);
            break;
        }
        case HOTTILE_CLEAR:
        {
            RDTSC_BEGIN(BELoadTiles, pDC->drawId);
            // Clear the tile with the routine matching the attachment type.
            if (attachment == SWR_ATTACHMENT_STENCIL)
            {
                ClearStencilHotTile(pHotTile);
            }
            else if (attachment == SWR_ATTACHMENT_DEPTH)
            {
                ClearDepthHotTile(pHotTile);
            }
            else
            {
                ClearColorHotTile(pHotTile);
            }
            pHotTile->state = HOTTILE_DIRTY;
            RDTSC_END(BELoadTiles, 0);
            break;
        }
        default:
            break;
        }
    };

    // Color render targets: walk the set bits of the enable mask.
    unsigned long rtSlot        = 0;
    uint32_t      pendingRtMask = state.colorHottileEnable;
    while (_BitScanForward(&rtSlot, pendingRtMask))
    {
        prepareHotTile((SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
                       KNOB_COLOR_HOT_TILE_FORMAT);
        pendingRtMask &= ~(1 << rtSlot);
    }

    // Depth, if enabled.
    if (state.depthHottileEnable)
    {
        prepareHotTile(SWR_ATTACHMENT_DEPTH, KNOB_DEPTH_HOT_TILE_FORMAT);
    }

    // Stencil, if enabled.
    if (state.stencilHottileEnable)
    {
        prepareHotTile(SWR_ATTACHMENT_STENCIL, KNOB_STENCIL_HOT_TILE_FORMAT);
    }
}
Example #8
0
void BackendSingleSample(DRAW_CONTEXT*        pDC,
                         uint32_t             workerId,
                         uint32_t             x,
                         uint32_t             y,
                         SWR_TRIANGLE_DESC&   work,
                         RenderOutputBuffers& renderBuffers)
{
    RDTSC_BEGIN(BESingleSampleBackend, pDC->drawId);
    RDTSC_BEGIN(BESetup, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE& state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    SWR_PS_CONTEXT             psContext;
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    SetupPixelShaderContext<T>(&psContext, samplePos, work);

    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(psContext.pColorBuffer,
                       &pDepthBuffer,
                       &pStencilBuffer,
                       state.colorHottileEnable,
                       renderBuffers);

    RDTSC_END(BESetup, 1);

    psContext.vY.UL     = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));

    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        psContext.vX.UL     = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
#if USE_8x2_TILE_BACKEND
            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
            simdmask coverageMask = work.coverageMask[0] & MASK;

            if (coverageMask)
            {
                if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                {
                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
                                  "Unsupported depth hot tile format");

                    const simdscalar z =
                        _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer));

                    const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                    const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                    coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                }

                if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
                {
                    const uint64_t* pCoverageMask =
                        (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
                            ? &work.innerCoverageMask
                            : &work.coverageMask[0];

                    generateInputCoverage<T, T::InputCoverage>(
                        pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
                }

                RDTSC_BEGIN(BEBarycentric, pDC->drawId);

                CalcPixelBarycentrics(coeffs, psContext);

                CalcCentroid<T, true>(
                    &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);

                // interpolate and quantize z
                psContext.vZ = vplaneps(
                    coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);

                RDTSC_END(BEBarycentric, 1);

                // interpolate user clip distance if available
                if (state.backendState.clipDistanceMask)
                {
                    coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
                                                         work.pUserClipBuffer,
                                                         psContext.vI.center,
                                                         psContext.vJ.center);
                }

                simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
                simdscalar depthPassMask   = vCoverageMask;
                simdscalar stencilPassMask = vCoverageMask;

                // Early-Z?
                if (T::bCanEarlyZ)
                {
                    RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId);
                    depthPassMask = DepthStencilTest(&state,
                                                     work.triFlags.frontFacing,
                                                     work.triFlags.viewportIndex,
                                                     psContext.vZ,
                                                     pDepthBuffer,
                                                     vCoverageMask,
                                                     pStencilBuffer,
                                                     &stencilPassMask);
                    AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
                                                               _simd_movemask_ps(stencilPassMask),
                                                               _simd_movemask_ps(vCoverageMask)));
                    RDTSC_END(BEEarlyDepthTest, 0);

                    // early-exit if no pixels passed depth or earlyZ is forced on
                    if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
                    {
                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                          &state.depthStencilState,
                                          work.triFlags.frontFacing,
                                          psContext.vZ,
                                          pDepthBuffer,
                                          depthPassMask,
                                          vCoverageMask,
                                          pStencilBuffer,
                                          stencilPassMask);

                        if (!_simd_movemask_ps(depthPassMask))
                        {
                            goto Endtile;
                        }
                    }
                }

                psContext.sampleIndex = 0;
                psContext.activeMask  = _simd_castps_si(vCoverageMask);

                // execute pixel shader
                RDTSC_BEGIN(BEPixelShader, pDC->drawId);
                state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext);
                RDTSC_END(BEPixelShader, 0);

                // update stats
                UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
                AR_EVENT(PSStats(psContext.stats.numInstExecuted));

                vCoverageMask = _simd_castsi_ps(psContext.activeMask);

                // late-Z
                if (!T::bCanEarlyZ)
                {
                    RDTSC_BEGIN(BELateDepthTest, pDC->drawId);
                    depthPassMask = DepthStencilTest(&state,
                                                     work.triFlags.frontFacing,
                                                     work.triFlags.viewportIndex,
                                                     psContext.vZ,
                                                     pDepthBuffer,
                                                     vCoverageMask,
                                                     pStencilBuffer,
                                                     &stencilPassMask);
                    AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
                                                              _simd_movemask_ps(stencilPassMask),
                                                              _simd_movemask_ps(vCoverageMask)));
                    RDTSC_END(BELateDepthTest, 0);

                    if (!_simd_movemask_ps(depthPassMask))
                    {
                        // need to call depth/stencil write for stencil write
                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                          &state.depthStencilState,
                                          work.triFlags.frontFacing,
                                          psContext.vZ,
                                          pDepthBuffer,
                                          depthPassMask,
                                          vCoverageMask,
                                          pStencilBuffer,
                                          stencilPassMask);
                        goto Endtile;
                    }
                }
                else
                {
                    // for early z, consolidate discards from shader
                    // into depthPassMask
                    depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
                }

                uint32_t statMask  = _simd_movemask_ps(depthPassMask);
                uint32_t statCount = _mm_popcnt_u32(statMask);
                UPDATE_STAT_BE(DepthPassCount, statCount);

                // output merger
                RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
#if USE_8x2_TILE_BACKEND
                OutputMerger8x2(pDC,
                                psContext,
                                psContext.pColorBuffer,
                                0,
                                &state.blendState,
                                state.pfnBlendFunc,
                                vCoverageMask,
                                depthPassMask,
                                state.psState.renderTargetMask,
                                useAlternateOffset,
                                workerId);
#else
                OutputMerger4x2(pDC,
                                psContext,
                                psContext.pColorBuffer,
                                0,
                                &state.blendState,
                                state.pfnBlendFunc,
                                vCoverageMask,
                                depthPassMask,
                                state.psState.renderTargetMask,
                                workerId,
                                workerId);
#endif

                // do final depth write after all pixel kills
                if (!state.psState.forceEarlyZ)
                {
                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
                                      &state.depthStencilState,
                                      work.triFlags.frontFacing,
                                      psContext.vZ,
                                      pDepthBuffer,
                                      depthPassMask,
                                      vCoverageMask,
                                      pStencilBuffer,
                                      stencilPassMask);
                }
                RDTSC_END(BEOutputMerger, 0);
            }

        Endtile:
            RDTSC_BEGIN(BEEndTile, pDC->drawId);

            work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
            {
                work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
            }

#if USE_8x2_TILE_BACKEND
            if (useAlternateOffset)
            {
                DWORD    rt;
                uint32_t rtMask = state.colorHottileEnable;
                while (_BitScanForward(&rt, rtMask))
                {
                    rtMask &= ~(1 << rt);
                    psContext.pColorBuffer[rt] +=
                        (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                }
            }
#else
            DWORD rt;
            uint32_t rtMask = state.colorHottileEnable;
            while (_BitScanForward(&rt, rtMask))
            {
                rtMask &= ~(1 << rt);
                psContext.pColorBuffer[rt] +=
                    (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
            }
#endif
            pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
            pStencilBuffer +=
                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;

            RDTSC_END(BEEndTile, 0);

            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL, dx);
            psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
        }

        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL, dy);
        psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
    }
Example #9
0
//////////////////////////////////////////////////////////////////////////
/// @brief Prepares the hot tiles of one macrotile ahead of a draw.
///        For every enabled attachment (color render targets, depth,
///        stencil) the hot tile is fetched from the hot tile manager. A
///        tile in the INVALID state is loaded from the surface through
///        pfnLoadTile, a tile in the CLEAR state is cleared, and in both
///        cases the tile is then marked DIRTY. Tiles in any other state
///        are left untouched.
///        This is done in the outer thread loop rather than inside the
///        draw routine itself, mainly for performance, to avoid repeating
///        the setup for every triangle.
/// @param pContext - SWR context; supplies the hot tile manager and pfnLoadTile.
/// @param pDC - draw context whose API state selects the attachments to prepare.
/// @param macroID - macrotile id, converted to the x/y values handed to pfnLoadTile.
/// @param pWork - not referenced by this function.
/// @todo support deferred clear
INLINE
void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork)
{
    const API_STATE& state = GetApiState(pDC);
    HotTileMgr* const pTileMgr = pContext->pHotTileMgr;

    // Macrotile indices scaled by the macrotile dimensions.
    uint32_t tileX, tileY;
    MacroTileMgr::getTileIndices(macroID, tileX, tileY);
    tileX *= KNOB_MACROTILE_X_DIM;
    tileY *= KNOB_MACROTILE_Y_DIM;

    const uint32_t sampleCount = GetNumSamples(state.rastState.sampleCount);

    // Color render targets: only relevant when a pixel shader is bound.
    if (state.psState.pfnPixelShader != nullptr)
    {
        const uint32_t rtCount = state.psState.maxRTSlotUsed + 1;

        for (uint32_t slot = 0; slot < rtCount; ++slot)
        {
            const SWR_RENDERTARGET_ATTACHMENT attachment =
                static_cast<SWR_RENDERTARGET_ATTACHMENT>(SWR_ATTACHMENT_COLOR0 + slot);

            HOTTILE* pTile =
                pTileMgr->GetHotTile(pContext, pDC, macroID, attachment, true, sampleCount);

            switch (pTile->state)
            {
            case HOTTILE_INVALID:
            {
                // An invalid tile must be filled from the surface before it can be drawn to.
                RDTSC_START(BELoadTiles);
                pContext->pfnLoadTile(GetPrivateState(pDC),
                                      KNOB_COLOR_HOT_TILE_FORMAT,
                                      attachment,
                                      tileX,
                                      tileY,
                                      pTile->renderTargetArrayIndex,
                                      pTile->pBuffer);
                pTile->state = HOTTILE_DIRTY;
                RDTSC_STOP(BELoadTiles, 0, 0);
                break;
            }
            case HOTTILE_CLEAR:
            {
                // A pending clear is applied now.
                RDTSC_START(BELoadTiles);
                ClearColorHotTile(pTile);
                pTile->state = HOTTILE_DIRTY;
                RDTSC_STOP(BELoadTiles, 0, 0);
                break;
            }
            default:
                break;
            }
        }
    }

    // Depth: needed when depth is either tested or written.
    if (state.depthStencilState.depthTestEnable || state.depthStencilState.depthWriteEnable)
    {
        HOTTILE* pTile =
            pTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, sampleCount);

        switch (pTile->state)
        {
        case HOTTILE_INVALID:
        {
            // An invalid tile must be filled from the surface before it can be drawn to.
            RDTSC_START(BELoadTiles);
            pContext->pfnLoadTile(GetPrivateState(pDC),
                                  KNOB_DEPTH_HOT_TILE_FORMAT,
                                  SWR_ATTACHMENT_DEPTH,
                                  tileX,
                                  tileY,
                                  pTile->renderTargetArrayIndex,
                                  pTile->pBuffer);
            pTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
            break;
        }
        case HOTTILE_CLEAR:
        {
            // A pending clear is applied now.
            RDTSC_START(BELoadTiles);
            ClearDepthHotTile(pTile);
            pTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
            break;
        }
        default:
            break;
        }
    }

    // Stencil: needed when stencil is either tested or written.
    if (state.depthStencilState.stencilTestEnable || state.depthStencilState.stencilWriteEnable)
    {
        HOTTILE* pTile =
            pTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, sampleCount);

        switch (pTile->state)
        {
        case HOTTILE_INVALID:
        {
            // An invalid tile must be filled from the surface before it can be drawn to.
            RDTSC_START(BELoadTiles);
            pContext->pfnLoadTile(GetPrivateState(pDC),
                                  KNOB_STENCIL_HOT_TILE_FORMAT,
                                  SWR_ATTACHMENT_STENCIL,
                                  tileX,
                                  tileY,
                                  pTile->renderTargetArrayIndex,
                                  pTile->pBuffer);
            pTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
            break;
        }
        case HOTTILE_CLEAR:
        {
            // A pending clear is applied now.
            RDTSC_START(BELoadTiles);
            ClearStencilHotTile(pTile);
            pTile->state = HOTTILE_DIRTY;
            RDTSC_STOP(BELoadTiles, 0, 0);
            break;
        }
        default:
            break;
        }
    }
}
Example #10
0
void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(BESampleRateBackend, pDC->drawId);
    AR_BEGIN(BESetup, pDC->drawId);

    const API_STATE &state = GetApiState(pDC);

    BarycentricCoeffs coeffs;
    SetupBarycentricCoeffs(&coeffs, work);

    SWR_PS_CONTEXT psContext;
    const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
    SetupPixelShaderContext<T>(&psContext, samplePos, work);

    uint8_t *pDepthBuffer, *pStencilBuffer;
    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);

    AR_END(BESetup, 0);

    psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
    psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));

    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));

    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
    {
        psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
        psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));

        const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));

        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
        {
#if USE_8x2_TILE_BACKEND
            const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
#endif
            if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
            {
                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];

                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
            }

            AR_BEGIN(BEBarycentric, pDC->drawId);

            CalcPixelBarycentrics(coeffs, psContext);

            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);

            AR_END(BEBarycentric, 0);

            for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++)
            {
                simdmask coverageMask = work.coverageMask[sample] & MASK;

                if (coverageMask)
                {
                    // offset depth/stencil buffers current sample
                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);

                    if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                    {
                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");

                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));

                        const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                        const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;

                        coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz);
                    }

                    AR_BEGIN(BEBarycentric, pDC->drawId);

                    // calculate per sample positions
                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample));
                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample));

                    CalcSampleBarycentrics(coeffs, psContext);

                    // interpolate and quantize z
                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);

                    AR_END(BEBarycentric, 0);

                    // interpolate user clip distance if available
                    if (state.backendState.clipDistanceMask)
                    {
                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
                    }

                    simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
                    simdscalar depthPassMask = vCoverageMask;
                    simdscalar stencilPassMask = vCoverageMask;

                    // Early-Z?
                    if (T::bCanEarlyZ)
                    {
                        AR_BEGIN(BEEarlyDepthTest, pDC->drawId);
                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                            psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                        AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
                        AR_END(BEEarlyDepthTest, 0);

                        // early-exit if no samples passed depth or earlyZ is forced on.
                        if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
                        {
                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);

                            if (!_simd_movemask_ps(depthPassMask))
                            {
                                work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
                                continue;
                            }
                        }
                    }

                    psContext.sampleIndex = sample;
                    psContext.activeMask = _simd_castps_si(vCoverageMask);

                    // execute pixel shader
                    AR_BEGIN(BEPixelShader, pDC->drawId);
                    UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask)));
                    state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
                    AR_END(BEPixelShader, 0);

                    vCoverageMask = _simd_castsi_ps(psContext.activeMask);

                    // late-Z
                    if (!T::bCanEarlyZ)
                    {
                        AR_BEGIN(BELateDepthTest, pDC->drawId);
                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
                            psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                        AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
                        AR_END(BELateDepthTest, 0);

                        if (!_simd_movemask_ps(depthPassMask))
                        {
                            // need to call depth/stencil write for stencil write
                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);

                            work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
                            continue;
                        }
                    }

                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
                    uint32_t statCount = _mm_popcnt_u32(statMask);
                    UPDATE_STAT_BE(DepthPassCount, statCount);

                    // output merger
                    AR_BEGIN(BEOutputMerger, pDC->drawId);
#if USE_8x2_TILE_BACKEND
                    OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset);
#else
                    OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask);
#endif

                    // do final depth write after all pixel kills
                    if (!state.psState.forceEarlyZ)
                    {
                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                            pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
                    }
                    AR_END(BEOutputMerger, 0);
                }