////////////////////////////////////////////////////////////////////////// /// @brief Called when FE work is complete for this DC. INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC) { if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStatsFE) { SWR_STATS_FE& stats = pDC->dynState.statsFE; AR_EVENT(FrontendStatsEvent(pDC->drawId, stats.IaVertices, stats.IaPrimitives, stats.VsInvocations, stats.HsInvocations, stats.DsInvocations, stats.GsInvocations, stats.GsPrimitives, stats.CInvocations, stats.CPrimitives, stats.SoPrimStorageNeeded[0], stats.SoPrimStorageNeeded[1], stats.SoPrimStorageNeeded[2], stats.SoPrimStorageNeeded[3], stats.SoNumPrimsWritten[0], stats.SoNumPrimsWritten[1], stats.SoNumPrimsWritten[2], stats.SoNumPrimsWritten[3] )); AR_EVENT(FrontendDrawEndEvent(pDC->drawId)); pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats); } if (pContext->pfnUpdateSoWriteOffset) { for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i) { if ((pDC->dynState.SoWriteOffsetDirty[i]) && (pDC->pState->state.soBuffer[i].soWriteEnable)) { pContext->pfnUpdateSoWriteOffset(GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]); } } } // Ensure all streaming writes are globally visible before marking this FE done _mm_mfence(); pDC->doneFE = true; InterlockedDecrement(&pContext->drawsOutstandingFE); }
////////////////////////////////////////////////////////////////////////// /// @brief Called when FE work is complete for this DC. INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) { _ReadWriteBarrier(); if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStats) { pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &pDC->dynState.statsFE); } if (pContext->pfnUpdateSoWriteOffset) { for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i) { if ((pDC->dynState.SoWriteOffsetDirty[i]) && (pDC->pState->state.soBuffer[i].soWriteEnable)) { pContext->pfnUpdateSoWriteOffset(GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]); } } } pDC->doneFE = true; InterlockedDecrement((volatile LONG*)&pContext->drawsOutstandingFE); }
void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc, SWR_RENDERTARGET_ATTACHMENT attachment) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(BEStoreTiles, pDC->drawId); SWR_FORMAT srcFormat; switch (attachment) { case SWR_ATTACHMENT_COLOR0: case SWR_ATTACHMENT_COLOR1: case SWR_ATTACHMENT_COLOR2: case SWR_ATTACHMENT_COLOR3: case SWR_ATTACHMENT_COLOR4: case SWR_ATTACHMENT_COLOR5: case SWR_ATTACHMENT_COLOR6: case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break; case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break; default: SWR_INVALID("Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; } uint32_t x, y; MacroTileMgr::getTileIndices(macroTile, x, y); // Only need to store the hottile if it's been rendered to... HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false); if (pHotTile) { // clear if clear is pending (i.e., not rendered to), then mark as dirty for store. if (pHotTile->state == HOTTILE_CLEAR) { PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat]; SWR_ASSERT(pfnClearTiles != nullptr); pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect); } if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY) { int32_t destX = KNOB_MACROTILE_X_DIM * x; int32_t destY = KNOB_MACROTILE_Y_DIM * y; pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat, attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); } if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED) { if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY && pHotTile->state == HOTTILE_RESOLVED)) { pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState; } } } AR_END(BEStoreTiles, 1); }
////////////////////////////////////////////////////////////////////////// /// @brief Update client stats. INLINE void UpdateClientStats(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) { if ((pContext->pfnUpdateStats == nullptr) || (GetApiState(pDC).enableStats == false)) { return; } DRAW_DYNAMIC_STATE& dynState = pDC->dynState; SWR_STATS stats{ 0 }; // Sum up stats across all workers before sending to client. for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) { stats.DepthPassCount += dynState.pStats[i].DepthPassCount; stats.PsInvocations += dynState.pStats[i].PsInvocations; stats.CsInvocations += dynState.pStats[i].CsInvocations; } pContext->pfnUpdateStats(GetPrivateState(pDC), &stats); }
////////////////////////////////////////////////////////////////////////// /// @brief Process compute work. /// @param pDC - pointer to draw context (dispatch). /// @param workerId - The unique worker ID that is assigned to this thread. /// @param threadGroupId - the linear index for the thread group within the dispatch. void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(BEDispatch, pDC->drawId); const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData(); SWR_ASSERT(pTaskData != nullptr); // Ensure spill fill memory has been allocated. size_t spillFillSize = pDC->pState->state.totalSpillFillSize; if (spillFillSize && pSpillFillBuffer == nullptr) { pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES); } size_t scratchSpaceSize = pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances; if (scratchSpaceSize && pScratchSpace == nullptr) { pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD_BYTES); } const API_STATE& state = GetApiState(pDC); SWR_CS_CONTEXT csContext{ 0 }; csContext.tileCounter = threadGroupId; csContext.dispatchDims[0] = pTaskData->threadGroupCountX; csContext.dispatchDims[1] = pTaskData->threadGroupCountY; csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; csContext.pTGSM = pContext->ppScratch[workerId]; csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer; csContext.pScratchSpace = (uint8_t*)pScratchSpace; csContext.scratchSpacePerSimd = pDC->pState->state.scratchSpaceSize; state.pfnCsFunc(GetPrivateState(pDC), &csContext); UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup); AR_END(BEDispatch, 1); }
//////////////////////////////////////////////////////////////////////////
/// @brief Returns the hot tile for a macrotile/attachment pair, optionally
///        creating its backing store, growing it when more samples are
///        requested, and swapping the resident render-target array slice
///        (store current slice if dirty, then load the requested one).
/// @param pContext - context providing NUMA info and load/store callbacks.
/// @param pDC - draw context used to fetch the client private state.
/// @param hWorkerPrivateData - per-worker handle forwarded to the callbacks.
/// @param macroID - macrotile id, decoded into x/y hot-tile indices.
/// @param attachment - render target attachment being requested.
/// @param create - when true, allocate the tile if it has no buffer yet.
/// @param numSamples - sample count the tile must be able to hold.
/// @param renderTargetArrayIndex - array slice that must be resident.
/// @return pointer to the hot tile, or null when it has no buffer and
///         create is false.
HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, HANDLE hWorkerPrivateData, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples, uint32_t renderTargetArrayIndex)
{
    uint32_t x, y;
    MacroTileMgr::getTileIndices(macroID, x, y);

    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);

    HOTTILE& hotTile = mHotTiles[x][y].Attachment[attachment];

    // (Re)allocates the tile's backing store for numSamples samples on a NUMA
    // node derived from the tile indices, and resets the tile to invalid.
    auto allocBacking = [&]() {
        const uint32_t bytes    = numSamples * mHotTileSize[attachment];
        const uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);

        hotTile.pBuffer = (uint8_t*)AllocHotTileMem(
            bytes, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
        hotTile.state      = HOTTILE_INVALID;
        hotTile.numSamples = numSamples;
    };

    // No backing store yet: create it on request, otherwise report "no tile".
    if (hotTile.pBuffer == nullptr)
    {
        if (!create)
        {
            return nullptr;
        }

        allocBacking();
        hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
        return &hotTile;
    }

    // free the old tile and create a new one with enough space to hold all samples
    if (numSamples > hotTile.numSamples)
    {
        // tile should be either uninitialized or resolved if we're deleting and switching to a
        // new sample count
        SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || (hotTile.state == HOTTILE_RESOLVED) ||
                   (hotTile.state == HOTTILE_CLEAR));

        FreeHotTileMem(hotTile.pBuffer);
        allocBacking();
    }

    // Requested array slice already resident: nothing more to do.
    if (renderTargetArrayIndex == hotTile.renderTargetArrayIndex)
    {
        return &hotTile;
    }

    // if requested render target array index isn't currently loaded, need to store out the
    // current hottile and load the requested array slice
    SWR_FORMAT format;
    switch (attachment)
    {
    case SWR_ATTACHMENT_COLOR0:
    case SWR_ATTACHMENT_COLOR1:
    case SWR_ATTACHMENT_COLOR2:
    case SWR_ATTACHMENT_COLOR3:
    case SWR_ATTACHMENT_COLOR4:
    case SWR_ATTACHMENT_COLOR5:
    case SWR_ATTACHMENT_COLOR6:
    case SWR_ATTACHMENT_COLOR7:
        format = KNOB_COLOR_HOT_TILE_FORMAT;
        break;
    case SWR_ATTACHMENT_DEPTH:
        format = KNOB_DEPTH_HOT_TILE_FORMAT;
        break;
    case SWR_ATTACHMENT_STENCIL:
        format = KNOB_STENCIL_HOT_TILE_FORMAT;
        break;
    default:
        SWR_INVALID("Unknown attachment: %d", attachment);
        format = KNOB_COLOR_HOT_TILE_FORMAT;
        break;
    }

    // A pending clear must be applied before the old slice can be stored out.
    if (hotTile.state == HOTTILE_CLEAR)
    {
        switch (attachment)
        {
        case SWR_ATTACHMENT_STENCIL:
            ClearStencilHotTile(&hotTile);
            break;
        case SWR_ATTACHMENT_DEPTH:
            ClearDepthHotTile(&hotTile);
            break;
        default:
            ClearColorHotTile(&hotTile);
            break;
        }

        hotTile.state = HOTTILE_DIRTY;
    }

    const auto originX = x * KNOB_MACROTILE_X_DIM;
    const auto originY = y * KNOB_MACROTILE_Y_DIM;

    // Write back the currently resident slice if it holds unsaved data.
    if (hotTile.state == HOTTILE_DIRTY)
    {
        pContext->pfnStoreTile(GetPrivateState(pDC),
                               hWorkerPrivateData,
                               format,
                               attachment,
                               originX,
                               originY,
                               hotTile.renderTargetArrayIndex,
                               hotTile.pBuffer);
    }

    // Bring in the requested slice and make it the resident one.
    pContext->pfnLoadTile(GetPrivateState(pDC),
                          hWorkerPrivateData,
                          format,
                          attachment,
                          originX,
                          originY,
                          renderTargetArrayIndex,
                          hotTile.pBuffer);

    hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
    hotTile.state                  = HOTTILE_DIRTY;

    return &hotTile;
}
////////////////////////////////////////////////////////////////////////// /// @brief InitializeHotTiles /// for draw calls, we initialize the active hot tiles and perform deferred /// load on them if tile is in invalid state. we do this in the outer thread /// loop instead of inside the draw routine itself mainly for performance, /// to avoid unnecessary setup every triangle /// @todo support deferred clear /// @param pCreateInfo - pointer to creation info. void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroID) { const API_STATE& state = GetApiState(pDC); HANDLE hWorkerPrivateData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; uint32_t x, y; MacroTileMgr::getTileIndices(macroID, x, y); x *= KNOB_MACROTILE_X_DIM; y *= KNOB_MACROTILE_Y_DIM; uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); // check RT if enabled unsigned long rtSlot = 0; uint32_t colorHottileEnableMask = state.colorHottileEnable; while (_BitScanForward(&rtSlot, colorHottileEnableMask)) { HOTTILE* pHotTile = GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_BEGIN(BELoadTiles, pDC->drawId); // invalid hottile before draw requires a load from surface before we can draw to it pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_END(BELoadTiles, 0); } else if (pHotTile->state == HOTTILE_CLEAR) { RDTSC_BEGIN(BELoadTiles, pDC->drawId); // Clear the tile. 
ClearColorHotTile(pHotTile); pHotTile->state = HOTTILE_DIRTY; RDTSC_END(BELoadTiles, 0); } colorHottileEnableMask &= ~(1 << rtSlot); } // check depth if enabled if (state.depthHottileEnable) { HOTTILE* pHotTile = GetHotTile( pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_BEGIN(BELoadTiles, pDC->drawId); // invalid hottile before draw requires a load from surface before we can draw to it pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_END(BELoadTiles, 0); } else if (pHotTile->state == HOTTILE_CLEAR) { RDTSC_BEGIN(BELoadTiles, pDC->drawId); // Clear the tile. ClearDepthHotTile(pHotTile); pHotTile->state = HOTTILE_DIRTY; RDTSC_END(BELoadTiles, 0); } } // check stencil if enabled if (state.stencilHottileEnable) { HOTTILE* pHotTile = GetHotTile( pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_BEGIN(BELoadTiles, pDC->drawId); // invalid hottile before draw requires a load from surface before we can draw to it pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_END(BELoadTiles, 0); } else if (pHotTile->state == HOTTILE_CLEAR) { RDTSC_BEGIN(BELoadTiles, pDC->drawId); // Clear the tile. ClearStencilHotTile(pHotTile); pHotTile->state = HOTTILE_DIRTY; RDTSC_END(BELoadTiles, 0); } } }
void BackendSingleSample(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC& work, RenderOutputBuffers& renderBuffers) { RDTSC_BEGIN(BESingleSampleBackend, pDC->drawId); RDTSC_BEGIN(BESetup, pDC->drawId); void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; const API_STATE& state = GetApiState(pDC); BarycentricCoeffs coeffs; SetupBarycentricCoeffs(&coeffs, work); SWR_PS_CONTEXT psContext; const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; SetupPixelShaderContext<T>(&psContext, samplePos, work); uint8_t *pDepthBuffer, *pStencilBuffer; SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers); RDTSC_END(BESetup, 1); psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { #if USE_8x2_TILE_BACKEND const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); #endif simdmask coverageMask = work.coverageMask[0] & MASK; if (coverageMask) { if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) { static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); const simdscalar z = _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer)); const float minz = state.depthBoundsState.depthBoundsTestMinValue; const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; coverageMask &= 
CalcDepthBoundsAcceptMask(z, minz, maxz); } if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) { const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0]; generateInputCoverage<T, T::InputCoverage>( pCoverageMask, psContext.inputMask, state.blendState.sampleMask); } RDTSC_BEGIN(BEBarycentric, pDC->drawId); CalcPixelBarycentrics(coeffs, psContext); CalcCentroid<T, true>( &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); // interpolate and quantize z psContext.vZ = vplaneps( coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_END(BEBarycentric, 1); // interpolate user clip distance if available if (state.backendState.clipDistanceMask) { coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center); } simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); simdscalar depthPassMask = vCoverageMask; simdscalar stencilPassMask = vCoverageMask; // Early-Z? 
if (T::bCanEarlyZ) { RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId); depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask); AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); RDTSC_END(BEEarlyDepthTest, 0); // early-exit if no pixels passed depth or earlyZ is forced on if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) { DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); if (!_simd_movemask_ps(depthPassMask)) { goto Endtile; } } } psContext.sampleIndex = 0; psContext.activeMask = _simd_castps_si(vCoverageMask); // execute pixel shader RDTSC_BEGIN(BEPixelShader, pDC->drawId); state.psState.pfnPixelShader(GetPrivateState(pDC), pWorkerData, &psContext); RDTSC_END(BEPixelShader, 0); // update stats UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); AR_EVENT(PSStats(psContext.stats.numInstExecuted)); vCoverageMask = _simd_castsi_ps(psContext.activeMask); // late-Z if (!T::bCanEarlyZ) { RDTSC_BEGIN(BELateDepthTest, pDC->drawId); depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask); AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); RDTSC_END(BELateDepthTest, 0); if (!_simd_movemask_ps(depthPassMask)) { // need to call depth/stencil write for stencil write DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, 
stencilPassMask); goto Endtile; } } else { // for early z, consolidate discards from shader // into depthPassMask depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask); } uint32_t statMask = _simd_movemask_ps(depthPassMask); uint32_t statCount = _mm_popcnt_u32(statMask); UPDATE_STAT_BE(DepthPassCount, statCount); // output merger RDTSC_BEGIN(BEOutputMerger, pDC->drawId); #if USE_8x2_TILE_BACKEND OutputMerger8x2(pDC, psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset, workerId); #else OutputMerger4x2(pDC, psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, workerId, workerId); #endif // do final depth write after all pixel kills if (!state.psState.forceEarlyZ) { DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask); } RDTSC_END(BEOutputMerger, 0); } Endtile: RDTSC_BEGIN(BEEndTile, pDC->drawId); work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) { work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); } #if USE_8x2_TILE_BACKEND if (useAlternateOffset) { DWORD rt; uint32_t rtMask = state.colorHottileEnable; while (_BitScanForward(&rt, rtMask)) { rtMask &= ~(1 << rt); psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; } } #else DWORD rt; uint32_t rtMask = state.colorHottileEnable; while (_BitScanForward(&rt, rtMask)) { rtMask &= ~(1 << rt); psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; } #endif pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; pStencilBuffer += (KNOB_SIMD_WIDTH * 
FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; RDTSC_END(BEEndTile, 0); psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx); psContext.vX.center = _simd_add_ps(psContext.vX.center, dx); } psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy); psContext.vY.center = _simd_add_ps(psContext.vY.center, dy); }
// for draw calls, we initialize the active hot tiles and perform deferred // load on them if tile is in invalid state. we do this in the outer thread loop instead of inside // the draw routine itself mainly for performance, to avoid unnecessary setup // every triangle // @todo support deferred clear INLINE void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork) { const API_STATE& state = GetApiState(pDC); HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; const SWR_PS_STATE& psState = state.psState; uint32_t numRTs = psState.maxRTSlotUsed + 1; uint32_t x, y; MacroTileMgr::getTileIndices(macroID, x, y); x *= KNOB_MACROTILE_X_DIM; y *= KNOB_MACROTILE_Y_DIM; uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); // check RT if enabled if (state.psState.pfnPixelShader != nullptr) { for (uint32_t rt = 0; rt < numRTs; ++rt) { HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_START(BELoadTiles); // invalid hottile before draw requires a load from surface before we can draw to it pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rt), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } else if (pHotTile->state == HOTTILE_CLEAR) { RDTSC_START(BELoadTiles); // Clear the tile. 
ClearColorHotTile(pHotTile); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } } } // check depth if enabled if (state.depthStencilState.depthTestEnable || state.depthStencilState.depthWriteEnable) { HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_START(BELoadTiles); // invalid hottile before draw requires a load from surface before we can draw to it pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } else if (pHotTile->state == HOTTILE_CLEAR) { RDTSC_START(BELoadTiles); // Clear the tile. ClearDepthHotTile(pHotTile); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } } // check stencil if enabled if (state.depthStencilState.stencilTestEnable || state.depthStencilState.stencilWriteEnable) { HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); if (pHotTile->state == HOTTILE_INVALID) { RDTSC_START(BELoadTiles); // invalid hottile before draw requires a load from surface before we can draw to it pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } else if (pHotTile->state == HOTTILE_CLEAR) { RDTSC_START(BELoadTiles); // Clear the tile. ClearStencilHotTile(pHotTile); pHotTile->state = HOTTILE_DIRTY; RDTSC_STOP(BELoadTiles, 0, 0); } } }
void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(BESampleRateBackend, pDC->drawId); AR_BEGIN(BESetup, pDC->drawId); const API_STATE &state = GetApiState(pDC); BarycentricCoeffs coeffs; SetupBarycentricCoeffs(&coeffs, work); SWR_PS_CONTEXT psContext; const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions; SetupPixelShaderContext<T>(&psContext, samplePos, work); uint8_t *pDepthBuffer, *pStencilBuffer; SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers); AR_END(BESetup, 0); psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y))); psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y))); const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM)); for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x))); psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x))); const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM)); for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { #if USE_8x2_TILE_BACKEND const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0); #endif if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) { const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? 
&work.innerCoverageMask : &work.coverageMask[0]; generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask); } AR_BEGIN(BEBarycentric, pDC->drawId); CalcPixelBarycentrics(coeffs, psContext); CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask); AR_END(BEBarycentric, 0); for (uint32_t sample = 0; sample < T::MultisampleT::numSamples; sample++) { simdmask coverageMask = work.coverageMask[sample] & MASK; if (coverageMask) { // offset depth/stencil buffers current sample uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample); uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample); if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable) { static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample)); const float minz = state.depthBoundsState.depthBoundsTestMinValue; const float maxz = state.depthBoundsState.depthBoundsTestMaxValue; coverageMask &= CalcDepthBoundsAcceptMask(z, minz, maxz); } AR_BEGIN(BEBarycentric, pDC->drawId); // calculate per sample positions psContext.vX.sample = _simd_add_ps(psContext.vX.UL, samplePos.vX(sample)); psContext.vY.sample = _simd_add_ps(psContext.vY.UL, samplePos.vY(sample)); CalcSampleBarycentrics(coeffs, psContext); // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); AR_END(BEBarycentric, 0); // interpolate user clip distance if available if (state.backendState.clipDistanceMask) { coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample); } simdscalar vCoverageMask = _simd_vmask_ps(coverageMask); simdscalar depthPassMask = vCoverageMask; simdscalar stencilPassMask 
= vCoverageMask; // Early-Z? if (T::bCanEarlyZ) { AR_BEGIN(BEEarlyDepthTest, pDC->drawId); depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); AR_END(BEEarlyDepthTest, 0); // early-exit if no samples passed depth or earlyZ is forced on. if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask)) { DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); if (!_simd_movemask_ps(depthPassMask)) { work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); continue; } } } psContext.sampleIndex = sample; psContext.activeMask = _simd_castps_si(vCoverageMask); // execute pixel shader AR_BEGIN(BEPixelShader, pDC->drawId); UPDATE_STAT_BE(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); AR_END(BEPixelShader, 0); vCoverageMask = _simd_castsi_ps(psContext.activeMask); // late-Z if (!T::bCanEarlyZ) { AR_BEGIN(BELateDepthTest, pDC->drawId); depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask))); AR_END(BELateDepthTest, 0); if (!_simd_movemask_ps(depthPassMask)) { // need to call depth/stencil write for stencil write DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); 
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); continue; } } uint32_t statMask = _simd_movemask_ps(depthPassMask); uint32_t statCount = _mm_popcnt_u32(statMask); UPDATE_STAT_BE(DepthPassCount, statCount); // output merger AR_BEGIN(BEOutputMerger, pDC->drawId); #if USE_8x2_TILE_BACKEND OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset); #else OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask); #endif // do final depth write after all pixel kills if (!state.psState.forceEarlyZ) { DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); } AR_END(BEOutputMerger, 0); }