HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples) { uint32_t x, y; MacroTileMgr::getTileIndices(macroID, x, y); SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); HotTileSet& tile = mHotTiles[x][y]; HOTTILE& hotTile = tile.Attachment[attachment]; if (hotTile.pBuffer == NULL) { if (create) { uint32_t size = numSamples * mHotTileSize[attachment]; hotTile.pBuffer = (uint8_t*)AlignedMalloc(size, 64); hotTile.state = HOTTILE_INVALID; hotTile.numSamples = numSamples; hotTile.renderTargetArrayIndex = 0; } else { return NULL; } } return &hotTile; }
////////////////////////////////////////////////////////////////////////// /// @brief Writes clear color to every pixel of a render surface /// @param pDstSurface - Pointer to the destination surface state /// @param renderTargetIndex - Index of the destination render target /// @param x, y - Coordinates of the raster tile. /// @param pClearColor - Pointer to clear color void StoreHotTileClear( SWR_SURFACE_STATE *pDstSurface, SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, UINT x, UINT y, const float* pClearColor) { PFN_STORE_TILES_CLEAR pfnStoreTilesClear = NULL; SWR_ASSERT(renderTargetIndex != SWR_ATTACHMENT_STENCIL); ///@todo Not supported yet. if (renderTargetIndex != SWR_ATTACHMENT_DEPTH) { pfnStoreTilesClear = sStoreTilesClearColorTable[pDstSurface->format]; } else { pfnStoreTilesClear = sStoreTilesClearDepthTable[pDstSurface->format]; } SWR_ASSERT(pfnStoreTilesClear != NULL); // Store a macro tile. /// @todo Once all formats are supported, this if check can go away. It is here to help us make progress in the near term. if (pfnStoreTilesClear != NULL) { pfnStoreTilesClear(pClearColor, pDstSurface, x, y); } }
void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK* pWork) { // Should not enqueue more than we have backing for in the hot tile manager. SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); if ((x & ~(KNOB_NUM_HOT_TILES_X - 1)) | (y & ~(KNOB_NUM_HOT_TILES_Y - 1))) { return; } uint32_t id = getTileId(x, y); if (id >= mTiles.size()) { mTiles.resize((16 + id) * 2); } MacroTileQueue* pTile = mTiles[id]; if (!pTile) { pTile = mTiles[id] = new MacroTileQueue(); } pTile->mWorkItemsFE++; pTile->mId = id; if (pTile->mWorkItemsFE == 1) { pTile->clear(mArena); mDirtyTiles.push_back(pTile); } mWorkItemsProduced++; pTile->enqueue_try_nosync(mArena, pWork); }
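// NOTE (editor sketch): enqueue() converts (x, y) macrotile coordinates into a linear id via
// getTileId(), and the hot-tile paths recover them via getTileIndices(). The exact bit layout is
// an assumption here, not taken from the headers shown in this excerpt; a typical packing is:
static inline uint32_t getTileId(uint32_t x, uint32_t y)
{
    return (y << 16) | x;            // y in the high half-word, x in the low half-word
}

static inline void getTileIndices(uint32_t tileId, uint32_t& x, uint32_t& y)
{
    y = tileId >> 16;                // undo the packing above
    x = tileId & 0xFFFF;
}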
API_STATE* GetDrawState(SWR_CONTEXT *pContext) { DRAW_CONTEXT* pDC = GetDrawContext(pContext); SWR_ASSERT(pDC->pState != nullptr); return &pDC->pState->state; }
// inlined-only version INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC) { int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone)); SWR_ASSERT(result >= 0); AR_FLUSH(pDC->drawId); if (result == 0) { ExecuteCallbacks(pContext, workerId, pDC); // Cleanup memory allocations pDC->pArena->Reset(true); if (!pDC->isCompute) { pDC->pTileMgr->initialize(); } if (pDC->cleanupState) { pDC->pState->pArena->Reset(true); } _ReadWriteBarrier(); pContext->dcRing.Dequeue(); // Remove from tail } return result; }
INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) { int64_t result = InterlockedDecrement64(&pDC->threadsDone); SWR_ASSERT(result >= 0); if (result == 0) { // Cleanup memory allocations pDC->pArena->Reset(true); if (!pDC->isCompute) { pDC->pTileMgr->initialize(); } if (pDC->cleanupState) { pDC->pState->pArena->Reset(true); } _ReadWriteBarrier(); pContext->dcRing.Dequeue(); // Remove from tail } return result; }
void MacroTileMgr::markTileComplete(uint32_t id) { SWR_ASSERT(mTiles.size() > id); MacroTileQueue& tile = *mTiles[id]; uint32_t numTiles = tile.mWorkItemsFE; InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles); _ReadWriteBarrier(); tile.mWorkItemsBE += numTiles; SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE); // clear out tile, but defer fifo clear until the next DC first queues to it. // this prevents worker threads from constantly locking a completed macro tile tile.mWorkItemsFE = 0; tile.mWorkItemsBE = 0; }
void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc, SWR_RENDERTARGET_ATTACHMENT attachment) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(BEStoreTiles, pDC->drawId); SWR_FORMAT srcFormat; switch (attachment) { case SWR_ATTACHMENT_COLOR0: case SWR_ATTACHMENT_COLOR1: case SWR_ATTACHMENT_COLOR2: case SWR_ATTACHMENT_COLOR3: case SWR_ATTACHMENT_COLOR4: case SWR_ATTACHMENT_COLOR5: case SWR_ATTACHMENT_COLOR6: case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break; case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break; default: SWR_INVALID("Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; } uint32_t x, y; MacroTileMgr::getTileIndices(macroTile, x, y); // Only need to store the hottile if it's been rendered to... HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false); if (pHotTile) { // clear if clear is pending (i.e., not rendered to), then mark as dirty for store. if (pHotTile->state == HOTTILE_CLEAR) { PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat]; SWR_ASSERT(pfnClearTiles != nullptr); pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect); } if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY) { int32_t destX = KNOB_MACROTILE_X_DIM * x; int32_t destY = KNOB_MACROTILE_Y_DIM * y; pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat, attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); } if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED) { if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY && pHotTile->state == HOTTILE_RESOLVED)) { pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState; } } } AR_END(BEStoreTiles, 1); }
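// NOTE (editor sketch): these routines drive a small hot-tile state machine. The enumerator
// names appear throughout this code; the ordering and comments below are illustrative
// assumptions, not copied from the real header.
enum HOTTILE_STATE
{
    HOTTILE_INVALID,   // backing memory allocated, contents undefined
    HOTTILE_CLEAR,     // a clear is pending; clearData holds the clear value
    HOTTILE_DIRTY,     // rendered to; must be stored back to the destination surface
    HOTTILE_RESOLVED,  // already stored back; contents match the surface
};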
////////////////////////////////////////////////////////////////////////// /// @brief If there is any compute work then go work on it. /// @param pContext - pointer to SWR context. /// @param workerId - The unique worker ID that is assigned to this thread. /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread /// has its own curDrawBE counter and this ensures that each worker processes all the /// draws in order. void WorkOnCompute( SWR_CONTEXT *pContext, uint32_t workerId, volatile uint64_t& curDrawBE) { if (FindFirstIncompleteDraw(pContext, curDrawBE) == false) { return; } uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; if (pDC->isCompute == false) return; // check dependencies if (CheckDependency(pContext, pDC, lastRetiredDraw)) { return; } SWR_ASSERT(pDC->pDispatch != nullptr); DispatchQueue& queue = *pDC->pDispatch; // Is there any work remaining? if (queue.getNumQueued() > 0) { bool lastToComplete = false; uint32_t threadGroupId = 0; while (queue.getWork(threadGroupId)) { ProcessComputeBE(pDC, workerId, threadGroupId); lastToComplete = queue.finishedWork(); } _ReadWriteBarrier(); if (lastToComplete) { SWR_ASSERT(queue.isWorkComplete() == true); pDC->doneCompute = true; } } }
void SwrSetBlendFunc( HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc) { SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS); API_STATE *pState = GetDrawState(GetContext(hContext)); pState->pfnBlendFunc[renderTarget] = pfnBlendFunc; }
void SwrSetSoBuffers( HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot) { API_STATE* pState = GetDrawState(GetContext(hContext)); SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot); pState->soBuffer[slot] = *pSoBuffer; }
void SwrSetScissorRects( HANDLE hContext, uint32_t numScissors, const BBOX* pScissors) { SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of scissor rects."); API_STATE* pState = GetDrawState(GetContext(hContext)); memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX)); }
void SwrSetSoFunc( HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex) { API_STATE* pState = GetDrawState(GetContext(hContext)); SWR_ASSERT(streamIndex < MAX_SO_STREAMS); pState->pfnSoFunc[streamIndex] = pfnSoFunc; }
void SwrSetViewports( HANDLE hContext, uint32_t numViewports, const SWR_VIEWPORT* pViewports, const SWR_VIEWPORT_MATRIX* pMatrices) { SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of viewports."); SWR_CONTEXT *pContext = GetContext(hContext); API_STATE* pState = GetDrawState(pContext); memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports); if (pMatrices != nullptr) { memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports); } else { // Compute default viewport transform. for (uint32_t i = 0; i < numViewports; ++i) { if (pContext->driverType == DX) { pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f; pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f; pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ; pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11; pState->vpMatrix[i].m32 = pState->vp[i].minZ; } else { // Standard, with the exception that Y is inverted. pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f; pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f; pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f; pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11; pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22; // Now that the matrix is calculated, clip the view coords to screen size. // OpenGL allows for -ve x,y in the viewport. pState->vp[i].x = std::max(pState->vp[i].x, 0.0f); pState->vp[i].y = std::max(pState->vp[i].y, 0.0f); } } } updateGuardband(pState); }
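// NOTE (editor sketch): a self-contained check of the DX-style viewport matrix computed above.
// The viewport numbers are arbitrary; only the arithmetic mirrors SwrSetViewports().
#include <cstdio>

int main()
{
    const float vpX = 0.0f, vpY = 0.0f, width = 1920.0f, height = 1080.0f;
    const float minZ = 0.0f, maxZ = 1.0f;

    const float m00 =  width  / 2.0f;   // 960
    const float m11 = -height / 2.0f;   // -540 (Y is flipped for DX)
    const float m22 =  maxZ - minZ;     // 1
    const float m30 =  vpX + m00;       // 960
    const float m31 =  vpY - m11;       // 540
    const float m32 =  minZ;            // 0

    // An NDC point at the origin with z = 0.5 lands at the viewport center, half depth.
    const float xNdc = 0.0f, yNdc = 0.0f, zNdc = 0.5f;
    printf("screen = (%.1f, %.1f, %.2f)\n",
           xNdc * m00 + m30, yNdc * m11 + m31, zNdc * m22 + m32);   // (960.0, 540.0, 0.50)
    return 0;
}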
inline int inside(const float v[4]) { switch (ClippingPlane) { case FRUSTUM_LEFT : return (v[0]>=-v[3]); case FRUSTUM_RIGHT : return (v[0]<= v[3]); case FRUSTUM_TOP : return (v[1]>=-v[3]); case FRUSTUM_BOTTOM : return (v[1]<= v[3]); case FRUSTUM_NEAR : return (v[2]>=0.0f); case FRUSTUM_FAR : return (v[2]<= v[3]); default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); return 0; } }
inline void intersect( int s, // index of the first edge vertex (v1) in pInPts. int p, // index of the second edge vertex (v2) in pInPts. const float *pInPts, // array of all the input positions. const float *pInAttribs, // array of all attributes for all vertices. The attributes for each vertex are contiguous. int numInAttribs, // number of attributes per vertex. int i, // output index. float *pOutPts, // array of output positions. We'll write our new intersection point at i*4. float *pOutAttribs) // array of output attributes. We'll write our new attributes at i*numInAttribs. { float t; // Find the parameter of the intersection. // t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc. const float *v1 = &pInPts[s*4]; const float *v2 = &pInPts[p*4]; switch (ClippingPlane) { case FRUSTUM_LEFT: t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]); break; case FRUSTUM_RIGHT: t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]); break; case FRUSTUM_TOP: t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]); break; case FRUSTUM_BOTTOM: t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]); break; case FRUSTUM_NEAR: t = ComputeInterpFactor(v1[2], v2[2]); break; case FRUSTUM_FAR: t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]); break; default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); } const float *a1 = &pInAttribs[s*numInAttribs]; const float *a2 = &pInAttribs[p*numInAttribs]; float *pOutP = &pOutPts[i*4]; float *pOutA = &pOutAttribs[i*numInAttribs]; // Interpolate new position. for(int j = 0; j < 4; ++j) { pOutP[j] = v1[j] + (v2[j]-v1[j])*t; } // Interpolate attributes. for(int attr = 0; attr < numInAttribs; ++attr) { pOutA[attr] = a1[attr] + (a2[attr]-a1[attr])*t; } }
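// NOTE (editor sketch): intersect() relies on a ComputeInterpFactor() helper that is not part of
// this excerpt. Assuming it is the usual ratio of signed boundary distances, a minimal version is:
static inline float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
{
    // boundaryCoord0/1 are the signed distances of the edge endpoints to the clip plane;
    // the edge crosses the plane at t = d0 / (d0 - d1).
    return boundaryCoord0 / (boundaryCoord0 - boundaryCoord1);
}
// Example: d0 = +2 (inside) and d1 = -2 (outside) gives t = 2 / (2 - (-2)) = 0.5,
// i.e. the intersection lies exactly halfway along the edge.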
////////////////////////////////////////////////////////////////////////// /// @brief If there is any compute work then go work on it. /// @param pContext - pointer to SWR context. /// @param workerId - The unique worker ID that is assigned to this thread. /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread /// has its own curDrawBE counter and this ensures that each worker processes all the /// draws in order. void WorkOnCompute( SWR_CONTEXT *pContext, uint32_t workerId, uint32_t& curDrawBE) { uint32_t drawEnqueued = 0; if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false) { return; } uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1; for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i) { DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT]; if (pDC->isCompute == false) return; // check dependencies if (CheckDependency(pContext, pDC, lastRetiredDraw)) { return; } SWR_ASSERT(pDC->pDispatch != nullptr); DispatchQueue& queue = *pDC->pDispatch; // Is there any work remaining? if (queue.getNumQueued() > 0) { void* pSpillFillBuffer = nullptr; void* pScratchSpace = nullptr; uint32_t threadGroupId = 0; while (queue.getWork(threadGroupId)) { queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace); queue.finishedWork(); } // Ensure all streaming writes are globally visible before moving onto the next draw _mm_mfence(); } } }
void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs) { // temp storage to hold at least 6 sets of vertices, the max number that can be created during clipping OSALIGNSIMD(float) tempPts[6 * 4]; OSALIGNSIMD(float) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4]; // we opt to clip to viewport frustum to produce smaller triangles for rasterization precision int NumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs); NumOutPts = ClipTriToPlane<FRUSTUM_FAR>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); NumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs); NumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); NumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs); NumOutPts = ClipTriToPlane<FRUSTUM_TOP>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); SWR_ASSERT(NumOutPts <= 6); *numVerts = NumOutPts; return; }
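// NOTE (editor sketch): a hypothetical driver of Clip() with one triangle and one scalar
// attribute per vertex. Buffer sizes follow the 6-vertex bound asserted above; the vertex
// values are illustrative only.
void ClipExample()
{
    OSALIGNSIMD(float) tri[3 * 4] = {
        -0.5f, -0.5f, 0.1f, 1.0f,
         0.5f, -0.5f, 0.1f, 1.0f,
         0.0f,  2.0f, 0.1f, 1.0f,   // y > w, so this vertex lies outside the frustum and will be clipped
    };
    float attribs[3] = { 0.0f, 1.0f, 2.0f };

    OSALIGNSIMD(float) outVerts[6 * 4];
    float outAttribs[6];
    int numVerts = 0;

    Clip(tri, attribs, 1, outVerts, &numVerts, outAttribs);
    // numVerts is now at most 6; outVerts/outAttribs hold the clipped polygon fan.
}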
////////////////////////////////////////////////////////////////////////// /// @brief If there is any compute work then go work on it. /// @param pContext - pointer to SWR context. /// @param workerId - The unique worker ID that is assigned to this thread. /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread /// has its own curDrawBE counter and this ensures that each worker processes all the /// draws in order. void WorkOnCompute( SWR_CONTEXT *pContext, uint32_t workerId, uint64_t& curDrawBE) { uint64_t drawEnqueued = 0; if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false) { return; } uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; for (uint64_t i = curDrawBE; curDrawBE < drawEnqueued; ++i) { DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; if (pDC->isCompute == false) return; // check dependencies if (CheckDependency(pContext, pDC, lastRetiredDraw)) { return; } SWR_ASSERT(pDC->pDispatch != nullptr); DispatchQueue& queue = *pDC->pDispatch; // Is there any work remaining? if (queue.getNumQueued() > 0) { void* pSpillFillBuffer = nullptr; uint32_t threadGroupId = 0; while (queue.getWork(threadGroupId)) { ProcessComputeBE(pDC, workerId, threadGroupId, pSpillFillBuffer); queue.finishedWork(); } } } }
////////////////////////////////////////////////////////////////////////// /// @brief Process compute work. /// @param pDC - pointer to draw context (dispatch). /// @param workerId - The unique worker ID that is assigned to this thread. /// @param threadGroupId - the linear index for the thread group within the dispatch. void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(BEDispatch, pDC->drawId); const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData(); SWR_ASSERT(pTaskData != nullptr); // Ensure spill fill memory has been allocated. size_t spillFillSize = pDC->pState->state.totalSpillFillSize; if (spillFillSize && pSpillFillBuffer == nullptr) { pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES); } size_t scratchSpaceSize = pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances; if (scratchSpaceSize && pScratchSpace == nullptr) { pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD_BYTES); } const API_STATE& state = GetApiState(pDC); SWR_CS_CONTEXT csContext{ 0 }; csContext.tileCounter = threadGroupId; csContext.dispatchDims[0] = pTaskData->threadGroupCountX; csContext.dispatchDims[1] = pTaskData->threadGroupCountY; csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; csContext.pTGSM = pContext->ppScratch[workerId]; csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer; csContext.pScratchSpace = (uint8_t*)pScratchSpace; csContext.scratchSpacePerSimd = pDC->pState->state.scratchSpaceSize; state.pfnCsFunc(GetPrivateState(pDC), &csContext); UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup); AR_END(BEDispatch, 1); }
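// NOTE (editor sketch): csContext.tileCounter carries only the linear thread-group index; the
// compute-shader side has to unflatten it against dispatchDims. The helper name below is made up.
static inline void LinearToGroupXYZ(uint32_t linearId, const uint32_t dims[3],
                                    uint32_t& gx, uint32_t& gy, uint32_t& gz)
{
    const uint32_t groupsPerSlice = dims[0] * dims[1];
    gz = linearId / groupsPerSlice;
    const uint32_t rem = linearId % groupsPerSlice;
    gy = rem / dims[0];
    gx = rem % dims[0];
}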
bool enqueue_try_nosync(ArenaT& arena, const T* entry) { const float* pSrc = (const float*)entry; float* pDst = (float*)&mCurBlock[mTail]; auto lambda = [&](int32_t i) { __m256 vSrc = _mm256_load_ps(pSrc + i*KNOB_SIMD_WIDTH); _mm256_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc); }; const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4); static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T), "FIFO element size should be multiple of SIMD width."); UnrollerL<0, numSimdLines, 1>::step(lambda); mTail ++; if (mTail == mBlockSize) { if (++mCurBlockIdx < mBlocks.size()) { mCurBlock = mBlocks[mCurBlockIdx]; } else { T* newBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4); SWR_ASSERT(newBlock); mBlocks.push_back(newBlock); mCurBlock = newBlock; } mTail = 0; } mNumEntries ++; return true; }
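// NOTE (editor sketch): UnrollerL is not part of this excerpt. A minimal compile-time unroller
// consistent with the UnrollerL<0, numSimdLines, 1>::step(lambda) call above might look like
// this (the project's actual definition may differ); it assumes (End - Begin) is a multiple of Step.
template <int Begin, int End, int Step = 1>
struct UnrollerL
{
    template <typename Lambda>
    static void step(Lambda& func)
    {
        func(Begin);                                    // invoke the body for this index...
        UnrollerL<Begin + Step, End, Step>::step(func); // ...then recurse at compile time
    }
};

template <int End, int Step>
struct UnrollerL<End, End, Step>                        // terminator: Begin has reached End
{
    template <typename Lambda>
    static void step(Lambda&) {}
};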
DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) { RDTSC_START(APIGetDrawContext); // If the current draw context is null then we need to obtain a new draw context to use from the ring. if (pContext->pCurDrawContext == nullptr) { uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT; DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; pContext->pCurDrawContext = pCurDrawContext; // Update LastRetiredId UpdateLastRetiredId(pContext); // Need to wait until this draw context is available to use. while (StillDrawing(pContext, pCurDrawContext)) { // Make sure workers are working. WakeAllThreads(pContext); _mm_pause(); } // Assign next available entry in DS ring to this DC. uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT; pCurDrawContext->pState = &pContext->dsRing[dsIndex]; Arena& stateArena = pCurDrawContext->pState->arena; // Copy previous state to current state. if (pContext->pPrevDrawContext) { DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext; // If we're splitting our draw then we can just use the same state from the previous // draw. In this case, we won't increment the DS ring index so the next non-split // draw can receive the state. if (isSplitDraw == false) { CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState); stateArena.Reset(); // Reset memory. // Copy private state to new context. if (pPrevDrawContext->pState->pPrivateState != nullptr) { pCurDrawContext->pState->pPrivateState = stateArena.AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float)); memcpy(pCurDrawContext->pState->pPrivateState, pPrevDrawContext->pState->pPrivateState, pContext->privateStateSize); } pContext->curStateId++; // Progress state ring index forward. } else { // If it's a split draw then just copy the state pointer over // since it's the same draw. pCurDrawContext->pState = pPrevDrawContext->pState; } } else { stateArena.Reset(); // Reset memory. pContext->curStateId++; // Progress state ring index forward. } pCurDrawContext->dependency = 0; pCurDrawContext->arena.Reset(); pCurDrawContext->pContext = pContext; pCurDrawContext->isCompute = false; // Dispatch has to set this to true. pCurDrawContext->inUse = false; pCurDrawContext->doneCompute = false; pCurDrawContext->doneFE = false; pCurDrawContext->FeLock = 0; pCurDrawContext->pTileMgr->initialize(); // Assign unique drawId for this DC pCurDrawContext->drawId = pContext->nextDrawId++; } else { SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC"); } RDTSC_STOP(APIGetDrawContext, 0, 0); return pContext->pCurDrawContext; }
void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup) { out_nodes.clear(); out_numThreadsPerProcGroup = 0; #if defined(_WIN32) std::vector<KAFFINITY> threadMaskPerProcGroup; static std::mutex m; std::lock_guard<std::mutex> l(m); DWORD bufSize = 0; BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize); SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER); PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize); SWR_ASSERT(pBufferMem); ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize); SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information"); uint32_t count = bufSize / pBufferMem->Size; PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem; for (uint32_t i = 0; i < count; ++i) { SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore); for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g) { auto& gmask = pBuffer->Processor.GroupMask[g]; uint32_t threadId = 0; uint32_t procGroup = gmask.Group; Core* pCore = nullptr; uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask); while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask)) { // clear mask KAFFINITY threadMask = KAFFINITY(1) << threadId; gmask.Mask &= ~threadMask; if (procGroup >= threadMaskPerProcGroup.size()) { threadMaskPerProcGroup.resize(procGroup + 1); } if (threadMaskPerProcGroup[procGroup] & threadMask) { // Already seen this mask. This means that we are in 32-bit mode and // have seen more than 32 HW threads for this procGroup // Don't use it #if defined(_WIN64) SWR_INVALID("Shouldn't get here in 64-bit mode"); #endif continue; } threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId); // Find Numa Node uint32_t numaId = 0; PROCESSOR_NUMBER procNum = {}; procNum.Group = WORD(procGroup); procNum.Number = UCHAR(threadId); ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId); SWR_ASSERT(ret); // Store data if (out_nodes.size() <= numaId) { out_nodes.resize(numaId + 1); } auto& numaNode = out_nodes[numaId]; numaNode.numaId = numaId; uint32_t coreId = 0; if (nullptr == pCore) { numaNode.cores.push_back(Core()); pCore = &numaNode.cores.back(); pCore->procGroup = procGroup; } pCore->threadIds.push_back(threadId); if (procGroup == 0) { out_numThreadsPerProcGroup++; } } } pBuffer = PtrAdd(pBuffer, pBuffer->Size); } free(pBufferMem); #elif defined(__linux__) || defined (__gnu_linux__) // Parse /proc/cpuinfo to get full topology std::ifstream input("/proc/cpuinfo"); std::string line; char* c; uint32_t procId = uint32_t(-1); uint32_t coreId = uint32_t(-1); uint32_t physId = uint32_t(-1); while (std::getline(input, line)) { if (line.find("processor") != std::string::npos) { auto data_start = line.find(": ") + 2; procId = std::strtoul(&line.c_str()[data_start], &c, 10); continue; } if (line.find("core id") != std::string::npos) { auto data_start = line.find(": ") + 2; coreId = std::strtoul(&line.c_str()[data_start], &c, 10); continue; } if (line.find("physical id") != std::string::npos) { auto data_start = line.find(": ") + 2; physId = std::strtoul(&line.c_str()[data_start], &c, 10); continue; } if (line.length() == 0) { if (physId + 1 > out_nodes.size()) out_nodes.resize(physId + 1); auto& numaNode = out_nodes[physId]; numaNode.numaId = physId; if (coreId + 1 > numaNode.cores.size()) numaNode.cores.resize(coreId + 1); auto& core = numaNode.cores[coreId]; core.procGroup = coreId; 
core.threadIds.push_back(procId); } } out_numThreadsPerProcGroup = 0; for (auto &node : out_nodes) { for (auto &core : node.cores) { out_numThreadsPerProcGroup += core.threadIds.size(); } } #elif defined(__APPLE__) auto numProcessors = 0; auto numCores = 0; auto numPhysicalIds = 0; int value; size_t size = sizeof(value); int result = sysctlbyname("hw.packages", &value, &size, NULL, 0); SWR_ASSERT(result == 0); numPhysicalIds = value; result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0); SWR_ASSERT(result == 0); numProcessors = value; result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0); SWR_ASSERT(result == 0); numCores = value; out_nodes.resize(numPhysicalIds); for (auto physId = 0; physId < numPhysicalIds; ++physId) { auto &numaNode = out_nodes[physId]; auto procId = 0; numaNode.cores.resize(numCores); while (procId < numProcessors) { for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId) { auto &core = numaNode.cores[coreId]; core.procGroup = coreId; core.threadIds.push_back(procId); } } } out_numThreadsPerProcGroup = 0; for (auto &node : out_nodes) { for (auto &core : node.cores) { out_numThreadsPerProcGroup += core.threadIds.size(); } } #else #error Unsupported platform #endif // Prune empty cores and numa nodes for (auto node_it = out_nodes.begin(); node_it != out_nodes.end(); ) { // Erase empty cores (first) for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end(); ) { if (core_it->threadIds.size() == 0) { core_it = node_it->cores.erase(core_it); } else { ++core_it; } } // Erase empty numa nodes (second) if (node_it->cores.size() == 0) { node_it = out_nodes.erase(node_it); } else { ++node_it; } } }
////////////////////////////////////////////////////////////////////////// /// @brief If there is any BE work then go work on it. /// @param pContext - pointer to SWR context. /// @param workerId - The unique worker ID that is assigned to this thread. /// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread /// has its own curDrawBE counter and this ensures that each worker processes all the /// draws in order. /// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its /// own set and each time it fails to lock a macrotile, because it's already locked, /// then it will add that tile to the lockedTiles set. As a worker begins to work /// on future draws, the lockedTiles set ensures that it doesn't work on tiles that may /// still have work pending in a previous draw. Additionally, lockedTiles is a /// heuristic that can steer a worker back to the same macrotile that it had been /// working on in a previous draw. /// @returns true if the worker thread should shut down bool WorkOnFifoBE( SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet& lockedTiles, uint32_t numaNode, uint32_t numaMask) { bool bShutdown = false; // Find the first incomplete draw that has pending work. If no such draw is found then // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. uint32_t drawEnqueued = 0; if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false) { return false; } uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1; // Reset our history for locked tiles. We'll have to re-learn which tiles are locked. lockedTiles.clear(); // Try to work on each draw in order of the available draws in flight. // 1. If we're on curDrawBE, we can work on any macrotile that is available. // 2. If we're trying to work on draws after curDrawBE, we are restricted to // working on those macrotiles that are known to be complete in the prior draw to // maintain order. The locked tiles provide the history to ensure this. for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i) { DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT]; if (pDC->isCompute) return false; // We don't look at compute work. // First wait for FE to be finished with this draw. This keeps the threading model simple // but if there are lots of bubbles between draws then serializing FE and BE may // need to be revisited. if (!pDC->doneFE) return false; // If this draw is dependent on a previous draw then we need to bail. if (CheckDependency(pContext, pDC, lastRetiredDraw)) { return false; } // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
auto& macroTiles = pDC->pTileMgr->getDirtyTiles(); for (auto tile : macroTiles) { uint32_t tileID = tile->mId; // Only work on tiles for this NUMA node uint32_t x, y; pDC->pTileMgr->getTileIndices(tileID, x, y); if (((x ^ y) & numaMask) != numaNode) { continue; } if (!tile->getNumQueued()) { continue; } // can only work on this draw if it's not in use by other threads if (lockedTiles.find(tileID) != lockedTiles.end()) { continue; } if (tile->tryLock()) { BE_WORK *pWork; RDTSC_BEGIN(WorkerFoundWork, pDC->drawId); uint32_t numWorkItems = tile->getNumQueued(); SWR_ASSERT(numWorkItems); pWork = tile->peek(); SWR_ASSERT(pWork); if (pWork->type == DRAW) { pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID); } else if (pWork->type == SHUTDOWN) { bShutdown = true; } while ((pWork = tile->peek()) != nullptr) { pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); tile->dequeue(); } RDTSC_END(WorkerFoundWork, numWorkItems); _ReadWriteBarrier(); pDC->pTileMgr->markTileComplete(tileID); // Optimization: If the draw is complete and we're the last one to have worked on it then // we can reset the locked list since we know that all draws before the next one are guaranteed to be complete. if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete())) { // We can increment the current BE and safely move to the next draw since we know this draw is complete. curDrawBE++; CompleteDrawContextInl(pContext, workerId, pDC); lastRetiredDraw++; lockedTiles.clear(); break; } if (bShutdown) { break; } } else { // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again. lockedTiles.insert(tileID); } } } return bShutdown; }
void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup) { out_nodes.clear(); out_numThreadsPerProcGroup = 0; #if defined(_WIN32) static std::mutex m; std::lock_guard<std::mutex> l(m); static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS]; DWORD bufSize = sizeof(buffer); BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize); SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information"); uint32_t count = bufSize / buffer->Size; PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer; for (uint32_t i = 0; i < count; ++i) { SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore); for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g) { auto& gmask = pBuffer->Processor.GroupMask[g]; uint32_t threadId = 0; uint32_t procGroup = gmask.Group; Core* pCore = nullptr; uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask); while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask)) { // clear mask gmask.Mask &= ~(KAFFINITY(1) << threadId); // Find Numa Node PROCESSOR_NUMBER procNum = {}; procNum.Group = WORD(procGroup); procNum.Number = UCHAR(threadId); uint32_t numaId = 0; ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId); SWR_ASSERT(ret); // Store data if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); auto& numaNode = out_nodes[numaId]; uint32_t coreId = 0; if (nullptr == pCore) { numaNode.cores.push_back(Core()); pCore = &numaNode.cores.back(); pCore->procGroup = procGroup; #if !defined(_WIN64) coreId = (uint32_t)numaNode.cores.size(); if ((coreId * numThreads) >= 32) { // Windows doesn't return threadIds >= 32 for a processor group correctly // when running a 32-bit application. // Just save -1 as the threadId threadId = uint32_t(-1); } #endif } pCore->threadIds.push_back(threadId); if (procGroup == 0) { out_numThreadsPerProcGroup++; } } } pBuffer = PtrAdd(pBuffer, pBuffer->Size); } #elif defined(__linux__) || defined (__gnu_linux__) // Parse /proc/cpuinfo to get full topology std::ifstream input("/proc/cpuinfo"); std::string line; char* c; uint32_t threadId = uint32_t(-1); uint32_t coreId = uint32_t(-1); uint32_t numaId = uint32_t(-1); while (std::getline(input, line)) { if (line.find("processor") != std::string::npos) { if (threadId != uint32_t(-1)) { // Save information. if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); auto& numaNode = out_nodes[numaId]; if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1); auto& core = numaNode.cores[coreId]; core.procGroup = coreId; core.threadIds.push_back(threadId); out_numThreadsPerProcGroup++; } auto data_start = line.find(": ") + 2; threadId = std::strtoul(&line.c_str()[data_start], &c, 10); continue; } if (line.find("core id") != std::string::npos) { auto data_start = line.find(": ") + 2; coreId = std::strtoul(&line.c_str()[data_start], &c, 10); continue; } if (line.find("physical id") != std::string::npos) { auto data_start = line.find(": ") + 2; numaId = std::strtoul(&line.c_str()[data_start], &c, 10); continue; } } if (threadId != uint32_t(-1)) { // Save information. 
if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); auto& numaNode = out_nodes[numaId]; if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1); auto& core = numaNode.cores[coreId]; core.procGroup = coreId; core.threadIds.push_back(threadId); out_numThreadsPerProcGroup++; } for (uint32_t node = 0; node < out_nodes.size(); node++) { auto& numaNode = out_nodes[node]; auto it = numaNode.cores.begin(); for ( ; it != numaNode.cores.end(); ) { if (it->threadIds.size() == 0) numaNode.cores.erase(it); else ++it; } } #else #error Unsupported platform #endif }
col4 *sample_t2d(tex2d tex, u32 x, u32 y) { SWR_ASSERT(x < tex.width); SWR_ASSERT(y < tex.height); return tex.texels + y * tex.width + x; }
HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, HANDLE hWorkerPrivateData, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples, uint32_t renderTargetArrayIndex) { uint32_t x, y; MacroTileMgr::getTileIndices(macroID, x, y); SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); HotTileSet& tile = mHotTiles[x][y]; HOTTILE& hotTile = tile.Attachment[attachment]; if (hotTile.pBuffer == NULL) { if (create) { uint32_t size = numSamples * mHotTileSize[attachment]; uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE); hotTile.state = HOTTILE_INVALID; hotTile.numSamples = numSamples; hotTile.renderTargetArrayIndex = renderTargetArrayIndex; } else { return NULL; } } else { // free the old tile and create a new one with enough space to hold all samples if (numSamples > hotTile.numSamples) { // tile should be either uninitialized or resolved if we're deleting and switching to a // new sample count SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || (hotTile.state == HOTTILE_RESOLVED) || (hotTile.state == HOTTILE_CLEAR)); FreeHotTileMem(hotTile.pBuffer); uint32_t size = numSamples * mHotTileSize[attachment]; uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE); hotTile.state = HOTTILE_INVALID; hotTile.numSamples = numSamples; } // if requested render target array index isn't currently loaded, need to store out the // current hottile and load the requested array slice if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) { SWR_FORMAT format; switch (attachment) { case SWR_ATTACHMENT_COLOR0: case SWR_ATTACHMENT_COLOR1: case SWR_ATTACHMENT_COLOR2: case SWR_ATTACHMENT_COLOR3: case SWR_ATTACHMENT_COLOR4: case SWR_ATTACHMENT_COLOR5: case SWR_ATTACHMENT_COLOR6: case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break; case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break; case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break; default: SWR_INVALID("Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break; } if (hotTile.state == HOTTILE_CLEAR) { if (attachment == SWR_ATTACHMENT_STENCIL) ClearStencilHotTile(&hotTile); else if (attachment == SWR_ATTACHMENT_DEPTH) ClearDepthHotTile(&hotTile); else ClearColorHotTile(&hotTile); hotTile.state = HOTTILE_DIRTY; } if (hotTile.state == HOTTILE_DIRTY) { pContext->pfnStoreTile(GetPrivateState(pDC), hWorkerPrivateData, format, attachment, x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer); } pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, format, attachment, x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer); hotTile.renderTargetArrayIndex = renderTargetArrayIndex; hotTile.state = HOTTILE_DIRTY; } } return &tile.Attachment[attachment]; }
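// NOTE (editor sketch): GetHotTile() above and the backend work loop both hash a macrotile to a
// NUMA node with (x ^ y) & numaMask, so a tile's backing memory and the workers that touch it
// stay on the same node. With two nodes (numaMask == 1) the tiles form a checkerboard:
static inline uint32_t TileNumaNode(uint32_t x, uint32_t y, uint32_t numaMask)
{
    return (x ^ y) & numaMask;
}
// TileNumaNode(0, 0, 1) == 0, TileNumaNode(1, 0, 1) == 1, TileNumaNode(1, 1, 1) == 0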
void pop_back() { SWR_ASSERT(this->mSize > 0); --this->mSize; }
////////////////////////////////////////////////////////////////////////// /// @brief DrawIndexedInstanced /// @param hContext - Handle passed back from SwrCreateContext /// @param topology - Specifies topology for draw. /// @param numIndices - Number of indices to read sequentially from index buffer. /// @param indexOffset - Starting index into index buffer. /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. /// @param numInstances - Number of instances to render. /// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) void DrawIndexedInstance( HANDLE hContext, PRIMITIVE_TOPOLOGY topology, uint32_t numIndices, uint32_t indexOffset, int32_t baseVertex, uint32_t numInstances = 1, uint32_t startInstance = 0) { RDTSC_START(APIDrawIndexed); SWR_CONTEXT *pContext = GetContext(hContext); DRAW_CONTEXT* pDC = GetDrawContext(pContext); API_STATE* pState = &pDC->pState->state; int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); int32_t remainingIndices = numIndices; uint32_t indexSize = 0; switch (pState->indexBuffer.format) { case R32_UINT: indexSize = sizeof(uint32_t); break; case R16_UINT: indexSize = sizeof(uint16_t); break; case R8_UINT: indexSize = sizeof(uint8_t); break; default: SWR_ASSERT(0); } int draw = 0; uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices; pIB += (uint64_t)indexOffset * (uint64_t)indexSize; pState->topology = topology; pState->forceFront = false; // disable culling for points/lines uint32_t oldCullMode = pState->rastState.cullMode; if (topology == TOP_POINT_LIST) { pState->rastState.cullMode = SWR_CULLMODE_NONE; pState->forceFront = true; } while (remainingIndices) { uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ? remainingIndices : maxIndicesPerDraw; // When breaking up draw, we need to obtain new draw context for each iteration. bool isSplitDraw = (draw > 0) ? true : false; pDC = GetDrawContext(pContext, isSplitDraw); InitDraw(pDC, isSplitDraw); pDC->FeWork.type = DRAW; pDC->FeWork.pfnWork = GetFEDrawFunc( true, // IsIndexed pState->tsState.tsEnable, pState->gsState.gsEnable, pState->soState.soEnable, pDC->pState->pfnProcessPrims != nullptr); pDC->FeWork.desc.draw.pDC = pDC; pDC->FeWork.desc.draw.numIndices = numIndicesForDraw; pDC->FeWork.desc.draw.pIB = (int*)pIB; pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format; pDC->FeWork.desc.draw.numInstances = numInstances; pDC->FeWork.desc.draw.startInstance = startInstance; pDC->FeWork.desc.draw.baseVertex = baseVertex; pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; //enqueue DC QueueDraw(pContext); pIB += maxIndicesPerDraw * indexSize; remainingIndices -= numIndicesForDraw; draw++; } // restore culling state pDC = GetDrawContext(pContext); pDC->pState->state.rastState.cullMode = oldCullMode; RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0); }
void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData) { SWR_CONTEXT* pContext = pDC->pContext; HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; if (KNOB_FAST_CLEAR) { CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData; SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount; uint32_t numSamples = GetNumSamples(sampleCount); SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason. RDTSC_BEGIN(BEClear, pDC->drawId); if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR) { unsigned long rt = 0; uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; while (_BitScanForward(&rt, mask)) { mask &= ~(1 << rt); HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex); // All we want to do here is to mark the hot tile as being in a "needs clear" state. pHotTile->clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]); pHotTile->clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]); pHotTile->clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]); pHotTile->clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]); pHotTile->state = HOTTILE_CLEAR; } } if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT) { HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex); pHotTile->clearData[0] = *(uint32_t*)&pClear->clearDepth; pHotTile->state = HOTTILE_CLEAR; } if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT) { HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex); pHotTile->clearData[0] = pClear->clearStencil; pHotTile->state = HOTTILE_CLEAR; } RDTSC_END(BEClear, 1); } else { // Legacy clear CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData; RDTSC_BEGIN(BEClear, pDC->drawId); if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR) { uint32_t clearData[4]; clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]); clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]); clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]); clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]); PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT]; SWR_ASSERT(pfnClearTiles != nullptr); unsigned long rt = 0; uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR; while (_BitScanForward(&rt, mask)) { mask &= ~(1 << rt); pfnClearTiles(pDC, hWorkerPrivateData, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); } } if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT) { uint32_t clearData[4]; clearData[0] = *(uint32_t*)&pClear->clearDepth; PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT]; SWR_ASSERT(pfnClearTiles != nullptr); pfnClearTiles(pDC, hWorkerPrivateData, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); } if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT) { uint32_t clearData[4]; clearData[0] = pClear->clearStencil; PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT]; pfnClearTiles(pDC, hWorkerPrivateData, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect); } RDTSC_END(BEClear, 1); } }