Example #1
0
HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT*                pContext,
                                      DRAW_CONTEXT*               pDC,
                                      uint32_t                    macroID,
                                      SWR_RENDERTARGET_ATTACHMENT attachment,
                                      bool                        create,
                                      uint32_t                    numSamples)
{
    uint32_t x, y;
    MacroTileMgr::getTileIndices(macroID, x, y);

    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);

    HotTileSet& tile    = mHotTiles[x][y];
    HOTTILE&    hotTile = tile.Attachment[attachment];
    if (hotTile.pBuffer == NULL)
    {
        if (create)
        {
            uint32_t size                  = numSamples * mHotTileSize[attachment];
            hotTile.pBuffer                = (uint8_t*)AlignedMalloc(size, 64);
            hotTile.state                  = HOTTILE_INVALID;
            hotTile.numSamples             = numSamples;
            hotTile.renderTargetArrayIndex = 0;
        }
        else
        {
            return NULL;
        }
    }

    return &hotTile;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Writes clear color to every pixel of a render surface
/// @param hPrivateContext - Handle to private DC
/// @param renderTargetIndex - Index to destination render target
/// @param x, y - Coordinates to raster tile.
/// @param pClearColor - Pointer to clear color
void StoreHotTileClear(
    SWR_SURFACE_STATE *pDstSurface,
    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
    UINT x,
    UINT y,
    const float* pClearColor)
{
    PFN_STORE_TILES_CLEAR pfnStoreTilesClear = NULL;

    SWR_ASSERT(renderTargetIndex != SWR_ATTACHMENT_STENCIL);  ///@todo Not supported yet.

    if (renderTargetIndex != SWR_ATTACHMENT_DEPTH)
    {
        pfnStoreTilesClear = sStoreTilesClearColorTable[pDstSurface->format];
    }
    else
    {
        pfnStoreTilesClear = sStoreTilesClearDepthTable[pDstSurface->format];
    }

    SWR_ASSERT(pfnStoreTilesClear != NULL);

    // Store a macro tile.
    /// @todo Once all formats are supported then if check can go away. This is to help us near term to make progress.
    if (pfnStoreTilesClear != NULL)
    {
        pfnStoreTilesClear(pClearColor, pDstSurface, x, y);
    }
}
Example #3
0
void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK* pWork)
{
    // Should not enqueue more then what we have backing for in the hot tile manager.
    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);

    if ((x & ~(KNOB_NUM_HOT_TILES_X - 1)) | (y & ~(KNOB_NUM_HOT_TILES_Y - 1)))
    {
        return;
    }

    uint32_t id = getTileId(x, y);

    if (id >= mTiles.size())
    {
        mTiles.resize((16 + id) * 2);
    }

    MacroTileQueue* pTile = mTiles[id];
    if (!pTile)
    {
        pTile = mTiles[id] = new MacroTileQueue();
    }
    pTile->mWorkItemsFE++;
    pTile->mId = id;

    if (pTile->mWorkItemsFE == 1)
    {
        pTile->clear(mArena);
        mDirtyTiles.push_back(pTile);
    }

    mWorkItemsProduced++;
    pTile->enqueue_try_nosync(mArena, pWork);
}
Example #4
0
API_STATE* GetDrawState(SWR_CONTEXT *pContext)
{
    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
    SWR_ASSERT(pDC->pState != nullptr);

    return &pDC->pState->state;
}
Example #5
0
// inlined-only version
INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, DRAW_CONTEXT* pDC)
{
    int32_t result = static_cast<int32_t>(InterlockedDecrement(&pDC->threadsDone));
    SWR_ASSERT(result >= 0);

    AR_FLUSH(pDC->drawId);

    if (result == 0)
    {
        ExecuteCallbacks(pContext, workerId, pDC);

        // Cleanup memory allocations
        pDC->pArena->Reset(true);
        if (!pDC->isCompute)
        {
            pDC->pTileMgr->initialize();
        }
        if (pDC->cleanupState)
        {
            pDC->pState->pArena->Reset(true);
        }

        _ReadWriteBarrier();

        pContext->dcRing.Dequeue();  // Remove from tail
    }

    return result;
}
Example #6
0
INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
{
    int64_t result = InterlockedDecrement64(&pDC->threadsDone);
    SWR_ASSERT(result >= 0);

    if (result == 0)
    {
        // Cleanup memory allocations
        pDC->pArena->Reset(true);
        if (!pDC->isCompute)
        {
            pDC->pTileMgr->initialize();
        }
        if (pDC->cleanupState)
        {
            pDC->pState->pArena->Reset(true);
        }

        _ReadWriteBarrier();

        pContext->dcRing.Dequeue();  // Remove from tail
    }

    return result;
}
Example #7
0
void MacroTileMgr::markTileComplete(uint32_t id)
{
    SWR_ASSERT(mTiles.size() > id);
    MacroTileQueue& tile     = *mTiles[id];
    uint32_t        numTiles = tile.mWorkItemsFE;
    InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);

    _ReadWriteBarrier();
    tile.mWorkItemsBE += numTiles;
    SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE);

    // clear out tile, but defer fifo clear until the next DC first queues to it.
    // this prevents worker threads from constantly locking a completed macro tile
    tile.mWorkItemsFE = 0;
    tile.mWorkItemsBE = 0;
}
Example #8
0
void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc, 
    SWR_RENDERTARGET_ATTACHMENT attachment)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(BEStoreTiles, pDC->drawId);

    SWR_FORMAT srcFormat;
    switch (attachment)
    {
    case SWR_ATTACHMENT_COLOR0:
    case SWR_ATTACHMENT_COLOR1:
    case SWR_ATTACHMENT_COLOR2:
    case SWR_ATTACHMENT_COLOR3:
    case SWR_ATTACHMENT_COLOR4:
    case SWR_ATTACHMENT_COLOR5:
    case SWR_ATTACHMENT_COLOR6:
    case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
    case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
    case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
    default: SWR_INVALID("Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
    }

    uint32_t x, y;
    MacroTileMgr::getTileIndices(macroTile, x, y);

    // Only need to store the hottile if it's been rendered to...
    HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
    if (pHotTile)
    {
        // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
        if (pHotTile->state == HOTTILE_CLEAR)
        {
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
        }

        if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
        {
            int32_t destX = KNOB_MACROTILE_X_DIM * x;
            int32_t destY = KNOB_MACROTILE_Y_DIM * y;

            pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat,
                attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
        }
        

        if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
        {
            if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY && pHotTile->state == HOTTILE_RESOLVED))
            {
                pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
            }
        }
    }
    AR_END(BEStoreTiles, 1);
}
Example #9
0
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any compute work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
void WorkOnCompute(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    volatile uint64_t& curDrawBE)
{
    if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
    {
        return;
    }

    uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;

    DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
    if (pDC->isCompute == false) return;

    // check dependencies
    if (CheckDependency(pContext, pDC, lastRetiredDraw))
    {
        return;
    }

    SWR_ASSERT(pDC->pDispatch != nullptr);
    DispatchQueue& queue = *pDC->pDispatch;

    // Is there any work remaining?
    if (queue.getNumQueued() > 0)
    {
        bool lastToComplete = false;

        uint32_t threadGroupId = 0;
        while (queue.getWork(threadGroupId))
        {
            ProcessComputeBE(pDC, workerId, threadGroupId);

            lastToComplete = queue.finishedWork();
        }

        _ReadWriteBarrier();

        if (lastToComplete)
        {
            SWR_ASSERT(queue.isWorkComplete() == true);
            pDC->doneCompute = true;
        }
    }
}
Example #10
0
void SwrSetBlendFunc(
    HANDLE hContext,
    uint32_t renderTarget,
    PFN_BLEND_JIT_FUNC pfnBlendFunc)
{
    SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
    API_STATE *pState = GetDrawState(GetContext(hContext));
    pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
}
Example #11
0
void SwrSetSoBuffers(
    HANDLE hContext,
    SWR_STREAMOUT_BUFFER* pSoBuffer,
    uint32_t slot)
{
    API_STATE* pState = GetDrawState(GetContext(hContext));

    SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot);

    pState->soBuffer[slot] = *pSoBuffer;
}
Example #12
0
void SwrSetScissorRects(
    HANDLE hContext,
    uint32_t numScissors,
    const BBOX* pScissors)
{
    SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
        "Invalid number of scissor rects.");

    API_STATE* pState = GetDrawState(GetContext(hContext));
    memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
};
Example #13
0
void SwrSetSoFunc(
    HANDLE hContext,
    PFN_SO_FUNC    pfnSoFunc,
    uint32_t streamIndex)
{
    API_STATE* pState = GetDrawState(GetContext(hContext));

    SWR_ASSERT(streamIndex < MAX_SO_STREAMS);

    pState->pfnSoFunc[streamIndex] = pfnSoFunc;
}
Example #14
0
void SwrSetViewports(
    HANDLE hContext,
    uint32_t numViewports,
    const SWR_VIEWPORT* pViewports,
    const SWR_VIEWPORT_MATRIX* pMatrices)
{
    SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
        "Invalid number of viewports.");

    SWR_CONTEXT *pContext = GetContext(hContext);
    API_STATE* pState = GetDrawState(pContext);

    memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);

    if (pMatrices != nullptr)
    {
        memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
    }
    else
    {
        // Compute default viewport transform.
        for (uint32_t i = 0; i < numViewports; ++i)
        {
            if (pContext->driverType == DX)
            {
                pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f;
                pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f;
                pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ;
                pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
                pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11;
                pState->vpMatrix[i].m32 = pState->vp[i].minZ;
            }
            else
            {
                // Standard, with the exception that Y is inverted.
                pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f;
                pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f;
                pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f;
                pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00;
                pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11;
                pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22;

                // Now that the matrix is calculated, clip the view coords to screen size.
                // OpenGL allows for -ve x,y in the viewport.
                pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
                pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
            }
        }
    }

    updateGuardband(pState);
}
Example #15
0
inline int inside(const float v[4])
{
    switch (ClippingPlane)
    {
    case FRUSTUM_LEFT   : return (v[0]>=-v[3]);
    case FRUSTUM_RIGHT  : return (v[0]<= v[3]);
    case FRUSTUM_TOP    : return (v[1]>=-v[3]);
    case FRUSTUM_BOTTOM : return (v[1]<= v[3]);
    case FRUSTUM_NEAR   : return (v[2]>=0.0f);
    case FRUSTUM_FAR    : return (v[2]<= v[3]);
    default:
        SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
        return 0;
    }
}
Example #16
0
inline void intersect(
    int s,                       // index to first edge vertex v0 in pInPts.
    int p,                       // index to second edge vertex v1 in pInPts.
    const float *pInPts,         // array of all the input positions.
    const float *pInAttribs,     // array of all attributes for all vertex. All the attributes for each vertex is contiguous.
    int numInAttribs,            // number of attributes per vertex.
    int i,                       // output index.
    float *pOutPts,              // array of output positions. We'll write our new intersection point at i*4.
    float *pOutAttribs)          // array of output attributes. We'll write our new attributes at i*numInAttribs.
{
    float t;

    // Find the parameter of the intersection.
    //        t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc.
    const float *v1 = &pInPts[s*4];
    const float *v2 = &pInPts[p*4];

    switch (ClippingPlane)
    {
    case FRUSTUM_LEFT:      t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]); break;
    case FRUSTUM_RIGHT:     t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]); break;
    case FRUSTUM_TOP:       t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]); break;
    case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]); break;
    case FRUSTUM_NEAR:      t = ComputeInterpFactor(v1[2], v2[2]); break;
    case FRUSTUM_FAR:       t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]); break;
    default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane);
    };


    const float *a1 = &pInAttribs[s*numInAttribs];
    const float *a2 = &pInAttribs[p*numInAttribs];

    float *pOutP    = &pOutPts[i*4];
    float *pOutA    = &pOutAttribs[i*numInAttribs];

    // Interpolate new position.
    for(int j = 0; j < 4; ++j)
    {
        pOutP[j] = v1[j] + (v2[j]-v1[j])*t;
    }

    // Interpolate Attributes
    for(int attr = 0; attr < numInAttribs; ++attr)
    {
        pOutA[attr] = a1[attr] + (a2[attr]-a1[attr])*t;
    }
}
Example #17
0
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any compute work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
void WorkOnCompute(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint32_t& curDrawBE)
{
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return;
    }

    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;

    for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
        if (pDC->isCompute == false) return;

        // check dependencies
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return;
        }

        SWR_ASSERT(pDC->pDispatch != nullptr);
        DispatchQueue& queue = *pDC->pDispatch;

        // Is there any work remaining?
        if (queue.getNumQueued() > 0)
        {
            void* pSpillFillBuffer = nullptr;
            void* pScratchSpace = nullptr;
            uint32_t threadGroupId = 0;
            while (queue.getWork(threadGroupId))
            {
                queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
                queue.finishedWork();
            }

            // Ensure all streaming writes are globally visible before moving onto the next draw
            _mm_mfence();
        }
    }
}
Example #18
0
void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs)
{
    // temp storage to hold at least 6 sets of vertices, the max number that can be created during clipping
    OSALIGNSIMD(float) tempPts[6 * 4];
    OSALIGNSIMD(float) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4];

    // we opt to clip to viewport frustum to produce smaller triangles for rasterization precision
    int NumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs);
    NumOutPts = ClipTriToPlane<FRUSTUM_FAR>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs);
    NumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs);
    NumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs);
    NumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs);
    NumOutPts = ClipTriToPlane<FRUSTUM_TOP>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs);

    SWR_ASSERT(NumOutPts <= 6);

    *numVerts = NumOutPts;
    return;
}
Example #19
0
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any compute work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
void WorkOnCompute(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint64_t& curDrawBE)
{
    uint64_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
    {
        return;
    }

    uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;

    for (uint64_t i = curDrawBE; curDrawBE < drawEnqueued; ++i)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
        if (pDC->isCompute == false) return;

        // check dependencies
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return;
        }

        SWR_ASSERT(pDC->pDispatch != nullptr);
        DispatchQueue& queue = *pDC->pDispatch;

        // Is there any work remaining?
        if (queue.getNumQueued() > 0)
        {
            void* pSpillFillBuffer = nullptr;
            uint32_t threadGroupId = 0;
            while (queue.getWork(threadGroupId))
            {
                ProcessComputeBE(pDC, workerId, threadGroupId, pSpillFillBuffer);

                queue.finishedWork();
            }
        }
    }
}
Example #20
0
//////////////////////////////////////////////////////////////////////////
/// @brief Process compute work.
/// @param pDC - pointer to draw context (dispatch).
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param threadGroupId - the linear index for the thread group within the dispatch.
void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(BEDispatch, pDC->drawId);

    const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData();
    SWR_ASSERT(pTaskData != nullptr);

    // Ensure spill fill memory has been allocated.
    size_t spillFillSize = pDC->pState->state.totalSpillFillSize;
    if (spillFillSize && pSpillFillBuffer == nullptr)
    {
        pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
    }
    
    size_t scratchSpaceSize = pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances;
    if (scratchSpaceSize && pScratchSpace == nullptr)
    {
        pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD_BYTES);
    }

    const API_STATE& state = GetApiState(pDC);

    SWR_CS_CONTEXT csContext{ 0 };
    csContext.tileCounter = threadGroupId;
    csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
    csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
    csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
    csContext.pTGSM = pContext->ppScratch[workerId];
    csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
    csContext.pScratchSpace = (uint8_t*)pScratchSpace;
    csContext.scratchSpacePerSimd = pDC->pState->state.scratchSpaceSize;

    state.pfnCsFunc(GetPrivateState(pDC), &csContext);

    UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);

    AR_END(BEDispatch, 1);
}
Example #21
0
    bool enqueue_try_nosync(ArenaT& arena, const T* entry)
    {
        const float* pSrc = (const float*)entry;
        float* pDst = (float*)&mCurBlock[mTail];

        auto lambda = [&](int32_t i)
        {
            __m256 vSrc = _mm256_load_ps(pSrc + i*KNOB_SIMD_WIDTH);
            _mm256_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc);
        };
            
        const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4);
        static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
            "FIFO element size should be multiple of SIMD width.");

        UnrollerL<0, numSimdLines, 1>::step(lambda);

        mTail ++;
        if (mTail == mBlockSize)
        {
            if (++mCurBlockIdx < mBlocks.size())
            {
                mCurBlock = mBlocks[mCurBlockIdx];
            }
            else
            {
                T* newBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
                SWR_ASSERT(newBlock);

                mBlocks.push_back(newBlock);
                mCurBlock = newBlock;
            }

            mTail = 0;
        }

        mNumEntries ++;
        return true;
    }
Example #22
0
DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
{
    RDTSC_START(APIGetDrawContext);
    // If current draw context is null then need to obtain a new draw context to use from ring.
    if (pContext->pCurDrawContext == nullptr)
    {
        uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT;

        DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
        pContext->pCurDrawContext = pCurDrawContext;

        // Update LastRetiredId
        UpdateLastRetiredId(pContext);

        // Need to wait until this draw context is available to use.
        while (StillDrawing(pContext, pCurDrawContext))
        {
            // Make sure workers are working.
            WakeAllThreads(pContext);

            _mm_pause();
        }

        // Assign next available entry in DS ring to this DC.
        uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
        pCurDrawContext->pState = &pContext->dsRing[dsIndex];

        Arena& stateArena = pCurDrawContext->pState->arena;

        // Copy previous state to current state.
        if (pContext->pPrevDrawContext)
        {
            DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext;

            // If we're splitting our draw then we can just use the same state from the previous
            // draw. In this case, we won't increment the DS ring index so the next non-split
            // draw can receive the state.
            if (isSplitDraw == false)
            {
                CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);

                stateArena.Reset();    // Reset memory.

                // Copy private state to new context.
                if (pPrevDrawContext->pState->pPrivateState != nullptr)
                {
                    pCurDrawContext->pState->pPrivateState = stateArena.AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
                    memcpy(pCurDrawContext->pState->pPrivateState, pPrevDrawContext->pState->pPrivateState, pContext->privateStateSize);
                }

                pContext->curStateId++;  // Progress state ring index forward.
            }
            else
            {
                // If its a split draw then just copy the state pointer over
                // since its the same draw.
                pCurDrawContext->pState = pPrevDrawContext->pState;
            }
        }
        else
        {
            stateArena.Reset();    // Reset memory.
            pContext->curStateId++;  // Progress state ring index forward.
        }

        pCurDrawContext->dependency = 0;
        pCurDrawContext->arena.Reset();
        pCurDrawContext->pContext = pContext;
        pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
        pCurDrawContext->inUse = false;

        pCurDrawContext->doneCompute = false;
        pCurDrawContext->doneFE = false;
        pCurDrawContext->FeLock = 0;

        pCurDrawContext->pTileMgr->initialize();

        // Assign unique drawId for this DC
        pCurDrawContext->drawId = pContext->nextDrawId++;
    }
    else
    {
        SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC");
    }

    RDTSC_STOP(APIGetDrawContext, 0, 0);
    return pContext->pCurDrawContext;
}
Example #23
0
void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
{
    out_nodes.clear();
    out_numThreadsPerProcGroup = 0;

#if defined(_WIN32)

    std::vector<KAFFINITY> threadMaskPerProcGroup;

    static std::mutex m;
    std::lock_guard<std::mutex> l(m);

    DWORD bufSize = 0;

    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
    SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
    SWR_ASSERT(pBufferMem);

    ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");

    uint32_t count = bufSize / pBufferMem->Size;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;

    for (uint32_t i = 0; i < count; ++i)
    {
        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
        {
            auto& gmask = pBuffer->Processor.GroupMask[g];
            uint32_t threadId = 0;
            uint32_t procGroup = gmask.Group;

            Core* pCore = nullptr;

            uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);

            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
            {
                // clear mask
                KAFFINITY threadMask = KAFFINITY(1) << threadId;
                gmask.Mask &= ~threadMask;

                if (procGroup >= threadMaskPerProcGroup.size())
                {
                    threadMaskPerProcGroup.resize(procGroup + 1);
                }

                if (threadMaskPerProcGroup[procGroup] & threadMask)
                {
                    // Already seen this mask.  This means that we are in 32-bit mode and
                    // have seen more than 32 HW threads for this procGroup
                    // Don't use it
#if defined(_WIN64)
                    SWR_INVALID("Shouldn't get here in 64-bit mode");
#endif
                    continue;
                }

                threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);

                // Find Numa Node
                uint32_t numaId = 0;
                PROCESSOR_NUMBER procNum = {};
                procNum.Group = WORD(procGroup);
                procNum.Number = UCHAR(threadId);

                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                SWR_ASSERT(ret);

                // Store data
                if (out_nodes.size() <= numaId)
                {
                    out_nodes.resize(numaId + 1);
                }
                auto& numaNode = out_nodes[numaId];
                numaNode.numaId = numaId;

                uint32_t coreId = 0;

                if (nullptr == pCore)
                {
                    numaNode.cores.push_back(Core());
                    pCore = &numaNode.cores.back();
                    pCore->procGroup = procGroup;
                }
                pCore->threadIds.push_back(threadId);
                if (procGroup == 0)
                {
                    out_numThreadsPerProcGroup++;
                }
            }
        }
        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
    }

    free(pBufferMem);


#elif defined(__linux__) || defined (__gnu_linux__)

    // Parse /proc/cpuinfo to get full topology
    std::ifstream input("/proc/cpuinfo");
    std::string line;
    char* c;
    uint32_t procId = uint32_t(-1);
    uint32_t coreId = uint32_t(-1);
    uint32_t physId = uint32_t(-1);

    while (std::getline(input, line))
    {
        if (line.find("processor") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            procId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("core id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("physical id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            physId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.length() == 0)
        {
            if (physId + 1 > out_nodes.size())
                out_nodes.resize(physId + 1);
            auto& numaNode = out_nodes[physId];
            numaNode.numaId = physId;

            if (coreId + 1 > numaNode.cores.size())
                numaNode.cores.resize(coreId + 1);
            auto& core = numaNode.cores[coreId];
            core.procGroup = coreId;
            core.threadIds.push_back(procId);
        }
    }

    out_numThreadsPerProcGroup = 0;
    for (auto &node : out_nodes)
    {
        for (auto &core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#elif defined(__APPLE__)

    auto numProcessors = 0;
    auto numCores = 0;
    auto numPhysicalIds = 0;

    int value;
    size_t size = sizeof(value);

    int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numPhysicalIds = value;

    result = sysctlbyname("hw.logicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numProcessors = value;

    result = sysctlbyname("hw.physicalcpu", &value, &size, NULL, 0);
    SWR_ASSERT(result == 0);
    numCores = value;

    out_nodes.resize(numPhysicalIds);

    for (auto physId = 0; physId < numPhysicalIds; ++physId)
    {
        auto &numaNode = out_nodes[physId];
        auto procId = 0;

        numaNode.cores.resize(numCores);

        while (procId < numProcessors)
        {
            for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId)
            {
                auto &core = numaNode.cores[coreId];

                core.procGroup = coreId;
                core.threadIds.push_back(procId);
            }
        }
    }

    out_numThreadsPerProcGroup = 0;

    for (auto &node : out_nodes)
    {
        for (auto &core : node.cores)
        {
            out_numThreadsPerProcGroup += core.threadIds.size();
        }
    }

#else

#error Unsupported platform

#endif

    // Prune empty cores and numa nodes
    for (auto node_it = out_nodes.begin(); node_it != out_nodes.end(); )
    {
        // Erase empty cores (first)
        for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end(); )
        {
            if (core_it->threadIds.size() == 0)
            {
                core_it = node_it->cores.erase(core_it);
            }
            else
            {
                ++core_it;
            }
        }

        // Erase empty numa nodes (second)
        if (node_it->cores.size() == 0)
        {
            node_it = out_nodes.erase(node_it);
        }
        else
        {
            ++node_it;
        }
    }
}
Example #24
0
//////////////////////////////////////////////////////////////////////////
/// @brief If there is any BE work then go work on it.
/// @param pContext - pointer to SWR context.
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
///                    has its own curDrawBE counter and this ensures that each worker processes all the
///                    draws in order.
/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
///                      own set and each time it fails to lock a macrotile, because its already locked,
///                      then it will add that tile to the lockedTiles set. As a worker begins to work
///                      on future draws the lockedTiles ensure that it doesn't work on tiles that may
///                      still have work pending in a previous draw. Additionally, the lockedTiles is
///                      hueristic that can steer a worker back to the same macrotile that it had been
///                      working on in a previous draw.
/// @returns        true if worker thread should shutdown
bool WorkOnFifoBE(
    SWR_CONTEXT *pContext,
    uint32_t workerId,
    uint32_t &curDrawBE,
    TileSet& lockedTiles,
    uint32_t numaNode,
    uint32_t numaMask)
{
    bool bShutdown = false;

    // Find the first incomplete draw that has pending work. If no such draw is found then
    // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
    uint32_t drawEnqueued = 0;
    if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
    {
        return false;
    }

    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;

    // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
    lockedTiles.clear();

    // Try to work on each draw in order of the available draws in flight.
    //   1. If we're on curDrawBE, we can work on any macrotile that is available.
    //   2. If we're trying to work on draws after curDrawBE, we are restricted to 
    //      working on those macrotiles that are known to be complete in the prior draw to
    //      maintain order. The locked tiles provides the history to ensures this.
    for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
    {
        DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];

        if (pDC->isCompute) return false; // We don't look at compute work.

        // First wait for FE to be finished with this draw. This keeps threading model simple
        // but if there are lots of bubbles between draws then serializing FE and BE may
        // need to be revisited.
        if (!pDC->doneFE) return false;
        
        // If this draw is dependent on a previous draw then we need to bail.
        if (CheckDependency(pContext, pDC, lastRetiredDraw))
        {
            return false;
        }

        // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
        auto &macroTiles = pDC->pTileMgr->getDirtyTiles();

        for (auto tile : macroTiles)
        {
            uint32_t tileID = tile->mId;

            // Only work on tiles for this numa node
            uint32_t x, y;
            pDC->pTileMgr->getTileIndices(tileID, x, y);
            if (((x ^ y) & numaMask) != numaNode)
            {
                continue;
            }

            if (!tile->getNumQueued())
            {
                continue;
            }

            // can only work on this draw if it's not in use by other threads
            if (lockedTiles.find(tileID) != lockedTiles.end())
            {
                continue;
            }

            if (tile->tryLock())
            {
                BE_WORK *pWork;

                RDTSC_BEGIN(WorkerFoundWork, pDC->drawId);

                uint32_t numWorkItems = tile->getNumQueued();
                SWR_ASSERT(numWorkItems);

                pWork = tile->peek();
                SWR_ASSERT(pWork);
                if (pWork->type == DRAW)
                {
                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, workerId, tileID);
                }
                else if (pWork->type == SHUTDOWN)
                {
                    bShutdown = true;
                }

                while ((pWork = tile->peek()) != nullptr)
                {
                    pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
                    tile->dequeue();
                }
                RDTSC_END(WorkerFoundWork, numWorkItems);

                _ReadWriteBarrier();

                pDC->pTileMgr->markTileComplete(tileID);

                // Optimization: If the draw is complete and we're the last one to have worked on it then
                // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
                if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
                {
                    // We can increment the current BE and safely move to next draw since we know this draw is complete.
                    curDrawBE++;
                    CompleteDrawContextInl(pContext, workerId, pDC);

                    lastRetiredDraw++;

                    lockedTiles.clear();
                    break;
                }

                if (bShutdown)
                {
                    break;
                }
            }
            else
            {
                // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
                lockedTiles.insert(tileID);
            }
        }
    }

    return bShutdown;
}
Example #25
0
void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup)
{
    out_nodes.clear();
    out_numThreadsPerProcGroup = 0;

#if defined(_WIN32)

    static std::mutex m;
    std::lock_guard<std::mutex> l(m);

    static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
    DWORD bufSize = sizeof(buffer);

    BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize);
    SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");

    uint32_t count = bufSize / buffer->Size;
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer;

    for (uint32_t i = 0; i < count; ++i)
    {
        SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
        for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
        {
            auto& gmask = pBuffer->Processor.GroupMask[g];
            uint32_t threadId = 0;
            uint32_t procGroup = gmask.Group;

            Core* pCore = nullptr;

            uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask);

            while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
            {
                // clear mask
                gmask.Mask &= ~(KAFFINITY(1) << threadId);

                // Find Numa Node
                PROCESSOR_NUMBER procNum = {};
                procNum.Group = WORD(procGroup);
                procNum.Number = UCHAR(threadId);

                uint32_t numaId = 0;
                ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                SWR_ASSERT(ret);

                // Store data
                if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
                auto& numaNode = out_nodes[numaId];

                uint32_t coreId = 0;

                if (nullptr == pCore)
                {
                    numaNode.cores.push_back(Core());
                    pCore = &numaNode.cores.back();
                    pCore->procGroup = procGroup;
#if !defined(_WIN64)
                    coreId = (uint32_t)numaNode.cores.size();
                    if ((coreId * numThreads) >= 32)
                    {
                        // Windows doesn't return threadIds >= 32 for a processor group correctly
                        // when running a 32-bit application.
                        // Just save -1 as the threadId
                        threadId = uint32_t(-1);
                    }
#endif
                }
                pCore->threadIds.push_back(threadId);
                if (procGroup == 0)
                {
                    out_numThreadsPerProcGroup++;
                }
            }
        }
        pBuffer = PtrAdd(pBuffer, pBuffer->Size);
    }


#elif defined(__linux__) || defined (__gnu_linux__)

    // Parse /proc/cpuinfo to get full topology
    std::ifstream input("/proc/cpuinfo");
    std::string line;
    char* c;
    uint32_t threadId = uint32_t(-1);
    uint32_t coreId = uint32_t(-1);
    uint32_t numaId = uint32_t(-1);

    while (std::getline(input, line))
    {
        if (line.find("processor") != std::string::npos)
        {
            if (threadId != uint32_t(-1))
            {
                // Save information.
                if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
                auto& numaNode = out_nodes[numaId];
                if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
                auto& core = numaNode.cores[coreId];

                core.procGroup = coreId;
                core.threadIds.push_back(threadId);

                out_numThreadsPerProcGroup++;
            }

            auto data_start = line.find(": ") + 2;
            threadId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("core id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
        if (line.find("physical id") != std::string::npos)
        {
            auto data_start = line.find(": ") + 2;
            numaId = std::strtoul(&line.c_str()[data_start], &c, 10);
            continue;
        }
    }

    if (threadId != uint32_t(-1))
    {
        // Save information.
        if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1);
        auto& numaNode = out_nodes[numaId];
        if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1);
        auto& core = numaNode.cores[coreId];

        core.procGroup = coreId;
        core.threadIds.push_back(threadId);
        out_numThreadsPerProcGroup++;
    }

    for (uint32_t node = 0; node < out_nodes.size(); node++) {
        auto& numaNode = out_nodes[node];
        auto it = numaNode.cores.begin();
        for ( ; it != numaNode.cores.end(); ) {
            if (it->threadIds.size() == 0)
                numaNode.cores.erase(it);
            else
                ++it;
        }
    }

#else

#error Unsupported platform

#endif
}
Example #26
0
col4 *sample_t2d(tex2d tex, u32 x, u32 y) {
	SWR_ASSERT(x < tex.width);
	SWR_ASSERT(y < tex.height);
	return tex.texels + y * tex.width + x;
}
Example #27
0
HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT*                pContext,
                                DRAW_CONTEXT*               pDC,
                                HANDLE                      hWorkerPrivateData,
                                uint32_t                    macroID,
                                SWR_RENDERTARGET_ATTACHMENT attachment,
                                bool                        create,
                                uint32_t                    numSamples,
                                uint32_t                    renderTargetArrayIndex)
{
    uint32_t x, y;
    MacroTileMgr::getTileIndices(macroID, x, y);

    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);

    HotTileSet& tile    = mHotTiles[x][y];
    HOTTILE&    hotTile = tile.Attachment[attachment];
    if (hotTile.pBuffer == NULL)
    {
        if (create)
        {
            uint32_t size     = numSamples * mHotTileSize[attachment];
            uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
            hotTile.pBuffer =
                (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
            hotTile.state                  = HOTTILE_INVALID;
            hotTile.numSamples             = numSamples;
            hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
        }
        else
        {
            return NULL;
        }
    }
    else
    {
        // free the old tile and create a new one with enough space to hold all samples
        if (numSamples > hotTile.numSamples)
        {
            // tile should be either uninitialized or resolved if we're deleting and switching to a
            // new sample count
            SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || (hotTile.state == HOTTILE_RESOLVED) ||
                       (hotTile.state == HOTTILE_CLEAR));
            FreeHotTileMem(hotTile.pBuffer);

            uint32_t size     = numSamples * mHotTileSize[attachment];
            uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
            hotTile.pBuffer =
                (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
            hotTile.state      = HOTTILE_INVALID;
            hotTile.numSamples = numSamples;
        }

        // if requested render target array index isn't currently loaded, need to store out the
        // current hottile and load the requested array slice
        if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
        {
            SWR_FORMAT format;
            switch (attachment)
            {
            case SWR_ATTACHMENT_COLOR0:
            case SWR_ATTACHMENT_COLOR1:
            case SWR_ATTACHMENT_COLOR2:
            case SWR_ATTACHMENT_COLOR3:
            case SWR_ATTACHMENT_COLOR4:
            case SWR_ATTACHMENT_COLOR5:
            case SWR_ATTACHMENT_COLOR6:
            case SWR_ATTACHMENT_COLOR7:
                format = KNOB_COLOR_HOT_TILE_FORMAT;
                break;
            case SWR_ATTACHMENT_DEPTH:
                format = KNOB_DEPTH_HOT_TILE_FORMAT;
                break;
            case SWR_ATTACHMENT_STENCIL:
                format = KNOB_STENCIL_HOT_TILE_FORMAT;
                break;
            default:
                SWR_INVALID("Unknown attachment: %d", attachment);
                format = KNOB_COLOR_HOT_TILE_FORMAT;
                break;
            }

            if (hotTile.state == HOTTILE_CLEAR)
            {
                if (attachment == SWR_ATTACHMENT_STENCIL)
                    ClearStencilHotTile(&hotTile);
                else if (attachment == SWR_ATTACHMENT_DEPTH)
                    ClearDepthHotTile(&hotTile);
                else
                    ClearColorHotTile(&hotTile);

                hotTile.state = HOTTILE_DIRTY;
            }

            if (hotTile.state == HOTTILE_DIRTY)
            {
                pContext->pfnStoreTile(GetPrivateState(pDC),
                                       hWorkerPrivateData,
                                       format,
                                       attachment,
                                       x * KNOB_MACROTILE_X_DIM,
                                       y * KNOB_MACROTILE_Y_DIM,
                                       hotTile.renderTargetArrayIndex,
                                       hotTile.pBuffer);
            }

            pContext->pfnLoadTile(GetPrivateState(pDC),
                                  hWorkerPrivateData,
                                  format,
                                  attachment,
                                  x * KNOB_MACROTILE_X_DIM,
                                  y * KNOB_MACROTILE_Y_DIM,
                                  renderTargetArrayIndex,
                                  hotTile.pBuffer);

            hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
            hotTile.state                  = HOTTILE_DIRTY;
        }
    }
    return &tile.Attachment[attachment];
}
Example #28
0
 void pop_back()
 {
     SWR_ASSERT(this->mSize > 0);
     --this->mSize;
 }
Example #29
0
//////////////////////////////////////////////////////////////////////////
/// @brief DrawIndexedInstanced
/// @param hContext - Handle passed back from SwrCreateContext
/// @param topology - Specifies topology for draw.
/// @param numIndices - Number of indices to read sequentially from index buffer.
/// @param indexOffset - Starting index into index buffer.
/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
/// @param numInstances - Number of instances to render.
/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
void DrawIndexedInstance(
    HANDLE hContext,
    PRIMITIVE_TOPOLOGY topology,
    uint32_t numIndices,
    uint32_t indexOffset,
    int32_t baseVertex,
    uint32_t numInstances = 1,
    uint32_t startInstance = 0)
{
    RDTSC_START(APIDrawIndexed);

    SWR_CONTEXT *pContext = GetContext(hContext);
    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
    API_STATE* pState = &pDC->pState->state;

    int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
    uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
    int32_t remainingIndices = numIndices;

    uint32_t indexSize = 0;
    switch (pState->indexBuffer.format)
    {
    case R32_UINT: indexSize = sizeof(uint32_t); break;
    case R16_UINT: indexSize = sizeof(uint16_t); break;
    case R8_UINT: indexSize = sizeof(uint8_t); break;
    default:
        SWR_ASSERT(0);
    }

    int draw = 0;
    uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices;
    pIB += (uint64_t)indexOffset * (uint64_t)indexSize;

    pState->topology = topology;
    pState->forceFront = false;

    // disable culling for points/lines
    uint32_t oldCullMode = pState->rastState.cullMode;
    if (topology == TOP_POINT_LIST)
    {
        pState->rastState.cullMode = SWR_CULLMODE_NONE;
        pState->forceFront = true;
    }

    while (remainingIndices)
    {
        uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
        remainingIndices : maxIndicesPerDraw;

        // When breaking up draw, we need to obtain new draw context for each iteration.
        bool isSplitDraw = (draw > 0) ? true : false;
        pDC = GetDrawContext(pContext, isSplitDraw);
        InitDraw(pDC, isSplitDraw);

        pDC->FeWork.type = DRAW;
        pDC->FeWork.pfnWork = GetFEDrawFunc(
            true,   // IsIndexed
            pState->tsState.tsEnable,
            pState->gsState.gsEnable,
            pState->soState.soEnable,
            pDC->pState->pfnProcessPrims != nullptr);
        pDC->FeWork.desc.draw.pDC = pDC;
        pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
        pDC->FeWork.desc.draw.pIB = (int*)pIB;
        pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;

        pDC->FeWork.desc.draw.numInstances = numInstances;
        pDC->FeWork.desc.draw.startInstance = startInstance;
        pDC->FeWork.desc.draw.baseVertex = baseVertex;
        pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;

        //enqueue DC
        QueueDraw(pContext);

        pIB += maxIndicesPerDraw * indexSize;
        remainingIndices -= numIndicesForDraw;
        draw++;
    }

    // restore culling state
    pDC = GetDrawContext(pContext);
    pDC->pState->state.rastState.cullMode = oldCullMode;

    RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0);
}
Example #30
0
void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
{
    SWR_CONTEXT* pContext           = pDC->pContext;
    HANDLE       hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    if (KNOB_FAST_CLEAR)
    {
        CLEAR_DESC*           pClear      = (CLEAR_DESC*)pUserData;
        SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
        uint32_t              numSamples  = GetNumSamples(sampleCount);

        SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.

        RDTSC_BEGIN(BEClear, pDC->drawId);

        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
        {
            unsigned long rt   = 0;
            uint32_t      mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
            while (_BitScanForward(&rt, mask))
            {
                mask &= ~(1 << rt);

                HOTTILE* pHotTile =
                    pContext->pHotTileMgr->GetHotTile(pContext,
                                                      pDC,
                                                      hWorkerPrivateData,
                                                      macroTile,
                                                      (SWR_RENDERTARGET_ATTACHMENT)rt,
                                                      true,
                                                      numSamples,
                                                      pClear->renderTargetArrayIndex);

                // All we want to do here is to mark the hot tile as being in a "needs clear" state.
                pHotTile->clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
                pHotTile->clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
                pHotTile->clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
                pHotTile->clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);
                pHotTile->state        = HOTTILE_CLEAR;
            }
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
        {
            HOTTILE* pHotTile      = pContext->pHotTileMgr->GetHotTile(pContext,
                                                                  pDC,
                                                                  hWorkerPrivateData,
                                                                  macroTile,
                                                                  SWR_ATTACHMENT_DEPTH,
                                                                  true,
                                                                  numSamples,
                                                                  pClear->renderTargetArrayIndex);
            pHotTile->clearData[0] = *(uint32_t*)&pClear->clearDepth;
            pHotTile->state        = HOTTILE_CLEAR;
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
        {
            HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext,
                                                                  pDC,
                                                                  hWorkerPrivateData,
                                                                  macroTile,
                                                                  SWR_ATTACHMENT_STENCIL,
                                                                  true,
                                                                  numSamples,
                                                                  pClear->renderTargetArrayIndex);

            pHotTile->clearData[0] = pClear->clearStencil;
            pHotTile->state        = HOTTILE_CLEAR;
        }

        RDTSC_END(BEClear, 1);
    }
    else
    {
        // Legacy clear
        CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData;
        RDTSC_BEGIN(BEClear, pDC->drawId);

        if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
        {
            uint32_t clearData[4];
            clearData[0] = *(uint32_t*)&(pClear->clearRTColor[0]);
            clearData[1] = *(uint32_t*)&(pClear->clearRTColor[1]);
            clearData[2] = *(uint32_t*)&(pClear->clearRTColor[2]);
            clearData[3] = *(uint32_t*)&(pClear->clearRTColor[3]);

            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
            SWR_ASSERT(pfnClearTiles != nullptr);

            unsigned long rt   = 0;
            uint32_t      mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
            while (_BitScanForward(&rt, mask))
            {
                mask &= ~(1 << rt);

                pfnClearTiles(pDC,
                              hWorkerPrivateData,
                              (SWR_RENDERTARGET_ATTACHMENT)rt,
                              macroTile,
                              pClear->renderTargetArrayIndex,
                              clearData,
                              pClear->rect);
            }
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
        {
            uint32_t clearData[4];
            clearData[0]                  = *(uint32_t*)&pClear->clearDepth;
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
            SWR_ASSERT(pfnClearTiles != nullptr);

            pfnClearTiles(pDC,
                          hWorkerPrivateData,
                          SWR_ATTACHMENT_DEPTH,
                          macroTile,
                          pClear->renderTargetArrayIndex,
                          clearData,
                          pClear->rect);
        }

        if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
        {
            uint32_t clearData[4];
            clearData[0]                  = pClear->clearStencil;
            PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];

            pfnClearTiles(pDC,
                          hWorkerPrivateData,
                          SWR_ATTACHMENT_STENCIL,
                          macroTile,
                          pClear->renderTargetArrayIndex,
                          clearData,
                          pClear->rect);
        }

        RDTSC_END(BEClear, 1);
    }
}