Example #1
//////////////////////////////////////////////////////////////////////////
/// @brief Create SWR Context.
/// @param pCreateInfo - pointer to creation info.
/// @return Opaque handle to the newly created context.
HANDLE SwrCreateContext(
    const SWR_CREATECONTEXT_INFO* pCreateInfo)
{
    RDTSC_RESET();
    RDTSC_INIT(0);

    void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
    memset(pContextMem, 0, sizeof(SWR_CONTEXT));
    SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();

    pContext->driverType = pCreateInfo->driver;
    pContext->privateStateSize = pCreateInfo->privateStateSize;

    pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
    memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT);

    pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
    memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT);

    for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
    {
        pContext->dcRing[dc].arena.Init();
        pContext->dcRing[dc].inUse = false;
        pContext->dcRing[dc].pTileMgr = new MacroTileMgr();
        pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if a Dispatch is seen.

        pContext->dsRing[dc].arena.Init();
    }

    if (!KNOB_SINGLE_THREADED)
    {
        memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
        memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
        new (&pContext->WaitLock) std::mutex();
        new (&pContext->FifosNotEmpty) std::condition_variable();

        CreateThreadPool(pContext, &pContext->threadPool);
    }

    // Calling CreateThreadPool() above can set KNOB_SINGLE_THREADED
    if (KNOB_SINGLE_THREADED)
    {
        pContext->NumWorkerThreads = 1;
    }

    // Allocate scratch space for workers.
    ///@note We could lazily allocate this, but it's a rather small amount of memory.
    for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
    {
        ///@todo Use the NUMA API for allocations, using NUMA information from the thread data (if it exists).
        pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
    }

    pContext->LastRetiredId = 0;
    pContext->nextDrawId = 1;

    // workers start at draw 1
    for (uint32_t i = 0; i < KNOB_MAX_NUM_THREADS; ++i)
    {
        pContext->WorkerFE[i] = 1;
        pContext->WorkerBE[i] = 1;
    }

    pContext->DrawEnqueued = 1;

    // State setup AFTER context is fully initialized
    SetupDefaultState(pContext);

    // initialize hot tile manager
    pContext->pHotTileMgr = new HotTileMgr();

    // initialize function pointer tables
    InitClearTilesTable();

    // initialize tile load/store/clear function pointers
    pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
    pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
    pContext->pfnClearTile = pCreateInfo->pfnClearTile;

    return (HANDLE)pContext;
}
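
A minimal usage sketch for SwrCreateContext above. Everything beyond the
fields the function actually reads (driver, privateStateSize, and the three
tile callbacks) is an assumption for illustration; the SWR_DRIVER_DX11
enumerant and the My*HotTile callback names are hypothetical.

    SWR_CREATECONTEXT_INFO info = {};
    info.driver           = SWR_DRIVER_DX11;  // hypothetical driver enum value
    info.privateStateSize = 1024;             // bytes of per-draw driver state
    info.pfnLoadTile      = MyLoadHotTile;    // driver-supplied tile callbacks
    info.pfnStoreTile     = MyStoreHotTile;
    info.pfnClearTile     = MyClearHotTile;

    HANDLE hContext = SwrCreateContext(&info);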
Example #2
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    {
        char threadName[64];
        sprintf_s(threadName,
#if defined(_WIN32)
                  "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
#else
                  // linux pthread name limited to 16 chars (including \0)
                  "w%03d-n%d-c%03d-t%d",
#endif
            workerId, pThreadData->numaId, pThreadData->coreId, pThreadData->htId);
        SetCurrentThreadName(threadName);
    }

    RDTSC_INIT(threadId);

    // Only the NUMA index offset from the base node is needed for correct masking
    uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
    // locked, then we'll add it to this list so that we don't try to lock it again.
    TileSet lockedTiles;

    // Each worker can work on any of the queued draws as long as certain
    // conditions are met. The data associated with a draw is guaranteed to
    // stay live until a worker signals that it has moved on to the next draw,
    // which it does once it determines there is no more work to do. The API
    // thread will not increment the head of the DC ring until all workers
    // have moved past the current head.
    //
    // The logic to determine what to work on is:
    // 1- Try to work on the FE of any draw that is queued. For now there are
    //    no dependencies on the FE work, so any worker can grab any FE and
    //    process it in parallel. Eventually we'll need dependency tracking to
    //    force serialization on FEs. The worker tries to pick an FE by
    //    atomically incrementing a counter in the SWR context, and keeps
    //    trying until it reaches the tail.
    // 2- BE work must be done in strict order. We accomplish this today by
    //    pulling work off the oldest draw (i.e. the head) of the dcRing. A
    //    worker can determine whether any work is left by comparing the total
    //    number of binned work items against the total number of completed
    //    work items; if they are equal, there is no more work to do for this
    //    draw, and the worker can safely increment its oldestDraw counter and
    //    move on to the next draw. (A commented sketch of this test follows.)
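    // For illustration only (the counter names below are hypothetical, not
    // part of the original code), the BE completion test described in (2)
    // amounts to something like:
    //
    //   DRAW_CONTEXT &dc = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
    //   if (dc.totalBinnedWork == dc.totalCompletedWork) // assumed counters
    //   {
    //       ++curDrawBE; // draw fully drained; advance to the next draw
    //   }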
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    uint32_t curDrawBE = 0;
    uint32_t curDrawFE = 0;

    bool bShutdown = false;

    while (true)
    {
        if (bShutdown && !threadHasWork(curDrawBE))
        {
            break;
        }

        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();
        }

        if (IsBEThread)
        {
            RDTSC_BEGIN(WorkerWorkOnFifoBE, 0);
            bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            RDTSC_END(WorkerWorkOnFifoBE, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            if (!IsBEThread)
            {
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}
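
For context, a sketch of how CreateThreadPool (called from Example #1) might
launch this entry point with std::thread. The THREAD_POOL members used here
(pThreadData, pThreads) are assumptions; only the THREAD_DATA fields read at
the top of workerThreadMain come from the source, and the real code maps
threads onto cores and NUMA nodes before binding.

    void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
    {
        for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
        {
            THREAD_DATA* pData = &pPool->pThreadData[i]; // assumed storage
            pData->pContext = pContext;
            pData->threadId = i; // simplified 1:1 mapping for the sketch
            pData->workerId = i;
            pPool->pThreads[i] = new std::thread(workerThreadMain, (LPVOID)pData);
        }
    }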
Example #3
DWORD workerThread(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    bindThread(threadId, pThreadData->procGroupId); 

    RDTSC_INIT(threadId);

    int numaNode = (int)pThreadData->numaId;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
    // locked, then we'll add it to this list so that we don't try to lock it again.
    std::unordered_set<uint32_t> lockedTiles;

    // Each worker can work on any of the queued draws as long as certain
    // conditions are met. The data associated with a draw is guaranteed to
    // stay live until a worker signals that it has moved on to the next draw,
    // which it does once it determines there is no more work to do. The API
    // thread will not increment the head of the DC ring until all workers
    // have moved past the current head.
    //
    // The logic to determine what to work on is:
    // 1- Try to work on the FE of any draw that is queued. For now there are
    //    no dependencies on the FE work, so any worker can grab any FE and
    //    process it in parallel. Eventually we'll need dependency tracking to
    //    force serialization on FEs. The worker tries to pick an FE by
    //    atomically incrementing a counter in the SWR context, and keeps
    //    trying until it reaches the tail.
    // 2- BE work must be done in strict order. We accomplish this today by
    //    pulling work off the oldest draw (i.e. the head) of the dcRing. A
    //    worker can determine whether any work is left by comparing the total
    //    number of binned work items against the total number of completed
    //    work items; if they are equal, there is no more work to do for this
    //    draw, and the worker can safely increment its oldestDraw counter and
    //    move on to the next draw.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
    while (!pContext->threadPool.inThreadShutdown)
    {
        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && pContext->WorkerBE[workerId] == pContext->DrawEnqueued)
        {
            _mm_pause();
        }

        if (pContext->WorkerBE[workerId] == pContext->DrawEnqueued)
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (pContext->WorkerBE[workerId] != pContext->DrawEnqueued)
            {
                lock.unlock();
                continue;
            }

            if (pContext->threadPool.inThreadShutdown)
            {
                lock.unlock();
                break;
            }

            RDTSC_START(WorkerWaitForThreadEvent);

            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();

            RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);

            if (pContext->threadPool.inThreadShutdown)
            {
                break;
            }
        }

        RDTSC_START(WorkerWorkOnFifoBE);
        WorkOnFifoBE(pContext, workerId, pContext->WorkerBE[workerId], lockedTiles);
        RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);

        WorkOnCompute(pContext, workerId, pContext->WorkerBE[workerId]);

        WorkOnFifoFE(pContext, workerId, pContext->WorkerFE[workerId], numaNode);
    }

    return 0;
}
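
Finally, a hedged sketch of the teardown implied by the inThreadShutdown
checks in the loop above: set the flag under the same mutex the workers wait
on (so no worker can miss it between its flag test and the wait), wake
everyone, then join. The DestroyThreadPool name and the pThreads array are
assumptions.

    void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
    {
        {
            std::lock_guard<std::mutex> l(pContext->WaitLock);
            pPool->inThreadShutdown = true;   // observed by the worker loop
        }
        pContext->FifosNotEmpty.notify_all(); // release workers blocked in wait()
        for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
        {
            pPool->pThreads[i]->join();       // assumed std::thread storage
            delete pPool->pThreads[i];
        }
    }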