Beispiel #1
0
/* =============================================================================
 * threadWait
 * -- Synchronizes all threads to start/stop parallel section
 * -- argPtr points to this thread's thread_args_t
 * -- Before entering the barrier loop it points the profiling globals at the
 *    counters inside *argPtr, allocates and initializes the memoized-choice
 *    table, seeds the fallback random stream and binds the thread
 * -- Calls abort() if the memoized-choice table cannot be allocated
 * =============================================================================
 */
static void
threadWait (void* argPtr)
{
    /* Number of entries allocated for the memoized-choice table. */
    enum { NUM_MEMOIZED_BLOCKS = 100 };

    thread_args_t* args = (thread_args_t*) argPtr;
    long threadId = args->threadId;
    int i;

    /* Point the profiling globals at this thread's counters. */
    commits = &(args->commits);
    aborts = &(args->aborts);
    retriesProf = &(args->retries);
    ucbProf = &(args->ucb);

    memoized_blocks = (memoized_choices_t*) malloc(NUM_MEMOIZED_BLOCKS * sizeof(memoized_choices_t));
    if (memoized_blocks == NULL) {
        /* The initialization loop below would dereference a null pointer. */
        abort();
    }

    for (i = 0; i < NUM_MEMOIZED_BLOCKS; i++) {
        memoized_choices_t* block = &(memoized_blocks[i]);

        block->runs = 0;
        block->havingCapacityAborts = 0;
        block->commitsHTM = 1;
        block->believedCapacity = 1;
        block->believedTransient = 1;
        block->believedGiveUp = 1;
        block->abortsCapacity = 0;
        block->abortsTransient = 0;
        block->cyclesCapacity = 100;
        block->cyclesTransient = 100;
        block->cyclesGiveUp = 100;
        /* Initial retry budget; lastRetries/bestEverRetries start at the same value. */
        block->retries = 5;
        block->lastCycles = 0;
        block->lastRetries = 5;
        block->bestEverCycles = 0;
        block->bestEverRetries = 5;
    }

    randomFallback = random_alloc();
    random_seed(randomFallback, time(NULL));

    THREAD_LOCAL_SET(global_threadId, (long)threadId);

    bindThread(threadId);

    while (1) {
        THREAD_BARRIER(global_barrierPtr, threadId); /* wait for start parallel */
        if (global_doShutdown) {
            break;
        }
        global_funcPtr(global_argPtr);
        THREAD_BARRIER(global_barrierPtr, threadId); /* wait for end parallel */
        if (threadId == 0) {
            /* Thread 0 leaves after a single parallel section; the others loop. */
            break;
        }
    }
}
Beispiel #2
0
// Binds the calling API thread according to the context's reserved-thread table.
//
// pContext    - SWR context; a null pointer makes the call a no-op.
// apiThreadId - index into threadPool.pApiThreadData. When it is not below
//               numReservedThreads, the thread is bound with thread id 0 and the
//               process group of API thread 0 (or not bound at all when no
//               threads are reserved).
void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId)
{
    if (pContext == nullptr)
    {
        return;
    }

    const auto &pool = pContext->threadPool;

    if (apiThreadId < pool.numReservedThreads)
    {
        // This API thread has its own reserved entry: use it verbatim.
        const THREAD_DATA &reserved = pool.pApiThreadData[apiThreadId];
        bindThread(pContext, reserved.threadId, reserved.procGroupId, reserved.forceBindProcGroup);
        return;
    }

    if (pool.numReservedThreads)
    {
        // No dedicated entry: fall back to the process group used for API thread 0.
        const THREAD_DATA &firstApiThread = pool.pApiThreadData[0];
        bindThread(pContext, 0, firstApiThread.procGroupId, true);
    }
}
Beispiel #3
0
RSSL_THREAD_DECLARE(runChannelConnectionHandler, pArg)
{

	ProviderThread *pProvThread = (ProviderThread*)pArg;

	TimeValue nextTickTime;
	RsslInt32 currentTicks = 0;

	if (pProvThread->cpuId >= 0)
	{
		if (bindThread(pProvThread->cpuId) != RSSL_RET_SUCCESS)
		{
			printf("Error: Failed to bind thread to core %d.\n", pProvThread->cpuId);
			exit(-1);
		}
	}

	nextTickTime = getTimeNano() + nsecPerTick;

	/* this is the main loop */
	while(rtrLikely(!signal_shutdown))
	{
		for (currentTicks = 0; currentTicks < providerThreadConfig.ticksPerSec; ++currentTicks)
		{
			providerThreadRead(pProvThread, nextTickTime);

			nextTickTime += nsecPerTick;

			providerThreadSendMsgBurst(pProvThread, nextTickTime);

			providerThreadReceiveNewChannels(pProvThread);

		}

		providerThreadCheckPings(pProvThread);

	}

	return RSSL_THREAD_RETURN();
}
Beispiel #4
0
static void
threadWait (void* argPtr)
{
    long threadId = *(long*)argPtr;

    THREAD_LOCAL_SET(global_threadId, (long)threadId);

    bindThread(threadId);

    thread_id = threadId;

    while (1) {
        THREAD_BARRIER(global_barrierPtr, threadId); /* wait for start parallel */
        if (global_doShutdown) {
            break;
        }
        global_funcPtr(global_argPtr);
        THREAD_BARRIER(global_barrierPtr, threadId); /* wait for end parallel */
        if (threadId == 0) {
            break;
        }
    }
}
Beispiel #5
0
// Worker thread entry point.
//
// pData is cast to THREAD_DATA*, which supplies the owning SWR_CONTEXT, the
// thread/worker ids and the binding information used below.
// The thread binds itself, names itself, then loops: spin briefly for work,
// sleep on pContext->FifosNotEmpty when there is none, and process BE/compute
// and/or FE work depending on IsBEThread / IsFEThread.
// Returns 0 after WorkOnFifoBE has reported shutdown and threadHasWork(curDrawBE)
// is false.
// NOTE(review): IsBEThread and IsFEThread are not declared in this block --
// presumably template parameters or constants defined elsewhere; confirm.
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);

    // Give the thread a descriptive name built from worker id, NUMA node, core and HT id.
    {
        char threadName[64];
        sprintf_s(threadName,
#if defined(_WIN32)
                  "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
#else
                  // linux pthread name limited to 16 chars (including \0)
                  "w%03d-n%d-c%03d-t%d",
#endif
            workerId, pThreadData->numaId, pThreadData->coreId, pThreadData->htId);
        SetCurrentThreadName(threadName);
    }

    RDTSC_INIT(threadId);

    // Only need offset numa index from base for correct masking
    uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find its already
    // locked then we'll add it to this list so that we don't try and lock it again.
    TileSet lockedTiles;

    // each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. the data associated
    // with a draw is guaranteed to be active as long as a worker hasn't signaled that he 
    // has moved on to the next draw when he determines there is no more work to do. The api
    // thread will not increment the head of the dc ring until all workers have moved past the
    // current head.
    // the logic to determine what to work on is:
    // 1- try to work on the FE any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process in parallel.  Eventually
    //    we'll need dependency tracking to force serialization on FEs.  The worker will try
    //    to pick an FE by atomically incrementing a counter in the swr context.  he'll keep
    //    trying until he reaches the tail.
    // 2- BE work must be done in strict order. we accomplish this today by pulling work off
    //    the oldest draw (ie the head) of the dcRing. the worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of completed
    //    work items. If they are equal, then there is no more work to do for this draw, and
    //    the worker can safely increment its oldestDraw counter and move on to the next draw.

    // Created unlocked (std::defer_lock); WaitLock is only taken around the idle wait below.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    // A draw index counts as "work" whenever it differs from the value GetHead() returns.
    auto threadHasWork = [&](uint32_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };

    // Per-thread draw cursors, passed to the BE and FE work functions respectively.
    uint32_t curDrawBE = 0;
    uint32_t curDrawFE = 0;

    // Latched from WorkOnFifoBE's return value; once set, the loop exits when no work remains.
    bool bShutdown = false;

    while (true)
    {
        if (bShutdown && !threadHasWork(curDrawBE))
        {
            break;
        }

        // Spin for up to KNOB_WORKER_SPIN_LOOP_COUNT iterations before falling back to a blocking wait.
        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE))
        {
            _mm_pause();
        }

        if (!threadHasWork(curDrawBE))
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (threadHasWork(curDrawBE))
            {
                lock.unlock();
                continue;
            }

            // Block until notified. The wait has no predicate; after it returns the
            // code falls through to the work functions without re-checking for work.
            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();
        }

        if (IsBEThread)
        {
            RDTSC_BEGIN(WorkerWorkOnFifoBE, 0);
            bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
            RDTSC_END(WorkerWorkOnFifoBE, 0);

            WorkOnCompute(pContext, workerId, curDrawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, curDrawFE);

            // An FE-only thread mirrors its FE cursor into curDrawBE, which is the
            // cursor the idle/exit checks above look at.
            if (!IsBEThread)
            {
                curDrawBE = curDrawFE;
            }
        }
    }

    return 0;
}
Beispiel #6
0
/*
 * Thread entry point that drives a provider thread through an RsslReactor.
 *
 * pArg is the ProviderThread to run. The thread optionally binds itself to
 * pProvThread->cpuId, creates the reactor, and then, until signal_shutdown is
 * set, alternates between:
 *   - waiting in select() (or Sleep() on Windows with an empty read set) for
 *     at most the time left in the current tick, dispatching reactor events
 *     whenever descriptors are ready, and
 *   - sending one message burst per tick.
 *
 * Failure to bind, a dispatch error, or a select() error other than an
 * interrupted call terminates the process with exit(-1); failure to create
 * the reactor calls cleanUpAndExit().
 */
RSSL_THREAD_DECLARE(runReactorConnectionHandler, pArg)
{
	ProviderThread *pProvThread = (ProviderThread*)pArg;

	TimeValue nextTickTime;
	TimeValue currentTime;
	TimeValue remainingUsec;
	RsslRet ret;
	int selRet;
	struct timeval time_interval;
	fd_set useRead;
	fd_set useExcept;
	fd_set useWrt;
	RsslErrorInfo rsslErrorInfo;
	RsslReactorDispatchOptions dispatchOptions;
	RsslCreateReactorOptions reactorOpts;

	rsslClearReactorDispatchOptions(&dispatchOptions);

	if (pProvThread->cpuId >= 0)
	{
		if (bindThread(pProvThread->cpuId) != RSSL_RET_SUCCESS)
		{
			printf("Error: Failed to bind thread to core %d.\n", pProvThread->cpuId);
			exit(-1);
		}
	}

	// create reactor
	rsslClearCreateReactorOptions(&reactorOpts);

	if (!(pProvThread->pReactor = rsslCreateReactor(&reactorOpts, &rsslErrorInfo)))
	{
		printf("Reactor creation failed: %s\n", rsslErrorInfo.rsslError.text);
		cleanUpAndExit();
	}

	FD_ZERO(&pProvThread->readfds);
	FD_ZERO(&pProvThread->wrtfds);
	FD_ZERO(&pProvThread->exceptfds);

	/* Set the reactor's event file descriptor on our descriptor set. This, along with the file descriptors
	 * of RsslReactorChannels, will notify us when we should call rsslReactorDispatch(). */
	FD_SET(pProvThread->pReactor->eventFd, &pProvThread->readfds);

	nextTickTime = getTimeNano() + nsecPerTick;

	/* this is the main loop */
	while(rtrLikely(!signal_shutdown))
	{
		/* Loop on select(), looking for channels with available data, until the next tick time is reached. */
		do
		{
#ifdef WIN32
			/* Windows does not allow select() to be called with empty file descriptor sets. */
			if (pProvThread->readfds.fd_count == 0)
			{
				currentTime = getTimeNano();
				selRet = 0;
				Sleep((DWORD)((currentTime < nextTickTime) ? (nextTickTime - currentTime)/1000000 : 0));
			}
			else
#endif
			{
				/* select() modifies the sets it is given, so work on copies. */
				useRead = pProvThread->readfds;
				useWrt = pProvThread->wrtfds;
				useExcept = pProvThread->exceptfds;

				currentTime = getTimeNano();

				/* Time left in this tick, in microseconds. Split it into seconds and
				 * microseconds so tv_usec always stays below one million, even when a
				 * tick lasts a second or longer. */
				remainingUsec = (currentTime < nextTickTime) ? (nextTickTime - currentTime)/1000 : 0;
				time_interval.tv_sec = (long)(remainingUsec / 1000000);
				time_interval.tv_usec = (long)(remainingUsec % 1000000);

				selRet = select(FD_SETSIZE, &useRead, &useWrt, &useExcept, &time_interval);
			}

			if (selRet == 0)
			{
				/* Timed out: the tick is over. */
				break;
			}
			else if (selRet > 0)
			{	
				/* Keep dispatching while the reactor reports more to do (positive return). */
				while ((ret = rsslReactorDispatch(pProvThread->pReactor, &dispatchOptions, &rsslErrorInfo)) > RSSL_RET_SUCCESS) {}
				if (ret < RSSL_RET_SUCCESS)
				{
					printf("rsslReactorDispatch failed with return code: %d error = %s\n", ret,  rsslErrorInfo.rsslError.text);
					exit(-1);
				}
			}
#ifdef WIN32
			else if (WSAGetLastError() != WSAEINTR)
#else 
			else if (errno != EINTR)
#endif
			{
				/* An interrupted select() is retried; any other error is fatal. */
				perror("select");
				exit(-1);
			}
		} while (currentTime < nextTickTime);

		nextTickTime += nsecPerTick;

		providerThreadSendMsgBurst(pProvThread, nextTickTime);
	}

	return RSSL_THREAD_RETURN();
}
RSSL_THREAD_DECLARE(runNIProvConnection, pArg)
{

	ProviderThread *pProviderThread = (ProviderThread*)pArg;
	RsslError error;

	TimeValue nextTickTime;
	RsslInt32 currentTicks = 0;
	RsslConnectOptions copts;

	if (pProviderThread->cpuId >= 0)
	{
		if (bindThread(pProviderThread->cpuId) != RSSL_RET_SUCCESS)
		{
			printf("Error: Failed to bind thread to core %d.\n", pProviderThread->cpuId);
			exit(-1);
		}
	}

	/* Configure connection options. */
	rsslClearConnectOpts(&copts);
	copts.guaranteedOutputBuffers = niProvPerfConfig.guaranteedOutputBuffers;
	copts.majorVersion = RSSL_RWF_MAJOR_VERSION;
	copts.minorVersion = RSSL_RWF_MINOR_VERSION;
	copts.protocolType = RSSL_RWF_PROTOCOL_TYPE;
	copts.sysSendBufSize = niProvPerfConfig.sendBufSize;
	copts.sysRecvBufSize = niProvPerfConfig.recvBufSize;
	if (niProvPerfConfig.sAddr || niProvPerfConfig.rAddr)
	{
		if (niProvPerfConfig.connectionType != RSSL_CONN_TYPE_RELIABLE_MCAST)
		{
			printf("Error: Attempting non-multicast segmented connection.\n");
			exit(-1);
		}

		copts.connectionInfo.segmented.recvAddress = niProvPerfConfig.recvAddr;
		copts.connectionInfo.segmented.recvServiceName = niProvPerfConfig.recvPort;
		copts.connectionInfo.segmented.sendAddress = niProvPerfConfig.sendAddr;
		copts.connectionInfo.segmented.sendServiceName = niProvPerfConfig.sendPort;
		copts.connectionInfo.segmented.interfaceName = niProvPerfConfig.interfaceName;
		copts.connectionInfo.unified.unicastServiceName = niProvPerfConfig.unicastPort;		
		copts.connectionType = RSSL_CONN_TYPE_RELIABLE_MCAST;
	}
	else
	{
		copts.connectionInfo.unified.address = niProvPerfConfig.hostName;
		copts.connectionInfo.unified.serviceName = niProvPerfConfig.portNo;
		copts.connectionInfo.unified.interfaceName = niProvPerfConfig.interfaceName;
		copts.tcp_nodelay = niProvPerfConfig.tcpNoDelay;
		copts.connectionType = RSSL_CONN_TYPE_SOCKET;
	}

	/* Setup connection. */

	do
	{
		ProviderSession *pProvSession;
		RsslChannel *pChannel;
		RsslRet ret;

		if (niProvPerfConfig.sAddr || niProvPerfConfig.rAddr)
			printf("\nAttempting segmented connect to server %s:%s  %s:%s unicastPort %s...\n", 
					niProvPerfConfig.sendAddr, niProvPerfConfig.sendPort, niProvPerfConfig.recvAddr, niProvPerfConfig.recvPort, niProvPerfConfig.unicastPort);
		else
			printf("\nAttempting unified connect to server %s:%s...\n", 
					niProvPerfConfig.hostName, niProvPerfConfig.portNo) ;

		if (!(pChannel = rsslConnect(&copts, &error)))
		{
			printf("rsslConnect() failed: %s(%s)\n", rsslRetCodeToString(error.rsslErrorId),
					error.text);
			SLEEP(1);
			continue;
		}

		if (!(pProvSession = providerSessionCreate(pProviderThread, pChannel)))
		{
			printf("providerSessionInit() failed\n");
			exit(-1);
		}
		
		do
		{
			ret = channelHandlerWaitForChannelInit(&pProviderThread->channelHandler, 
					pProvSession->pChannelInfo, 100000);
		} while (!signal_shutdown && ret == RSSL_RET_CHAN_INIT_IN_PROGRESS);

		if (ret < RSSL_RET_SUCCESS)
		{
			SLEEP(1);
			continue;
		}
		else
			break; /* Successful initialization. */

	} while (!signal_shutdown);



	nextTickTime = getTimeNano() + nsecPerTick;

	/* this is the main loop */
	while(rtrLikely(!signal_shutdown))
	{
		for (currentTicks = 0; currentTicks < providerThreadConfig.ticksPerSec; ++currentTicks)
		{
			providerThreadRead(pProviderThread, nextTickTime);

			nextTickTime += nsecPerTick;

			providerThreadSendMsgBurst(pProviderThread, nextTickTime);
		}

		providerThreadCheckPings(pProviderThread);

	}

	return RSSL_THREAD_RETURN();
}
// Creates the worker threads for a context.
//
// pContext - context handed to every worker through its THREAD_DATA and
//            updated with the resulting worker count.
// pPool    - pool whose numThreads, inThreadShutdown, pThreadData and threads
//            members are filled in.
//
// The worker count is nodes * cores-per-node * threads-per-core, taken from the
// first node of the detected topology and limited by the KNOB_MAX_* settings.
// One slot (node 0, core 0, thread 0) is always left for the API thread. When
// no spare slot exists for a single worker, the pool is left empty and the
// SINGLE_THREADED knob is set instead.
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
    // The application thread is bound to HW thread 0.
    bindThread(0);

    CPUNumaNodes topology;
    CalculateProcessorTopology(topology);

    // Hardware limits, all read from the first node / first core.
    const uint32_t hwNodes          = (uint32_t)topology.size();
    const uint32_t hwCoresPerNode   = (uint32_t)topology[0].cores.size();
    const uint32_t hwThreadsPerCore = (uint32_t)topology[0].cores[0].threadIds.size();

    // Values actually used, after applying any non-zero knob limits.
    uint32_t numNodes        = hwNodes;
    uint32_t numCoresPerNode = hwCoresPerNode;
    uint32_t numHyperThreads = hwThreadsPerCore;

    if (KNOB_MAX_NUMA_NODES)
    {
        numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
    }
    if (KNOB_MAX_CORES_PER_NUMA_NODE)
    {
        numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE);
    }
    if (KNOB_MAX_THREADS_PER_CORE)
    {
        numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
    }

    uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;

    if (numThreads > KNOB_MAX_NUM_THREADS)
    {
        printf("WARNING: system thread count %u exceeds max %u, "
            "performance will be degraded\n",
            numThreads, KNOB_MAX_NUM_THREADS);
    }

    if (numThreads != 1)
    {
        // One HW thread is kept back for the API thread.
        numThreads--;
    }
    else if (numCoresPerNode < hwCoresPerNode)
    {
        // Single worker: widen the search by one core so it gets its own HW thread.
        ++numCoresPerNode;
    }
    else if (numHyperThreads < hwThreadsPerCore)
    {
        // ...or by one hyperthread...
        ++numHyperThreads;
    }
    else if (numNodes < hwNodes)
    {
        // ...or by one node.
        ++numNodes;
    }
    else
    {
        // Nothing spare: run everything on the API thread.
        pPool->numThreads = 0;
        SET_KNOB(SINGLE_THREADED, true);
        return;
    }

    pPool->numThreads = numThreads;
    pContext->NumWorkerThreads = pPool->numThreads;

    pPool->inThreadShutdown = false;
    pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));

    uint32_t nextWorker = 0;
    for (uint32_t nodeIdx = 0; nodeIdx < numNodes; ++nodeIdx)
    {
        auto& numaNode = topology[nodeIdx];

        for (uint32_t coreIdx = 0; coreIdx < numCoresPerNode; ++coreIdx)
        {
            auto& hwCore = numaNode.cores[coreIdx];

            for (uint32_t htIdx = 0; htIdx < numHyperThreads; ++htIdx)
            {
                // Node 0 / core 0 / thread 0 is reserved for the API thread.
                const bool reservedForApi = (nodeIdx == 0 && coreIdx == 0 && htIdx == 0);
                if (reservedForApi)
                {
                    continue;
                }

                // Fill in the worker's data before the thread that reads it is started.
                THREAD_DATA &workerData = pPool->pThreadData[nextWorker];
                workerData.workerId    = nextWorker;
                workerData.procGroupId = hwCore.procGroup;
                workerData.threadId    = hwCore.threadIds[htIdx];
                workerData.numaId      = nodeIdx;
                workerData.pContext    = pContext;

                pPool->threads[nextWorker] = new std::thread(workerThread, &workerData);

                ++nextWorker;
            }
        }
    }
}
// Worker thread entry point.
//
// pData is cast to THREAD_DATA*, which supplies the owning SWR_CONTEXT, the
// HW thread id / process group to bind to, the worker id and the NUMA id.
// The thread loops until pContext->threadPool.inThreadShutdown is observed:
// it spins briefly while its BE cursor equals DrawEnqueued, then sleeps on
// pContext->FifosNotEmpty, and otherwise runs BE, compute and FE work.
// Always returns 0.
DWORD workerThread(LPVOID pData)
{
    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pThreadData->pContext;
    uint32_t threadId = pThreadData->threadId;
    uint32_t workerId = pThreadData->workerId;

    bindThread(threadId, pThreadData->procGroupId); 

    RDTSC_INIT(threadId);

    // Passed to WorkOnFifoFE below.
    int numaNode = (int)pThreadData->numaId;

    // flush denormals to 0
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Track tiles locked by other threads. If we try to lock a macrotile and find its already
    // locked then we'll add it to this list so that we don't try and lock it again.
    std::unordered_set<uint32_t> lockedTiles;

    // each worker has the ability to work on any of the queued draws as long as certain
    // conditions are met. the data associated
    // with a draw is guaranteed to be active as long as a worker hasn't signaled that he 
    // has moved on to the next draw when he determines there is no more work to do. The api
    // thread will not increment the head of the dc ring until all workers have moved past the
    // current head.
    // the logic to determine what to work on is:
    // 1- try to work on the FE any draw that is queued. For now there are no dependencies
    //    on the FE work, so any worker can grab any FE and process in parallel.  Eventually
    //    we'll need dependency tracking to force serialization on FEs.  The worker will try
    //    to pick an FE by atomically incrementing a counter in the swr context.  he'll keep
    //    trying until he reaches the tail.
    // 2- BE work must be done in strict order. we accomplish this today by pulling work off
    //    the oldest draw (ie the head) of the dcRing. the worker can determine if there is
    //    any work left by comparing the total # of binned work items and the total # of completed
    //    work items. If they are equal, then there is no more work to do for this draw, and
    //    the worker can safely increment its oldestDraw counter and move on to the next draw.

    // Created unlocked (std::defer_lock); WaitLock is only taken around the idle wait below.
    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
    while (pContext->threadPool.inThreadShutdown == false)
    {
        // Spin for up to KNOB_WORKER_SPIN_LOOP_COUNT iterations while this worker's
        // BE cursor equals DrawEnqueued, before falling back to a blocking wait.
        uint32_t loop = 0;
        while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && pContext->WorkerBE[workerId] == pContext->DrawEnqueued)
        {
            _mm_pause();
        }

        if (pContext->WorkerBE[workerId] == pContext->DrawEnqueued)
        {
            lock.lock();

            // check for thread idle condition again under lock
            if (pContext->WorkerBE[workerId] != pContext->DrawEnqueued)
            {
                lock.unlock();
                continue;
            }

            // Do not go to sleep if shutdown was requested in the meantime.
            if (pContext->threadPool.inThreadShutdown)
            {
                lock.unlock();
                break;
            }

            RDTSC_START(WorkerWaitForThreadEvent);

            // Block until notified. The wait has no predicate; only the shutdown
            // flag is re-checked after it returns.
            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();

            RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);

            if (pContext->threadPool.inThreadShutdown)
            {
                break;
            }
        }

        // Each pass runs BE work, then compute work, then FE work, using this
        // worker's WorkerBE / WorkerFE cursors.
        RDTSC_START(WorkerWorkOnFifoBE);
        WorkOnFifoBE(pContext, workerId, pContext->WorkerBE[workerId], lockedTiles);
        RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);

        WorkOnCompute(pContext, workerId, pContext->WorkerBE[workerId]);

        WorkOnFifoFE(pContext, workerId, pContext->WorkerFE[workerId], numaNode);
    }

    return 0;
}