/* ============================================================================= * threadWait * -- Synchronizes all threads to start/stop parallel section * ============================================================================= */ static void threadWait (void* argPtr) { thread_args_t* args = (thread_args_t*) argPtr; long threadId = args->threadId; commits = &(args->commits); aborts = &(args->aborts); retriesProf = &(args->retries); ucbProf = &(args->ucb); int sz = 100; memoized_blocks = (memoized_choices_t*) malloc(sz * sizeof(memoized_choices_t)); for (sz--; sz >= 0; sz-- ) { memoized_choices_t* block = &(memoized_blocks[sz]); block->runs = 0; block->havingCapacityAborts = 0; block->retries = 0; block->commitsHTM = 1; block->believedCapacity = 1; block->believedTransient = 1; block->believedGiveUp = 1; block->abortsCapacity = 0; block->abortsTransient = 0; block->cyclesCapacity = 100; block->cyclesTransient = 100; block->cyclesGiveUp = 100; block->retries = 5; block->lastCycles = 0; block->lastRetries = 5; block->bestEverCycles = 0; block->bestEverRetries = 5; } randomFallback = random_alloc(); random_seed(randomFallback, time(NULL)); THREAD_LOCAL_SET(global_threadId, (long)threadId); bindThread(threadId); while (1) { THREAD_BARRIER(global_barrierPtr, threadId); /* wait for start parallel */ if (global_doShutdown) { break; } global_funcPtr(global_argPtr); THREAD_BARRIER(global_barrierPtr, threadId); /* wait for end parallel */ if (threadId == 0) { break; } } }
/// @brief Binds the calling API thread according to the reserved-thread table.
/// @param pContext     SWR context owning the thread pool; a null context is a no-op.
/// @param apiThreadId  Index of the API thread. Ids inside the reserved range use
///                     their own entry; ids outside it are bound to the process
///                     group of entry 0 when at least one reserved thread exists,
///                     and are left unbound otherwise.
void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId)
{
    if (pContext == nullptr)
    {
        return;
    }

    if (apiThreadId < pContext->threadPool.numReservedThreads)
    {
        // In range: use the binding recorded for this API thread.
        const THREAD_DATA &reserved = pContext->threadPool.pApiThreadData[apiThreadId];
        bindThread(pContext,
                   reserved.threadId,
                   reserved.procGroupId,
                   reserved.forceBindProcGroup);
        return;
    }

    if (pContext->threadPool.numReservedThreads)
    {
        // Out of range: just bind to the process group used for API thread 0.
        const THREAD_DATA &first = pContext->threadPool.pApiThreadData[0];
        bindThread(pContext, 0, first.procGroupId, true);
    }
}
RSSL_THREAD_DECLARE(runChannelConnectionHandler, pArg) { ProviderThread *pProvThread = (ProviderThread*)pArg; TimeValue nextTickTime; RsslInt32 currentTicks = 0; if (pProvThread->cpuId >= 0) { if (bindThread(pProvThread->cpuId) != RSSL_RET_SUCCESS) { printf("Error: Failed to bind thread to core %d.\n", pProvThread->cpuId); exit(-1); } } nextTickTime = getTimeNano() + nsecPerTick; /* this is the main loop */ while(rtrLikely(!signal_shutdown)) { for (currentTicks = 0; currentTicks < providerThreadConfig.ticksPerSec; ++currentTicks) { providerThreadRead(pProvThread, nextTickTime); nextTickTime += nsecPerTick; providerThreadSendMsgBurst(pProvThread, nextTickTime); providerThreadReceiveNewChannels(pProvThread); } providerThreadCheckPings(pProvThread); } return RSSL_THREAD_RETURN(); }
static void threadWait (void* argPtr) { long threadId = *(long*)argPtr; THREAD_LOCAL_SET(global_threadId, (long)threadId); bindThread(threadId); thread_id = threadId; while (1) { THREAD_BARRIER(global_barrierPtr, threadId); /* wait for start parallel */ if (global_doShutdown) { break; } global_funcPtr(global_argPtr); THREAD_BARRIER(global_barrierPtr, threadId); /* wait for end parallel */ if (threadId == 0) { break; } } }
/// @brief Main loop of an SWR worker thread.
/// @param pData  Pointer to this worker's THREAD_DATA.
/// @return Always 0, once shutdown has been reported and no draw is pending.
///
/// NOTE(review): IsFEThread and IsBEThread are not declared inside this block;
/// presumably they are template parameters or compile-time constants supplied
/// by the surrounding code -- confirm.
DWORD workerThreadMain(LPVOID pData)
{
    THREAD_DATA *pSelf    = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pSelf->pContext;
    uint32_t     threadId = pSelf->threadId;
    uint32_t     workerId = pSelf->workerId;

    bindThread(pContext, threadId, pSelf->procGroupId, pSelf->forceBindProcGroup);

    // Give the thread a name that encodes worker, NUMA node, core and HT index.
    {
        char threadName[64];
        sprintf_s(threadName,
#if defined(_WIN32)
                  "SWRWorker_%02d_NUMA%d_Core%02d_T%d",
#else
                  // linux pthread name limited to 16 chars (including \0)
                  "w%03d-n%d-c%03d-t%d",
#endif
                  workerId,
                  pSelf->numaId,
                  pSelf->coreId,
                  pSelf->htId);
        SetCurrentThreadName(threadName);
    }

    RDTSC_INIT(threadId);

    // Only the offset of the numa index from the base is needed for correct masking.
    uint32_t numaNode = pSelf->numaId - pContext->threadInfo.BASE_NUMA_NODE;
    uint32_t numaMask = pContext->threadPool.numaMask;

    // Flush denormals to 0.
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Macrotiles found locked by another thread are remembered here so that
    // this worker does not try to lock them again.
    TileSet lockedTiles;

    // Any worker may work on any queued draw, provided certain conditions hold.
    // The data for a draw stays alive until the worker signals that it has
    // moved on to the next draw, which it does once it finds no work left.
    // The API thread does not advance the head of the dc ring until every
    // worker has moved past the current head.
    //
    // Work is chosen as follows:
    // 1- Try the FE of any queued draw. FE work has no dependencies for now,
    //    so workers can process FEs in parallel; serializing FEs will
    //    eventually need dependency tracking. A worker claims an FE by
    //    atomically incrementing a counter in the swr context, and keeps
    //    trying until it reaches the tail.
    // 2- BE work must run in strict order, so it is pulled from the oldest
    //    draw (the head) of the dcRing. A draw is finished when its total
    //    binned work items equal its total completed work items. The worker
    //    can then safely bump its oldestDraw counter and move to the next draw.

    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    auto hasPendingDraw = [&](uint32_t draw) { return draw != pContext->dcRing.GetHead(); };

    uint32_t drawBE            = 0;
    uint32_t drawFE            = 0;
    bool     shutdownRequested = false;

    for (;;)
    {
        // Leave only after shutdown was reported and nothing is pending.
        if (shutdownRequested && !hasPendingDraw(drawBE))
        {
            break;
        }

        // Spin briefly before falling back to the condition variable.
        for (uint32_t spins = 0;
             spins < KNOB_WORKER_SPIN_LOOP_COUNT && !hasPendingDraw(drawBE);
             ++spins)
        {
            _mm_pause();
        }

        if (!hasPendingDraw(drawBE))
        {
            lock.lock();

            // Re-check the idle condition under the lock before sleeping.
            const bool stillIdle = !hasPendingDraw(drawBE);
            if (stillIdle)
            {
                pContext->FifosNotEmpty.wait(lock);
            }
            lock.unlock();

            if (!stillIdle)
            {
                continue;
            }
        }

        if (IsBEThread)
        {
            RDTSC_BEGIN(WorkerWorkOnFifoBE, 0);
            shutdownRequested |=
                WorkOnFifoBE(pContext, workerId, drawBE, lockedTiles, numaNode, numaMask);
            RDTSC_END(WorkerWorkOnFifoBE, 0);

            WorkOnCompute(pContext, workerId, drawBE);
        }

        if (IsFEThread)
        {
            WorkOnFifoFE(pContext, workerId, drawFE);

            // An FE-only thread tracks the FE cursor as its BE cursor.
            if (!IsBEThread)
            {
                drawBE = drawFE;
            }
        }
    }

    return 0;
}
RSSL_THREAD_DECLARE(runReactorConnectionHandler, pArg) { ProviderThread *pProvThread = (ProviderThread*)pArg; TimeValue nextTickTime; RsslInt32 currentTicks = 0; TimeValue currentTime; RsslRet ret; int selRet; struct timeval time_interval; fd_set useRead; fd_set useExcept; fd_set useWrt; RsslErrorInfo rsslErrorInfo; RsslReactorDispatchOptions dispatchOptions; RsslCreateReactorOptions reactorOpts; rsslClearReactorDispatchOptions(&dispatchOptions); if (pProvThread->cpuId >= 0) { if (bindThread(pProvThread->cpuId) != RSSL_RET_SUCCESS) { printf("Error: Failed to bind thread to core %d.\n", pProvThread->cpuId); exit(-1); } } // create reactor rsslClearCreateReactorOptions(&reactorOpts); if (!(pProvThread->pReactor = rsslCreateReactor(&reactorOpts, &rsslErrorInfo))) { printf("Reactor creation failed: %s\n", rsslErrorInfo.rsslError.text); cleanUpAndExit(); } FD_ZERO(&pProvThread->readfds); FD_ZERO(&pProvThread->wrtfds); FD_ZERO(&pProvThread->exceptfds); /* Set the reactor's event file descriptor on our descriptor set. This, along with the file descriptors * of RsslReactorChannels, will notify us when we should call rsslReactorDispatch(). */ FD_SET(pProvThread->pReactor->eventFd, &pProvThread->readfds); nextTickTime = getTimeNano() + nsecPerTick; /* this is the main loop */ while(rtrLikely(!signal_shutdown)) { /* Loop on select(), looking for channels with available data, until stopTimeNsec is reached. */ do { #ifdef WIN32 /* Windows does not allow select() to be called with empty file descriptor sets. */ if (pProvThread->readfds.fd_count == 0) { currentTime = getTimeNano(); selRet = 0; Sleep((DWORD)((currentTime < nextTickTime) ? (nextTickTime - currentTime)/1000000 : 0)); } else #endif { useRead = pProvThread->readfds; useWrt = pProvThread->wrtfds; useExcept = pProvThread->exceptfds; currentTime = getTimeNano(); time_interval.tv_usec = (long)((currentTime < nextTickTime) ? 
(nextTickTime - currentTime)/1000 : 0); time_interval.tv_sec = 0; selRet = select(FD_SETSIZE, &useRead, &useWrt, &useExcept, &time_interval); } if (selRet == 0) { break; } else if (selRet > 0) { while ((ret = rsslReactorDispatch(pProvThread->pReactor, &dispatchOptions, &rsslErrorInfo)) > RSSL_RET_SUCCESS) {} if (ret < RSSL_RET_SUCCESS) { printf("rsslReactorDispatch failed with return code: %d error = %s\n", ret, rsslErrorInfo.rsslError.text); exit(-1); } } #ifdef WIN32 else if (WSAGetLastError() != WSAEINTR) #else else if (errno != EINTR) #endif { perror("select"); exit(-1); } } while (currentTime < nextTickTime); nextTickTime += nsecPerTick; providerThreadSendMsgBurst(pProvThread, nextTickTime); } return RSSL_THREAD_RETURN(); }
RSSL_THREAD_DECLARE(runNIProvConnection, pArg) { ProviderThread *pProviderThread = (ProviderThread*)pArg; RsslError error; TimeValue nextTickTime; RsslInt32 currentTicks = 0; RsslConnectOptions copts; if (pProviderThread->cpuId >= 0) { if (bindThread(pProviderThread->cpuId) != RSSL_RET_SUCCESS) { printf("Error: Failed to bind thread to core %d.\n", pProviderThread->cpuId); exit(-1); } } /* Configure connection options. */ rsslClearConnectOpts(&copts); copts.guaranteedOutputBuffers = niProvPerfConfig.guaranteedOutputBuffers; copts.majorVersion = RSSL_RWF_MAJOR_VERSION; copts.minorVersion = RSSL_RWF_MINOR_VERSION; copts.protocolType = RSSL_RWF_PROTOCOL_TYPE; copts.sysSendBufSize = niProvPerfConfig.sendBufSize; copts.sysRecvBufSize = niProvPerfConfig.recvBufSize; if (niProvPerfConfig.sAddr || niProvPerfConfig.rAddr) { if (niProvPerfConfig.connectionType != RSSL_CONN_TYPE_RELIABLE_MCAST) { printf("Error: Attempting non-multicast segmented connection.\n"); exit(-1); } copts.connectionInfo.segmented.recvAddress = niProvPerfConfig.recvAddr; copts.connectionInfo.segmented.recvServiceName = niProvPerfConfig.recvPort; copts.connectionInfo.segmented.sendAddress = niProvPerfConfig.sendAddr; copts.connectionInfo.segmented.sendServiceName = niProvPerfConfig.sendPort; copts.connectionInfo.segmented.interfaceName = niProvPerfConfig.interfaceName; copts.connectionInfo.unified.unicastServiceName = niProvPerfConfig.unicastPort; copts.connectionType = RSSL_CONN_TYPE_RELIABLE_MCAST; } else { copts.connectionInfo.unified.address = niProvPerfConfig.hostName; copts.connectionInfo.unified.serviceName = niProvPerfConfig.portNo; copts.connectionInfo.unified.interfaceName = niProvPerfConfig.interfaceName; copts.tcp_nodelay = niProvPerfConfig.tcpNoDelay; copts.connectionType = RSSL_CONN_TYPE_SOCKET; } /* Setup connection. 
*/ do { ProviderSession *pProvSession; RsslChannel *pChannel; RsslRet ret; if (niProvPerfConfig.sAddr || niProvPerfConfig.rAddr) printf("\nAttempting segmented connect to server %s:%s %s:%s unicastPort %s...\n", niProvPerfConfig.sendAddr, niProvPerfConfig.sendPort, niProvPerfConfig.recvAddr, niProvPerfConfig.recvPort, niProvPerfConfig.unicastPort); else printf("\nAttempting unified connect to server %s:%s...\n", niProvPerfConfig.hostName, niProvPerfConfig.portNo) ; if (!(pChannel = rsslConnect(&copts, &error))) { printf("rsslConnect() failed: %s(%s)\n", rsslRetCodeToString(error.rsslErrorId), error.text); SLEEP(1); continue; } if (!(pProvSession = providerSessionCreate(pProviderThread, pChannel))) { printf("providerSessionInit() failed\n"); exit(-1); } do { ret = channelHandlerWaitForChannelInit(&pProviderThread->channelHandler, pProvSession->pChannelInfo, 100000); } while (!signal_shutdown && ret == RSSL_RET_CHAN_INIT_IN_PROGRESS); if (ret < RSSL_RET_SUCCESS) { SLEEP(1); continue; } else break; /* Successful initialization. */ } while (!signal_shutdown); nextTickTime = getTimeNano() + nsecPerTick; /* this is the main loop */ while(rtrLikely(!signal_shutdown)) { for (currentTicks = 0; currentTicks < providerThreadConfig.ticksPerSec; ++currentTicks) { providerThreadRead(pProviderThread, nextTickTime); nextTickTime += nsecPerTick; providerThreadSendMsgBurst(pProviderThread, nextTickTime); } providerThreadCheckPings(pProviderThread); } return RSSL_THREAD_RETURN(); }
/// @brief Sizes the worker pool from the processor topology and starts one
///        worker thread per selected hardware thread.
/// @param pContext  SWR context handed to every worker; receives NumWorkerThreads.
/// @param pPool     Pool to populate (numThreads, pThreadData, threads).
///
/// The calling (API) thread is bound to HW thread 0 and node 0 / core 0 /
/// thread 0 is skipped when assigning workers. The pool falls back to
/// single-threaded mode (numThreads = 0, SINGLE_THREADED knob set) when no
/// extra hardware thread is available, when the reported topology is empty,
/// or when the per-thread data cannot be allocated.
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
    // Bind application thread to HW thread 0
    bindThread(0);

    CPUNumaNodes nodes;
    CalculateProcessorTopology(nodes);

    // The sizing below indexes nodes[0].cores[0]; an empty topology would be
    // undefined behavior there (and a zero count would wrap numThreads), so
    // run single-threaded instead.
    if (nodes.size() == 0 ||
        nodes[0].cores.size() == 0 ||
        nodes[0].cores[0].threadIds.size() == 0)
    {
        pPool->numThreads = 0;
        SET_KNOB(SINGLE_THREADED, true);
        return;
    }

    // Hardware limits are taken from node 0 / core 0.
    // NOTE(review): the worker loop below indexes every node and core with
    // these counts; it presumably relies on a uniform topology -- confirm.
    uint32_t numHWNodes        = (uint32_t)nodes.size();
    uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
    uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();

    uint32_t numNodes        = numHWNodes;
    uint32_t numCoresPerNode = numHWCoresPerNode;
    uint32_t numHyperThreads = numHWHyperThreads;

    // A knob value of 0 means "no limit".
    if (KNOB_MAX_NUMA_NODES)
    {
        numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
    }

    if (KNOB_MAX_CORES_PER_NUMA_NODE)
    {
        numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE);
    }

    if (KNOB_MAX_THREADS_PER_CORE)
    {
        numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
    }

    // Calculate numThreads
    uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;

    // NOTE(review): this only warns; if pPool->threads is sized by
    // KNOB_MAX_NUM_THREADS the loop below could write past it -- confirm.
    if (numThreads > KNOB_MAX_NUM_THREADS)
    {
        printf("WARNING: system thread count %u exceeds max %u, "
            "performance will be degraded\n",
            numThreads, KNOB_MAX_NUM_THREADS);
    }

    if (numThreads == 1)
    {
        // If only 1 worker thread, try to move it to an available
        // HW thread.  If that fails, use the API thread.
        if (numCoresPerNode < numHWCoresPerNode)
        {
            numCoresPerNode++;
        }
        else if (numHyperThreads < numHWHyperThreads)
        {
            numHyperThreads++;
        }
        else if (numNodes < numHWNodes)
        {
            numNodes++;
        }
        else
        {
            pPool->numThreads = 0;
            SET_KNOB(SINGLE_THREADED, true);
            return;
        }
    }
    else
    {
        // Save a HW thread for the API thread.
        numThreads--;
    }

    pPool->numThreads = numThreads;
    pContext->NumWorkerThreads = pPool->numThreads;

    pPool->inThreadShutdown = false;
    pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
    if (pPool->pThreadData == nullptr)
    {
        // Without per-thread data no worker can be started; use the same
        // single-threaded fallback as above rather than writing through null.
        pPool->numThreads = 0;
        pContext->NumWorkerThreads = 0;
        SET_KNOB(SINGLE_THREADED, true);
        return;
    }

    uint32_t workerId = 0;
    for (uint32_t n = 0; n < numNodes; ++n)
    {
        auto& node = nodes[n];

        uint32_t numCores = numCoresPerNode;
        for (uint32_t c = 0; c < numCores; ++c)
        {
            auto& core = node.cores[c];
            for (uint32_t t = 0; t < numHyperThreads; ++t)
            {
                if (c == 0 && n == 0 && t == 0)
                {
                    // Skip core 0, thread0  on node 0 to reserve for API thread
                    continue;
                }

                pPool->pThreadData[workerId].workerId = workerId;
                pPool->pThreadData[workerId].procGroupId = core.procGroup;
                pPool->pThreadData[workerId].threadId = core.threadIds[t];
                pPool->pThreadData[workerId].numaId = n;
                pPool->pThreadData[workerId].pContext = pContext;
                pPool->threads[workerId] = new std::thread(workerThread, &pPool->pThreadData[workerId]);

                ++workerId;
            }
        }
    }
}
/// @brief Main loop of an SWR worker thread.
/// @param pData  Pointer to this worker's THREAD_DATA.
/// @return Always 0, once threadPool.inThreadShutdown has been observed.
DWORD workerThread(LPVOID pData)
{
    THREAD_DATA *pSelf    = (THREAD_DATA*)pData;
    SWR_CONTEXT *pContext = pSelf->pContext;
    uint32_t     threadId = pSelf->threadId;
    uint32_t     workerId = pSelf->workerId;

    bindThread(threadId, pSelf->procGroupId);

    RDTSC_INIT(threadId);

    int numaNode = (int)pSelf->numaId;

    // Flush denormals to 0.
    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);

    // Macrotiles found locked by another thread are remembered here so that
    // this worker does not try to lock them again.
    std::unordered_set<uint32_t> lockedTiles;

    // Any worker may work on any queued draw, provided certain conditions hold.
    // The data for a draw stays alive until the worker signals that it has
    // moved on to the next draw, which it does once it finds no work left.
    // The API thread does not advance the head of the dc ring until every
    // worker has moved past the current head.
    //
    // Work is chosen as follows:
    // 1- Try the FE of any queued draw. FE work has no dependencies for now,
    //    so workers can process FEs in parallel; serializing FEs will
    //    eventually need dependency tracking. A worker claims an FE by
    //    atomically incrementing a counter in the swr context, and keeps
    //    trying until it reaches the tail.
    // 2- BE work must run in strict order, so it is pulled from the oldest
    //    draw (the head) of the dcRing. A draw is finished when its total
    //    binned work items equal its total completed work items. The worker
    //    can then safely bump its oldestDraw counter and move to the next draw.

    std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);

    // The worker is idle while its BE cursor equals the enqueued-draw counter.
    auto isIdle = [&]() { return pContext->WorkerBE[workerId] == pContext->DrawEnqueued; };

    while (!pContext->threadPool.inThreadShutdown)
    {
        // Spin briefly before falling back to the condition variable.
        for (uint32_t spins = 0; spins < KNOB_WORKER_SPIN_LOOP_COUNT && isIdle(); ++spins)
        {
            _mm_pause();
        }

        if (isIdle())
        {
            lock.lock();

            // Re-check the idle condition under the lock before sleeping.
            if (!isIdle())
            {
                lock.unlock();
                continue;
            }

            if (pContext->threadPool.inThreadShutdown)
            {
                lock.unlock();
                break;
            }

            RDTSC_START(WorkerWaitForThreadEvent);

            pContext->FifosNotEmpty.wait(lock);
            lock.unlock();

            RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0);

            // Shutdown may have been requested while this worker was waiting.
            if (pContext->threadPool.inThreadShutdown)
            {
                break;
            }
        }

        RDTSC_START(WorkerWorkOnFifoBE);
        WorkOnFifoBE(pContext, workerId, pContext->WorkerBE[workerId], lockedTiles);
        RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);

        WorkOnCompute(pContext, workerId, pContext->WorkerBE[workerId]);

        WorkOnFifoFE(pContext, workerId, pContext->WorkerFE[workerId], numaNode);
    }

    return 0;
}