void CXtzThreadPool::AddWork( WorkerFuntion worker, void* arg )
{
    Lock<SpinLock> lock( m_lockObject );

    if ( m_threadquere.empty() )
    {
        // No idle thread available: queue the job for later pickup.
        m_workquere.emplace_back( worker, arg );
    }
    else
    {
        // An idle thread exists: hand the job to it directly and wake it.
        CXtzThread* thread = m_threadquere.front( );
        m_threadquere.pop_front( );

        WORK threadWork( worker, arg );
        thread->SetWork( threadWork );
        thread->WakeUp( );
    }
}
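The two branches above only make sense next to the consumer side: when every thread is busy the job is queued, and when a thread is idle the job is handed over directly. A minimal sketch of what the matching worker loop could look like, assuming WORK stores the function/argument pair and that a thread parks itself on m_threadquere before sleeping; everything beyond the member names used in AddWork (m_pool, m_work, WaitForWork) is hypothetical:

// Hypothetical worker loop; only the member names used in AddWork above
// are taken from the source.
void CXtzThread::Run()
{
    for ( ;; )
    {
        WaitForWork( );                   // assumed: blocks until WakeUp()

        for ( ;; )
        {
            m_work.worker( m_work.arg );  // run the job set via SetWork()

            Lock<SpinLock> lock( m_pool->m_lockObject );
            if ( m_pool->m_workquere.empty( ) )
            {
                // Nothing pending: rejoin the idle list and go back to sleep.
                m_pool->m_threadquere.push_back( this );
                break;
            }

            // Drain work that queued up while every thread was busy.
            m_work = m_pool->m_workquere.front( );
            m_pool->m_workquere.pop_front( );
        }
    }
}

The direct handoff in AddWork avoids a queue round-trip on the common fast path; the inner drain loop covers the case where jobs accumulated while no thread was idle.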
// Initialize per-thread arrays.
uint32_t threadMain(void* thread_id)
{
   g_num_elements_initial = g_num_elements;
   uint32_t tid = *(uint32_t*) thread_id;

   // Accesses should hit cache-line addresses, to eliminate spatial locality
   // and force more misses. Therefore, make each request's share of the array
   // a multiple of the stride, rounding up, so the array still divides evenly
   // once we split it into per-request pieces.
   if ((g_num_elements / g_num_requests) % CACHELINE_SZ != 0
      || g_num_elements < g_num_requests)
   {
      uint32_t num_elements_per_req = g_num_elements / g_num_requests;
      num_elements_per_req +=
         CACHELINE_SZ - (num_elements_per_req % CACHELINE_SZ);
      g_num_elements = g_num_requests * num_elements_per_req;
   }

#ifdef DEBUG
   fprintf(stderr, "adjusted size of array = %u\n", g_num_elements);
#endif

   uint32_t* arr_n_ptr = (uint32_t*) malloc(g_num_elements * sizeof(uint32_t));

   for (uint32_t i = 0; i < g_num_elements; i++)
      arr_n_ptr[i] = 1337;

   uint32_t stride = CACHELINE_SZ;

   // Provide each request its own "array" to pointer chase on. This prevents
   // the processor from consolidating request streams. The fact that we are
   // using a single array to hold all of this is a bit too "clever", but it
   // saves cycles in the critical loop from figuring out which array to use.
   for (uint32_t i = 0; i < g_num_requests; i++)
   {
      uint32_t num_elements_per_req = g_num_elements / g_num_requests;
      initializeGlobalArrays(arr_n_ptr, num_elements_per_req, stride,
                             i * num_elements_per_req);
   }

   // This volatile ret_val is crucial; otherwise the entire run-loop
   // gets optimized away!
   uint32_t volatile ret_val = threadWork(tid, arr_n_ptr);
   return ret_val;
}
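For reference, a self-contained sketch of the pointer-chase pattern this setup implies: each array element stores the index of the next element to visit, one cache line away, so every load depends on the result of the previous one. initChase and chase below are hypothetical stand-ins for initializeGlobalArrays() and threadWork(), whose bodies are not shown here (the real initializer may, for example, randomize the traversal order to defeat prefetching):

#include <stdint.h>
#include <stdlib.h>

// Hedged sketch: builds one chase cycle at cache-line stride and walks it.
#define CACHELINE_SZ 16   // uint32_t elements per 64-byte line; assumed value

static void initChase(uint32_t* arr, uint32_t num_elements,
                      uint32_t stride, uint32_t offset)
{
   uint32_t i = 0;
   for (; i + stride < num_elements; i += stride)
      arr[offset + i] = offset + i + stride;  // each element points to the next
   arr[offset + i] = offset;                  // close the cycle
}

static uint32_t chase(const uint32_t* arr, uint32_t start, uint32_t iters)
{
   uint32_t idx = start;
   for (uint32_t k = 0; k < iters; k++)
      idx = arr[idx];   // dependent loads: each address comes from the last one
   return idx;          // returning the final index keeps the loop observable
}

int main(void)
{
   uint32_t n = 1024 * CACHELINE_SZ;
   uint32_t* arr = (uint32_t*) malloc(n * sizeof(uint32_t));
   initChase(arr, n, CACHELINE_SZ, 0);
   volatile uint32_t sink = chase(arr, 0, 1u << 20);  // volatile defeats DCE
   (void) sink;
   free(arr);
   return 0;
}

The same need to keep results observable explains the volatile ret_val in threadMain above: without a visible side effect, the compiler is free to delete the entire measurement loop.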