int EnterTaskManager ()
{
  if (task_manager)
    {
      // a task manager is already running; nothing to start
      return 0;
    }

  task_manager = new TaskManager();

  cout << IM(3) << "task-based parallelization (C++11 threads) using "
       << task_manager->GetNumThreads() << " threads" << endl;

#ifdef USE_NUMA
  numa_run_on_node (0);
#endif

#ifndef WIN32
  // master has maximal priority !
  int policy;
  struct sched_param param;
  pthread_getschedparam(pthread_self(), &policy, &param);
  param.sched_priority = sched_get_priority_max(policy);
  pthread_setschedparam(pthread_self(), policy, &param);
#endif // WIN32

  task_manager->StartWorkers();

  ParallelFor (Range(100), [&] (int i) { ; });    // startup
  return task_manager->GetNumThreads();
}
int main(int argc, char** argv)
{
    if (argc < 3) {
        std::cout << "Usage:" << std::endl
                  << "gland <localNodeId> <remoteNodeId> [--human]" << std::endl;
        exit(EXIT_FAILURE);
    }

    if (argc == 4 && strncmp(argv[3], "--human", 7) == 0) {
        std::cout << "enabling human readable output" << std::endl;
        humanreadable = true;
    }

    size_t localNode = std::stoi(argv[1]);
    size_t remoteNode = std::stoi(argv[2]);

    numa_run_on_node(localNode);

    double *a = (double*) numa_alloc_onnode(N * sizeof(double), localNode);
    double *b = (double*) numa_alloc_onnode(N * sizeof(double), remoteNode);

    for (int i = 0; i < N; i++) {
        a[i] = (double)i;
        b[i] = (double)-i;
    }

    while (1)
        memcpy_task(a, b);
}
dd::GibbsSampling::GibbsSampling(FactorGraph * const _p_fg, CmdParser * const _p_cmd_parser,
                                 int n_datacopy, bool sample_evidence, int burn_in,
                                 bool learn_non_evidence)
  : p_fg(_p_fg), p_cmd_parser(_p_cmd_parser), sample_evidence(sample_evidence),
    burn_in(burn_in), learn_non_evidence(learn_non_evidence)
{
  // the highest NUMA node number available
  n_numa_nodes = numa_max_node();

  // if n_datacopy is valid, use it; otherwise use numa_max_node
  if (n_datacopy >= 1 && n_datacopy <= n_numa_nodes + 1) {
    n_numa_nodes = n_datacopy - 1;
  }

  // max possible threads per NUMA node
  n_thread_per_numa = (sysconf(_SC_NPROCESSORS_CONF)) / (n_numa_nodes + 1);

  this->factorgraphs.push_back(*p_fg);

  // copy factor graphs
  for (int i = 1; i <= n_numa_nodes; i++) {
    numa_run_on_node(i);
    numa_set_localalloc();

    std::cout << "CREATE FG ON NODE ..." << i << std::endl;
    dd::FactorGraph fg(p_fg->n_var, p_fg->n_factor, p_fg->n_weight, p_fg->n_edge);
    fg.copy_from(p_fg);
    this->factorgraphs.push_back(fg);
  }
};
static void send_from_node(int node_id, int family, int proto)
{
	struct sockaddr_storage saddr, daddr;
	struct sockaddr_in *saddr4, *daddr4;
	struct sockaddr_in6 *saddr6, *daddr6;
	int fd;

	switch (family) {
	case AF_INET:
		saddr4 = (struct sockaddr_in *)&saddr;
		saddr4->sin_family = AF_INET;
		saddr4->sin_addr.s_addr = htonl(INADDR_ANY);
		saddr4->sin_port = 0;

		daddr4 = (struct sockaddr_in *)&daddr;
		daddr4->sin_family = AF_INET;
		daddr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		daddr4->sin_port = htons(PORT);
		break;
	case AF_INET6:
		saddr6 = (struct sockaddr_in6 *)&saddr;
		saddr6->sin6_family = AF_INET6;
		saddr6->sin6_addr = in6addr_any;
		saddr6->sin6_port = 0;

		daddr6 = (struct sockaddr_in6 *)&daddr;
		daddr6->sin6_family = AF_INET6;
		daddr6->sin6_addr = in6addr_loopback;
		daddr6->sin6_port = htons(PORT);
		break;
	default:
		error(1, 0, "Unsupported family %d", family);
	}

	if (numa_run_on_node(node_id) < 0)
		error(1, errno, "failed to pin to node");

	fd = socket(family, proto, 0);
	if (fd < 0)
		error(1, errno, "failed to create send socket");

	if (bind(fd, (struct sockaddr *)&saddr, sizeof(saddr)))
		error(1, errno, "failed to bind send socket");

	if (connect(fd, (struct sockaddr *)&daddr, sizeof(daddr)))
		error(1, errno, "failed to connect send socket");

	if (send(fd, "a", 1, 0) < 0)
		error(1, errno, "failed to send message");

	close(fd);
}
void ConfigureTableThread()
{
  int32_t idx = ThreadContext::get_id() - GlobalContext::get_head_table_thread_id();
  int32_t node_id = idx % num_mem_nodes_;

  CHECK_EQ(numa_run_on_node(node_id), 0);

  struct bitmask *mask = numa_allocate_nodemask();
  mask = numa_bitmask_setbit(mask, node_id);

  // use the "preferred" (non-strict) policy when binding memory to the node
  numa_set_bind_policy(0);
  numa_set_membind(mask);
  numa_free_nodemask(mask);
}
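// Several of the snippets in this listing combine numa_run_on_node() with a
// memory-binding call. The following is a minimal, self-contained sketch of
// that common pattern; it is illustrative only and not taken from any of the
// projects above. The node index 0 is an arbitrary example (build with -lnuma).
#include <numa.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    if (numa_available() < 0) {
        fprintf(stderr, "NUMA is not available on this system\n");
        return EXIT_FAILURE;
    }

    int node = 0;  /* arbitrary example node */

    /* restrict the calling thread to the CPUs of `node` */
    if (numa_run_on_node(node) != 0) {
        perror("numa_run_on_node");
        return EXIT_FAILURE;
    }

    /* bind this thread's future allocations to the same node */
    struct bitmask *mask = numa_allocate_nodemask();
    numa_bitmask_setbit(mask, node);
    numa_set_membind(mask);
    numa_free_nodemask(mask);

    /* ... allocate and touch memory here; pages are placed on `node` ... */
    return EXIT_SUCCESS;
}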
// worker thread code
void* ThreadFunction(void* p)
{
    struct TArgs *pargs = (struct TArgs*)(p);

    // bind to a specific NUMA node
    numa_run_on_node(pargs->node);

    // create a local random list
    char *rand_list = (char*)mai_malloc(MAJOR_COUNT);
    if (rand_list == NULL) {
        printf("Unable to allocate random list!\n");
        return NULL;
    }
    memcpy(rand_list, rand_nums + (MAJOR_COUNT * pargs->no), MAJOR_COUNT);

    // get the start tick count
    unsigned long long tstart = get_rdtsc_sys();

    // repeat the tests
    int repcount = 0;
    for (repcount = 0; repcount < MAJOR_COUNT; repcount++) {
        // allocate a randomly sized memory block
        int sizeclass = rand_list[repcount];
        size_t block_size = 1 << sizeclass;
        char *pdata = (char *)mai_malloc(block_size);
        if (pdata == NULL) {
            printf("thread %d failed to allocate %zu bytes of memory!\n",
                   pargs->no, block_size);
            continue;
        }

        // access the memory
        int q;
        for (q = 0; q < MINOR_COUNT; q++)
            AccessMemory(pdata, block_size);

        // free the memory
        mai_free(pdata);
    }

    // get the end tick count
    unsigned long long tend = get_rdtsc_sys();

    // calculate the tick count needed to complete the tests
    pargs->tdiff = tend - tstart;

    // free the random list
    mai_free(rand_list);

    return NULL;
}
void numaLoop(int numaNode, int numberOfThreads)
{
    m_timers.start(omp_get_max_threads());

#pragma omp parallel num_threads(numberOfThreads)
    {
        // tie thread to a NUMA node
        numa_run_on_node(numaNode);
        numa_set_preferred(numaNode);

        executeThreadWork();
    } // end omp parallel

    m_timers.stop();

    assert(m_queue.unsafe_size() == 0);
}
void INTERNAL qt_affinity_set(qthread_worker_t *me, unsigned int Q_UNUSED(nw))
{   /*{{{ */
    assert(me);

    qthread_shepherd_t *const myshep = me->shepherd;

    /* It would be nice if we could do something more specific than
     * "numa_run_on_node", but because sched_setaffinity() is so dangerous,
     * we really can't, in good conscience. */
    qthread_debug(AFFINITY_FUNCTIONS,
                  "calling numa_run_on_node(%i) for worker %i\n",
                  myshep->node, me->packed_worker_id);
    int ret = numa_run_on_node(myshep->node);
    if (ret != 0) {
        numa_error("setting thread affinity");
        abort();
    }
    numa_set_localalloc();
}   /*}}} */
void loop()
{
    m_timers.start(omp_get_max_threads());
    m_idle_count = 0;

#pragma omp parallel
    {
        if (gopt_memory_type == "mmap_node0") {
            // tie thread to first NUMA node
            numa_run_on_node(0);
            numa_set_preferred(0);
        }

        executeThreadWork();
    } // end omp parallel

    m_timers.stop();

    assert(m_queue.unsafe_size() == 0);
}
static void set_affinity(pid_t tid, int cpu_id)
{
    if (!get_shm()->active)
        return;

    if (!get_shm()->per_node) {
        // pin the thread to the single given core
        cpu_set_t mask;
        CPU_ZERO(&mask);
        CPU_SET(cpu_id, &mask);

        VERBOSE("--> Setting tid %d on core %d\n", tid, cpu_id);
        int r = old_sched_setaffinity(tid, sizeof(mask), &mask);
        if (r < 0) {
            fprintf(stderr, "couldn't set affinity on %d\n", cpu_id);
            exit(1);
        }
    } else {
        // pin the thread to all cores of the NUMA node owning that CPU
        int r = numa_run_on_node(numa_node_of_cpu(cpu_id));
        if (r < 0) {
            fprintf(stderr, "couldn't set affinity on node of cpu %d\n", cpu_id);
            exit(1);
        }
    }
}
/* static */
void ThreadPool::setThreadNodeAffinity(int numaNode)
{
#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
    GROUP_AFFINITY groupAffinity;
    if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
    {
        if (SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)groupAffinity.Mask))
            return;
    }
    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
#elif HAVE_LIBNUMA
    if (numa_available() >= 0)
    {
        numa_run_on_node(numaNode);
        numa_set_preferred(numaNode);
        numa_set_localalloc();
        return;
    }
    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
#else
    (void)numaNode;
#endif
}
void gibbs(dd::CmdParser & cmd_parser) {
  // number of NUMA nodes
  int n_numa_node = numa_max_node() + 1;
  // number of max threads per NUMA node
  int n_thread_per_numa = (sysconf(_SC_NPROCESSORS_CONF)) / (n_numa_node);

  // get command line arguments
  std::string fg_file = cmd_parser.fg_file->getValue();
  std::string weight_file = cmd_parser.weight_file->getValue();
  std::string variable_file = cmd_parser.variable_file->getValue();
  std::string factor_file = cmd_parser.factor_file->getValue();
  std::string edge_file = cmd_parser.edge_file->getValue();
  std::string meta_file = cmd_parser.meta_file->getValue();
  std::string output_folder = cmd_parser.output_folder->getValue();

  int n_learning_epoch = cmd_parser.n_learning_epoch->getValue();
  int n_samples_per_learning_epoch = cmd_parser.n_samples_per_learning_epoch->getValue();
  int n_inference_epoch = cmd_parser.n_inference_epoch->getValue();

  double stepsize = cmd_parser.stepsize->getValue();
  double stepsize2 = cmd_parser.stepsize2->getValue();
  // hack to support two parameters for specifying the step size
  if (stepsize == 0.01) stepsize = stepsize2;

  double decay = cmd_parser.decay->getValue();
  int n_datacopy = cmd_parser.n_datacopy->getValue();
  double reg_param = cmd_parser.reg_param->getValue();
  double reg1_param = cmd_parser.reg1_param->getValue();
  bool is_quiet = cmd_parser.quiet->getValue();
  bool sample_evidence = cmd_parser.sample_evidence->getValue();
  int burn_in = cmd_parser.burn_in->getValue();
  bool learn_non_evidence = cmd_parser.learn_non_evidence->getValue();

  Meta meta = read_meta(fg_file);

  if (is_quiet) {
    std::cout << "Running in quiet mode..." << std::endl;
  } else {
    std::cout << std::endl;
    std::cout << "#################MACHINE CONFIG#################" << std::endl;
    std::cout << "# # NUMA Node : " << n_numa_node << std::endl;
    std::cout << "# # Thread/NUMA Node : " << n_thread_per_numa << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << std::endl;
    std::cout << "#################GIBBS SAMPLING#################" << std::endl;
    std::cout << "# fg_file : " << fg_file << std::endl;
    std::cout << "# edge_file : " << edge_file << std::endl;
    std::cout << "# weight_file : " << weight_file << std::endl;
    std::cout << "# variable_file : " << variable_file << std::endl;
    std::cout << "# factor_file : " << factor_file << std::endl;
    std::cout << "# meta_file : " << meta_file << std::endl;
    std::cout << "# output_folder : " << output_folder << std::endl;
    std::cout << "# n_learning_epoch : " << n_learning_epoch << std::endl;
    std::cout << "# n_samples/l. epoch : " << n_samples_per_learning_epoch << std::endl;
    std::cout << "# n_inference_epoch : " << n_inference_epoch << std::endl;
    std::cout << "# stepsize : " << stepsize << std::endl;
    std::cout << "# decay : " << decay << std::endl;
    std::cout << "# regularization : " << reg_param << std::endl;
    std::cout << "# l1 regularization : " << reg1_param << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << "# IGNORE -s (n_samples/l. epoch). ALWAYS -s 1. #" << std::endl;
    std::cout << "# IGNORE -t (threads). ALWAYS USE ALL THREADS. #" << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << "# nvar : " << meta.num_variables << std::endl;
    std::cout << "# nfac : " << meta.num_factors << std::endl;
    std::cout << "# nweight : " << meta.num_weights << std::endl;
    std::cout << "# nedge : " << meta.num_edges << std::endl;
    std::cout << "################################################" << std::endl;
  }

  // run on NUMA node 0
  numa_run_on_node(0);
  numa_set_localalloc();

  // load factor graph
  dd::FactorGraph fg(meta.num_variables, meta.num_factors, meta.num_weights, meta.num_edges);
  fg.load(cmd_parser, is_quiet);

  dd::GibbsSampling gibbs(&fg, &cmd_parser, n_datacopy, sample_evidence, burn_in,
                          learn_non_evidence);

  // number of learning epochs
  // the factor graph is copied on each NUMA node, so the total epochs =
  // epochs specified / number of NUMA nodes
  int numa_aware_n_learning_epoch = (int)(n_learning_epoch / n_numa_node) +
                                    (n_learning_epoch % n_numa_node == 0 ? 0 : 1);

  // learning
  /*gibbs.learn(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
                stepsize, decay, reg_param, reg1_param, meta_file, is_quiet);*/
  gibbs.learn(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
              stepsize, decay, reg_param, reg1_param, is_quiet);

  // dump weights
  gibbs.dump_weights(is_quiet);

  // number of inference epochs
  int numa_aware_n_epoch = (int)(n_inference_epoch / n_numa_node) +
                           (n_inference_epoch % n_numa_node == 0 ? 0 : 1);

  // inference
  gibbs.inference(numa_aware_n_epoch, is_quiet);
  gibbs.aggregate_results_and_dump(is_quiet);

  // print weights from the inference result
  for (long t = 0; t < fg.n_weight; t++) {
    std::cout << fg.infrs->weight_values[t] << std::endl;
  }

  // print weights from the factor graph
  std::cout << "PRINTING WEIGHTS FROM WEIGHT VARIABLE" << std::endl;
  for (long t = 0; t < fg.n_weight; t++) {
    std::cout << fg.weights[t].weight << std::endl;
  }
}
void em(dd::CmdParser & cmd_parser) {
  // number of NUMA nodes
  int n_numa_node = numa_max_node() + 1;
  // number of max threads per NUMA node
  int n_thread_per_numa = (sysconf(_SC_NPROCESSORS_CONF)) / (n_numa_node);

  // get command line arguments
  std::string fg_file = cmd_parser.fg_file->getValue();
  std::string weight_file = cmd_parser.weight_file->getValue();
  std::string variable_file = cmd_parser.variable_file->getValue();
  std::string factor_file = cmd_parser.factor_file->getValue();
  std::string edge_file = cmd_parser.edge_file->getValue();
  std::string meta_file = cmd_parser.meta_file->getValue();
  std::string output_folder = cmd_parser.output_folder->getValue();

  int n_learning_epoch = cmd_parser.n_learning_epoch->getValue();
  int n_samples_per_learning_epoch = cmd_parser.n_samples_per_learning_epoch->getValue();
  int n_inference_epoch = cmd_parser.n_inference_epoch->getValue();

  double stepsize = cmd_parser.stepsize->getValue();
  double stepsize2 = cmd_parser.stepsize2->getValue();
  // hack to support two parameters for specifying the step size
  if (stepsize == 0.01) stepsize = stepsize2;

  double decay = cmd_parser.decay->getValue();
  int n_datacopy = cmd_parser.n_datacopy->getValue();
  double reg_param = cmd_parser.reg_param->getValue();
  double reg1_param = cmd_parser.reg1_param->getValue();
  bool is_quiet = cmd_parser.quiet->getValue();
  bool check_convergence = cmd_parser.check_convergence->getValue();
  bool sample_evidence = cmd_parser.sample_evidence->getValue();
  int burn_in = cmd_parser.burn_in->getValue();
  int n_iter = cmd_parser.n_iter->getValue();
  int wl_conv = cmd_parser.wl_conv->getValue();
  int delta = cmd_parser.delta->getValue();
  bool learn_non_evidence = cmd_parser.learn_non_evidence->getValue();

  Meta meta = read_meta(fg_file);

  if (is_quiet) {
    std::cout << "Running in quiet mode..." << std::endl;
  } else {
    std::cout << std::endl;
    std::cout << "#################MACHINE CONFIG#################" << std::endl;
    std::cout << "# # NUMA Node : " << n_numa_node << std::endl;
    std::cout << "# # Thread/NUMA Node : " << n_thread_per_numa << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << std::endl;
    std::cout << "#################GIBBS SAMPLING#################" << std::endl;
    std::cout << "# fg_file : " << fg_file << std::endl;
    std::cout << "# edge_file : " << edge_file << std::endl;
    std::cout << "# weight_file : " << weight_file << std::endl;
    std::cout << "# variable_file : " << variable_file << std::endl;
    std::cout << "# factor_file : " << factor_file << std::endl;
    std::cout << "# meta_file : " << meta_file << std::endl;
    std::cout << "# output_folder : " << output_folder << std::endl;
    std::cout << "# n_learning_epoch : " << n_learning_epoch << std::endl;
    std::cout << "# n_samples/l. epoch : " << n_samples_per_learning_epoch << std::endl;
    std::cout << "# n_inference_epoch : " << n_inference_epoch << std::endl;
    std::cout << "# stepsize : " << stepsize << std::endl;
    std::cout << "# decay : " << decay << std::endl;
    std::cout << "# regularization : " << reg_param << std::endl;
    std::cout << "# l1 regularization : " << reg1_param << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << "# IGNORE -s (n_samples/l. epoch). ALWAYS -s 1. #" << std::endl;
    std::cout << "# IGNORE -t (threads). ALWAYS USE ALL THREADS. #" << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << "# nvar : " << meta.num_variables << std::endl;
    std::cout << "# nfac : " << meta.num_factors << std::endl;
    std::cout << "# nweight : " << meta.num_weights << std::endl;
    std::cout << "# nedge : " << meta.num_edges << std::endl;
    std::cout << "################################################" << std::endl;
  }

  // run on NUMA node 0
  numa_run_on_node(0);
  numa_set_localalloc();

  // load factor graph
  dd::FactorGraph fg(meta.num_variables, meta.num_factors, meta.num_weights, meta.num_edges);
  fg.load(cmd_parser, is_quiet);

  dd::GibbsSampling gibbs(&fg, &cmd_parser, n_datacopy, sample_evidence, burn_in,
                          learn_non_evidence);

  // initialize EM instance
  dd::ExpMax expMax(&fg, &gibbs, wl_conv, delta, check_convergence);

  // number of inference and learning epochs per NUMA node
  int numa_aware_n_epoch;
  int numa_aware_n_learning_epoch;

  // EM init -- run maximization (semi-supervised learning)
  // Maximization step
  numa_aware_n_learning_epoch = (int)(n_learning_epoch / n_numa_node) +
                                (n_learning_epoch % n_numa_node == 0 ? 0 : 1);
  expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
                      stepsize, decay, reg_param, reg1_param, is_quiet);
  /*expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
                        stepsize, decay, reg_param, reg1_param, meta_file, is_quiet);*/

  while (!expMax.hasConverged && n_iter > 0) {
    // Expectation step
    numa_aware_n_epoch = (int)(n_inference_epoch / n_numa_node) +
                         (n_inference_epoch % n_numa_node == 0 ? 0 : 1);
    expMax.expectation(numa_aware_n_epoch, is_quiet);

    // Maximization step
    numa_aware_n_learning_epoch = (int)(n_learning_epoch / n_numa_node) +
                                  (n_learning_epoch % n_numa_node == 0 ? 0 : 1);
    /*expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
                          stepsize, decay, reg_param, reg1_param, meta_file, is_quiet);*/
    expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
                        stepsize, decay, reg_param, reg1_param, is_quiet);

    // decrement iteration counter
    n_iter--;
  }

  expMax.dump_weights(is_quiet);
  expMax.aggregate_results_and_dump(is_quiet);
}
/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    runOnNode
 * Signature: (I)V
 */
JNIEXPORT void JNICALL Java_xerial_jnuma_NumaNative_runOnNode
  (JNIEnv *env, jobject obj, jint node)
{
    numa_run_on_node((int) node);
}
void initQuda(int dev)
{
  static int initialized = 0;
  if (initialized) {
    return;
  }
  initialized = 1;

#if (CUDA_VERSION >= 4000) && defined(MULTI_GPU)
  // check that CUDA_NIC_INTEROP is set to 1 in the environment
  char* cni_str = getenv("CUDA_NIC_INTEROP");
  if (cni_str == NULL) {
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set\n");
  }
  int cni_int = atoi(cni_str);
  if (cni_int != 1) {
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1\n");
  }
#endif

  int deviceCount;
  cudaGetDeviceCount(&deviceCount);
  if (deviceCount == 0) {
    errorQuda("No devices supporting CUDA");
  }

  for (int i = 0; i < deviceCount; i++) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, i);
    printfQuda("QUDA: Found device %d: %s\n", i, deviceProp.name);
  }

#ifdef QMP_COMMS
  int ndim;
  const int *dim;

  if (QMP_is_initialized() != QMP_TRUE) {
    errorQuda("QMP is not initialized");
  }
  num_QMP = QMP_get_number_of_nodes();
  rank_QMP = QMP_get_node_number();

  dev += rank_QMP % deviceCount;
  ndim = QMP_get_logical_number_of_dimensions();
  dim = QMP_get_logical_dimensions();
#elif defined(MPI_COMMS)
  comm_init();
  dev = comm_gpuid();
#else
  if (dev < 0) dev = deviceCount - 1;
#endif

  // used for applying the gauge field boundary condition
  if (commCoords(3) == 0) qudaPt0 = true;
  else qudaPt0 = false;

  if (commCoords(3) == commDim(3) - 1) qudaPtNm1 = true;
  else qudaPtNm1 = false;

  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, dev);
  if (deviceProp.major < 1) {
    errorQuda("Device %d does not support CUDA", dev);
  }

  printfQuda("QUDA: Using device %d: %s\n", dev, deviceProp.name);
  cudaSetDevice(dev);

#ifdef HAVE_NUMA
  if (numa_config_set) {
    if (gpu_affinity[dev] >= 0) {
      printfQuda("Numa setting to cpu node %d\n", gpu_affinity[dev]);
      if (numa_run_on_node(gpu_affinity[dev]) != 0) {
        printfQuda("Warning: Setting numa to cpu node %d failed\n", gpu_affinity[dev]);
      }
    }
  }
#endif

  initCache();
  quda::initBlas();
}
/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    runOnNode
 * Signature: (I)V
 */
JNIEXPORT void JNICALL Java_xerial_jnuma_NumaNative_runOnNode
  (JNIEnv *env, jobject obj, jint node)
{
    int ret = numa_run_on_node((int) node);
    if (ret != 0)
        throwException(env, obj, errno);
}
void TaskManager :: Loop(int thd)
{
  /*
  static Timer tADD("add entry counter");
  static Timer tCASready1("spin-CAS ready tick1");
  static Timer tCASready2("spin-CAS ready tick2");
  static Timer tCASyield("spin-CAS yield");
  static Timer tCAS1("spin-CAS wait");
  static Timer texit("exit zone");
  static Timer tdec("decrement");
  */
  thread_id = thd;

  int thds = GetNumThreads();
  int mynode = num_nodes * thd/thds;

  NodeData & mynode_data = *(nodedata[mynode]);

  TaskInfo ti;
  ti.nthreads = thds;
  ti.thread_nr = thd;
  // ti.nnodes = num_nodes;
  // ti.node_nr = mynode;

#ifdef USE_NUMA
  numa_run_on_node (mynode);
#endif

  active_workers++;
  workers_on_node[mynode]++;
  int jobdone = 0;

#ifdef USE_MKL
  auto mkl_max = mkl_get_max_threads();
  mkl_set_num_threads_local(1);
#endif

  while (!done)
    {
      if (complete[mynode] > jobdone)
        jobdone = complete[mynode];

      if (jobnr == jobdone)
        {
          // RegionTracer t(ti.thread_nr, tCASyield, ti.task_nr);
          if (sleep)
            this_thread::sleep_for(chrono::microseconds(sleep_usecs));
          else
            {
#ifdef WIN32
              this_thread::yield();
#else  // WIN32
              sched_yield();
#endif // WIN32
            }
          continue;
        }

      {
        // RegionTracer t(ti.thread_nr, tADD, ti.task_nr);

        // non-atomic fast check ...
        if ( (mynode_data.participate & 1) == 0) continue;

        int oldval = mynode_data.participate += 2;
        if ( (oldval & 1) == 0)
          { // job not active, going out again
            mynode_data.participate -= 2;
            continue;
          }
      }

      if (startup_function) (*startup_function)();

      IntRange mytasks = Range(int(ntasks)).Split (mynode, num_nodes);

      try
        {
          while (1)
            {
              if (mynode_data.start_cnt >= mytasks.Size()) break;
              int mytask = mynode_data.start_cnt.fetch_add(1, memory_order_relaxed);
              if (mytask >= mytasks.Size()) break;

              ti.task_nr = mytasks.First() + mytask;
              ti.ntasks = ntasks;

              {
                RegionTracer t(ti.thread_nr, jobnr, RegionTracer::ID_JOB, ti.task_nr);
                (*func)(ti);
              }
            }
        }
      catch (Exception e)
        {
          {
            // cout << "got exception in TM" << endl;
            lock_guard<mutex> guard(copyex_mutex);
            delete ex;
            ex = new Exception (e);
            mynode_data.start_cnt = mytasks.Size();
          }
        }

#ifndef __MIC__
      atomic_thread_fence (memory_order_release);
#endif // __MIC__

      if (cleanup_function) (*cleanup_function)();

      jobdone = jobnr;
      mynode_data.participate -= 2;

      {
        int oldpart = 1;
        if (mynode_data.participate.compare_exchange_strong (oldpart, 0))
          {
            if (jobdone < jobnr.load())
              {
                // reopen gate
                mynode_data.participate |= 1;
              }
            else
              {
                if (mynode != 0)
                  mynode_data.start_cnt = 0;
                complete[mynode] = jobnr.load();
              }
          }
      }
    }

#ifdef USE_MKL
  mkl_set_num_threads_local(mkl_max);
#endif

  workers_on_node[mynode]--;
  active_workers--;
}
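// As a quick sanity check after calls like the ones above, the NUMA node a
// thread actually runs on can be read back with sched_getcpu() and
// numa_node_of_cpu(). This is an illustrative sketch, not part of any project
// in this listing (sched_getcpu() is a glibc extension requiring _GNU_SOURCE;
// build with -lnuma).
#define _GNU_SOURCE
#include <numa.h>
#include <sched.h>
#include <stdio.h>

static void report_current_node(void)
{
    int cpu = sched_getcpu();          /* CPU the calling thread is on right now */
    int node = numa_node_of_cpu(cpu);  /* NUMA node that CPU belongs to */

    printf("running on cpu %d, NUMA node %d (highest node: %d)\n",
           cpu, node, numa_max_node());
}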