/**
 * Constructs a Gibbs sampler over the given factor graph.
 *
 * Detects the available NUMA nodes, optionally clamps the copy count to the
 * user-requested n_datacopy, computes the per-node thread budget, and then
 * replicates the factor graph once per NUMA node, binding the current thread
 * to each node before copying so the replica is allocated node-locally.
 *
 * @param _p_fg              factor graph to sample (replicated per NUMA node)
 * @param _p_cmd_parser      parsed command-line options
 * @param n_datacopy         requested number of graph copies; values outside
 *                           [1, numa_max_node()+1] fall back to one copy per node
 * @param sample_evidence    whether evidence variables are also sampled
 * @param burn_in            number of burn-in epochs before collecting samples
 * @param learn_non_evidence whether learning updates use non-evidence variables
 */
dd::GibbsSampling::GibbsSampling(FactorGraph * const _p_fg,
                                 CmdParser * const _p_cmd_parser,
                                 int n_datacopy, bool sample_evidence,
                                 int burn_in, bool learn_non_evidence)
  : p_fg(_p_fg), p_cmd_parser(_p_cmd_parser),
    sample_evidence(sample_evidence), burn_in(burn_in),
    learn_non_evidence(learn_non_evidence) {
  // the highest node number available (0-based, so node count is this + 1)
  n_numa_nodes = numa_max_node();
  // if n_datacopy is valid, use it, otherwise, use numa_max_node
  if (n_datacopy >= 1 && n_datacopy <= n_numa_nodes + 1) {
    n_numa_nodes = n_datacopy - 1;
  }
  // max possible threads per NUMA node
  n_thread_per_numa = (sysconf(_SC_NPROCESSORS_CONF)) / (n_numa_nodes + 1);

  // copy 0 is the caller's graph, stored wherever the caller allocated it
  this->factorgraphs.push_back(*p_fg);

  // replicate the factor graph on each remaining NUMA node; numa_run_on_node
  // + numa_set_localalloc make the subsequent allocations node-local
  for (int i = 1; i <= n_numa_nodes; i++) {
    numa_run_on_node(i);
    numa_set_localalloc();
    std::cout << "CREATE FG ON NODE ..." << i << std::endl;
    dd::FactorGraph fg(p_fg->n_var, p_fg->n_factor, p_fg->n_weight,
                       p_fg->n_edge);
    fg.copy_from(p_fg);
    this->factorgraphs.push_back(fg);
  }
}
/* Bind the calling worker thread to its shepherd's NUMA node and make all
 * subsequent memory allocations node-local.  Aborts the process if the
 * affinity call fails, since running on the wrong node would silently
 * defeat the NUMA placement scheme. */
void INTERNAL qt_affinity_set(qthread_worker_t *me, unsigned int Q_UNUSED(nw))
{   /*{{{ */
    assert(me);

    qthread_shepherd_t *const shep = me->shepherd;

    /* It would be nice if we could do something more specific than
     * "numa_run_on_node", but because sched_etaffinity() is so dangerous, we
     * really can't, in good conscience. */
    qthread_debug(AFFINITY_FUNCTIONS,
                  "calling numa_run_on_node(%i) for worker %i\n",
                  shep->node, me->packed_worker_id);

    const int rc = numa_run_on_node(shep->node);
    if (rc != 0) {
        numa_error("setting thread affinity");
        abort();
    }

    /* future allocations come from this node's memory */
    numa_set_localalloc();
} /*}}} */
/* static */ void ThreadPool::setThreadNodeAffinity(int numaNode)
{
    // Pin the calling thread to the given NUMA node.  Three build paths:
    // Win7+ processor groups, libnuma, or a no-op stub.
#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7
    GROUP_AFFINITY groupAffinity;
    // Translate the node id into a processor-group affinity mask, then pin
    // the current thread to that mask; fall through to the error log if
    // either API call fails.
    if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
    {
        if (SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)groupAffinity.Mask))
            return;
    }
    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
#elif HAVE_LIBNUMA
    // numa_available() < 0 means the kernel has no NUMA support; only then
    // do we report failure.
    if (numa_available() >= 0)
    {
        numa_run_on_node(numaNode);
        // prefer (not force) allocations from this node, and make future
        // allocations local to wherever the thread runs
        numa_set_preferred(numaNode);
        numa_set_localalloc();
        return;
    }
    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
#else
    // no NUMA support compiled in; silence the unused-parameter warning
    (void)numaNode;
#endif
}
/**
 * Runs the full Gibbs-sampling pipeline: reads options, loads the factor
 * graph, performs NUMA-aware weight learning, dumps the learned weights,
 * runs marginal inference, dumps the results, and finally prints the weight
 * values to stdout.
 *
 * @param cmd_parser  parsed command-line options (file paths, epoch counts,
 *                    step sizes, regularization parameters, flags)
 */
void gibbs(dd::CmdParser & cmd_parser){
  // number of NUMA nodes
  int n_numa_node = numa_max_node() + 1;
  // number of max threads per NUMA node
  int n_thread_per_numa = (sysconf(_SC_NPROCESSORS_CONF))/(n_numa_node);

  // get command line arguments
  std::string fg_file = cmd_parser.fg_file->getValue();
  std::string weight_file = cmd_parser.weight_file->getValue();
  std::string variable_file = cmd_parser.variable_file->getValue();
  std::string factor_file = cmd_parser.factor_file->getValue();
  std::string edge_file = cmd_parser.edge_file->getValue();
  std::string meta_file = cmd_parser.meta_file->getValue();
  std::string output_folder = cmd_parser.output_folder->getValue();
  int n_learning_epoch = cmd_parser.n_learning_epoch->getValue();
  int n_samples_per_learning_epoch = cmd_parser.n_samples_per_learning_epoch->getValue();
  int n_inference_epoch = cmd_parser.n_inference_epoch->getValue();
  double stepsize = cmd_parser.stepsize->getValue();
  double stepsize2 = cmd_parser.stepsize2->getValue();
  // hack to support two parameters to specify step size: 0.01 is the default,
  // so if -a was left at its default the alternate flag's value wins
  if (stepsize == 0.01) stepsize = stepsize2;
  double decay = cmd_parser.decay->getValue();
  int n_datacopy = cmd_parser.n_datacopy->getValue();
  double reg_param = cmd_parser.reg_param->getValue();
  double reg1_param = cmd_parser.reg1_param->getValue();
  bool is_quiet = cmd_parser.quiet->getValue();
  bool sample_evidence = cmd_parser.sample_evidence->getValue();
  int burn_in = cmd_parser.burn_in->getValue();
  bool learn_non_evidence = cmd_parser.learn_non_evidence->getValue();

  // graph dimensions (nvar/nfac/nweight/nedge) come from the meta file
  Meta meta = read_meta(fg_file);

  if (is_quiet) {
    std::cout << "Running in quiet mode..." << std::endl;
  } else {
    // banner with machine configuration and all effective options
    std::cout << std::endl;
    std::cout << "#################MACHINE CONFIG#################" << std::endl;
    std::cout << "# # NUMA Node : " << n_numa_node << std::endl;
    std::cout << "# # Thread/NUMA Node : " << n_thread_per_numa << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << std::endl;
    std::cout << "#################GIBBS SAMPLING#################" << std::endl;
    std::cout << "# fg_file : " << fg_file << std::endl;
    std::cout << "# edge_file : " << edge_file << std::endl;
    std::cout << "# weight_file : " << weight_file << std::endl;
    std::cout << "# variable_file : " << variable_file << std::endl;
    std::cout << "# factor_file : " << factor_file << std::endl;
    std::cout << "# meta_file : " << meta_file << std::endl;
    std::cout << "# output_folder : " << output_folder << std::endl;
    std::cout << "# n_learning_epoch : " << n_learning_epoch << std::endl;
    std::cout << "# n_samples/l. epoch : " << n_samples_per_learning_epoch << std::endl;
    std::cout << "# n_inference_epoch : " << n_inference_epoch << std::endl;
    std::cout << "# stepsize : " << stepsize << std::endl;
    std::cout << "# decay : " << decay << std::endl;
    std::cout << "# regularization : " << reg_param << std::endl;
    std::cout << "# l1 regularization : " << reg1_param << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << "# IGNORE -s (n_samples/l. epoch). ALWAYS -s 1. #" << std::endl;
    std::cout << "# IGNORE -t (threads). ALWAYS USE ALL THREADS. #" << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << "# nvar : " << meta.num_variables << std::endl;
    std::cout << "# nfac : " << meta.num_factors << std::endl;
    std::cout << "# nweight : " << meta.num_weights << std::endl;
    std::cout << "# nedge : " << meta.num_edges << std::endl;
    std::cout << "################################################" << std::endl;
  }

  // run on NUMA node 0 and allocate the master graph from node-0 memory
  numa_run_on_node(0);
  numa_set_localalloc();

  // load factor graph
  dd::FactorGraph fg(meta.num_variables, meta.num_factors, meta.num_weights, meta.num_edges);
  fg.load(cmd_parser, is_quiet);
  // GibbsSampling replicates fg per NUMA node (see its constructor)
  dd::GibbsSampling gibbs(&fg, &cmd_parser, n_datacopy, sample_evidence, burn_in, learn_non_evidence);

  // number of learning epochs
  // the factor graph is copied on each NUMA node, so the total epochs =
  // epochs specified / number of NUMA nodes (rounded up)
  int numa_aware_n_learning_epoch = (int)(n_learning_epoch/n_numa_node) + (n_learning_epoch%n_numa_node==0?0:1);
  // learning
  /*gibbs.learn(numa_aware_n_learning_epoch, n_samples_per_learning_epoch, stepsize, decay, reg_param, reg1_param, meta_file, is_quiet);*/
  gibbs.learn(numa_aware_n_learning_epoch, n_samples_per_learning_epoch, stepsize, decay, reg_param, reg1_param, is_quiet);

  // dump weights
  gibbs.dump_weights(is_quiet);

  // number of inference epochs (same per-node scaling as for learning)
  int numa_aware_n_epoch = (int)(n_inference_epoch/n_numa_node) + (n_inference_epoch%n_numa_node==0?0:1);
  // inference
  gibbs.inference(numa_aware_n_epoch, is_quiet);
  gibbs.aggregate_results_and_dump(is_quiet);

  // print weights from inference result
  for(long t=0;t<fg.n_weight;t++) {
    std::cout<<fg.infrs->weight_values[t]<<std::endl;
  }
  // print weights from factor graph
  std::cout<<"PRINTING WEIGHTS FROM WEIGHT VARIABLE"<<std::endl;
  for(long t=0;t<fg.n_weight;t++) {
    std::cout<<fg.weights[t].weight<<std::endl;
  }
}
/**
 * Runs expectation-maximization on top of the Gibbs sampler: after an
 * initial maximization (semi-supervised learning) step, alternates
 * expectation (inference) and maximization (learning) steps until the
 * ExpMax instance reports convergence or n_iter iterations have elapsed,
 * then dumps the weights and aggregated results.
 *
 * @param cmd_parser  parsed command-line options (file paths, epoch counts,
 *                    step sizes, regularization, EM convergence parameters)
 */
void em(dd::CmdParser & cmd_parser){
  // number of NUMA nodes
  int n_numa_node = numa_max_node() + 1;
  // number of max threads per NUMA node
  int n_thread_per_numa = (sysconf(_SC_NPROCESSORS_CONF))/(n_numa_node);

  // get command line arguments
  std::string fg_file = cmd_parser.fg_file->getValue();
  std::string weight_file = cmd_parser.weight_file->getValue();
  std::string variable_file = cmd_parser.variable_file->getValue();
  std::string factor_file = cmd_parser.factor_file->getValue();
  std::string edge_file = cmd_parser.edge_file->getValue();
  std::string meta_file = cmd_parser.meta_file->getValue();
  std::string output_folder = cmd_parser.output_folder->getValue();
  int n_learning_epoch = cmd_parser.n_learning_epoch->getValue();
  int n_samples_per_learning_epoch = cmd_parser.n_samples_per_learning_epoch->getValue();
  int n_inference_epoch = cmd_parser.n_inference_epoch->getValue();
  double stepsize = cmd_parser.stepsize->getValue();
  double stepsize2 = cmd_parser.stepsize2->getValue();
  // hack to support two parameters to specify step size: 0.01 is the default,
  // so if -a was left at its default the alternate flag's value wins
  if (stepsize == 0.01) stepsize = stepsize2;
  double decay = cmd_parser.decay->getValue();
  int n_datacopy = cmd_parser.n_datacopy->getValue();
  double reg_param = cmd_parser.reg_param->getValue();
  double reg1_param = cmd_parser.reg1_param->getValue();
  bool is_quiet = cmd_parser.quiet->getValue();
  // EM-specific options: convergence checking, iteration cap, and the
  // window/delta parameters handed to ExpMax
  bool check_convergence = cmd_parser.check_convergence->getValue();
  bool sample_evidence = cmd_parser.sample_evidence->getValue();
  int burn_in = cmd_parser.burn_in->getValue();
  int n_iter = cmd_parser.n_iter->getValue();
  int wl_conv = cmd_parser.wl_conv->getValue();
  int delta = cmd_parser.delta->getValue();
  bool learn_non_evidence = cmd_parser.learn_non_evidence->getValue();

  // graph dimensions (nvar/nfac/nweight/nedge) come from the meta file
  Meta meta = read_meta(fg_file);

  if (is_quiet) {
    std::cout << "Running in quiet mode..." << std::endl;
  } else {
    // banner with machine configuration and all effective options
    std::cout << std::endl;
    std::cout << "#################MACHINE CONFIG#################" << std::endl;
    std::cout << "# # NUMA Node : " << n_numa_node << std::endl;
    std::cout << "# # Thread/NUMA Node : " << n_thread_per_numa << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << std::endl;
    std::cout << "#################GIBBS SAMPLING#################" << std::endl;
    std::cout << "# fg_file : " << fg_file << std::endl;
    std::cout << "# edge_file : " << edge_file << std::endl;
    std::cout << "# weight_file : " << weight_file << std::endl;
    std::cout << "# variable_file : " << variable_file << std::endl;
    std::cout << "# factor_file : " << factor_file << std::endl;
    std::cout << "# meta_file : " << meta_file << std::endl;
    std::cout << "# output_folder : " << output_folder << std::endl;
    std::cout << "# n_learning_epoch : " << n_learning_epoch << std::endl;
    std::cout << "# n_samples/l. epoch : " << n_samples_per_learning_epoch << std::endl;
    std::cout << "# n_inference_epoch : " << n_inference_epoch << std::endl;
    std::cout << "# stepsize : " << stepsize << std::endl;
    std::cout << "# decay : " << decay << std::endl;
    std::cout << "# regularization : " << reg_param << std::endl;
    std::cout << "# l1 regularization : " << reg1_param << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << "# IGNORE -s (n_samples/l. epoch). ALWAYS -s 1. #" << std::endl;
    std::cout << "# IGNORE -t (threads). ALWAYS USE ALL THREADS. #" << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << "# nvar : " << meta.num_variables << std::endl;
    std::cout << "# nfac : " << meta.num_factors << std::endl;
    std::cout << "# nweight : " << meta.num_weights << std::endl;
    std::cout << "# nedge : " << meta.num_edges << std::endl;
    std::cout << "################################################" << std::endl;
  }

  // run on NUMA node 0 and allocate the master graph from node-0 memory
  numa_run_on_node(0);
  numa_set_localalloc();

  // load factor graph
  dd::FactorGraph fg(meta.num_variables, meta.num_factors, meta.num_weights, meta.num_edges);
  fg.load(cmd_parser, is_quiet);
  dd::GibbsSampling gibbs(&fg, &cmd_parser, n_datacopy, sample_evidence, burn_in, learn_non_evidence);

  // Initialize EM instance
  dd::ExpMax expMax(&fg, &gibbs, wl_conv, delta, check_convergence);

  // number of inference epochs
  int numa_aware_n_epoch;
  int numa_aware_n_learning_epoch;

  // EM init -- run Maximization (semi-supervised learning)
  // Maximization step; epochs are divided across NUMA nodes (rounded up)
  numa_aware_n_learning_epoch = (int)(n_learning_epoch/n_numa_node) + (n_learning_epoch%n_numa_node==0?0:1);
  expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch, stepsize, decay, reg_param, reg1_param, is_quiet);
  /*expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch, stepsize, decay, reg_param, reg1_param, meta_file, is_quiet);*/

  // alternate E and M steps until ExpMax reports convergence or the
  // iteration budget runs out
  while (!expMax.hasConverged && n_iter > 0) {
    // Expectation step
    numa_aware_n_epoch = (int)(n_inference_epoch/n_numa_node) + (n_inference_epoch%n_numa_node==0?0:1);
    expMax.expectation(numa_aware_n_epoch,is_quiet);

    // Maximization step
    numa_aware_n_learning_epoch = (int)(n_learning_epoch/n_numa_node) + (n_learning_epoch%n_numa_node==0?0:1);
    /*expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch, stepsize, decay, reg_param, reg1_param, meta_file, is_quiet);*/
    expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch, stepsize, decay, reg_param, reg1_param, is_quiet);

    // Decrement iteration counter
    n_iter--;
  }

  expMax.dump_weights(is_quiet);
  expMax.aggregate_results_and_dump(is_quiet);
}
/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    setLocalAlloc
 * Signature: ()V
 *
 * JNI bridge: switches the calling thread's NUMA allocation policy to
 * local allocation (numa_set_localalloc).  The env/obj parameters are
 * required by the JNI calling convention but unused here.
 */
JNIEXPORT void JNICALL Java_xerial_jnuma_NumaNative_setLocalAlloc
  (JNIEnv *env, jobject obj) {
    numa_set_localalloc();
}
/**
   Closed loop simulation main loop. It calls init_simu() to initialize the
   simulation struct. Then calls genatm() to generate atmospheric turbulence
   screens. Then for every time step, it calls perfevl() to evaluate
   performance, wfsgrad() to do wfs measurement, reconstruct() to do tomography
   and DM fit, filter() to do DM command filtering. In MOAO mode, it call calls
   moao_recon() for MOAO DM fitting.
   \callgraph
*/
void maos_sim(){
    const PARMS_T *parms=global->parms;
    POWFS_T *powfs=global->powfs;
    RECON_T *recon=global->recon;
    APER_T *aper=global->aper;
    int simend=parms->sim.end;
    int simstart=parms->sim.start;
    if(parms->sim.skysim){
        /* sky-coverage pre-simulation mode: save inputs for skyc */
        save_skyc(powfs,recon,parms);
    }
    if(parms->evl.psfmean || parms->evl.psfhist){
        /*compute diffraction limited PSF. Save to output directory.*/
        /* a zero OPD through the aperture yields the DL PSF per wavelength */
        dmat *iopdevl=dnew(aper->locs->nloc,1);
        ccell *psf2s=0;
        locfft_psf(&psf2s, aper->embed, iopdevl, parms->evl.psfsize, 0);
        const int nwvl=parms->evl.nwvl;
        dcell *evlpsfdl=dcellnew(nwvl,1);
        for(int iwvl=0; iwvl<nwvl; iwvl++){
            /* |PSF|^2 accumulated into the per-wavelength output cell */
            cabs22d(&evlpsfdl->p[iwvl], 1, psf2s->p[iwvl], 1);
            evlpsfdl->p[iwvl]->header=evl_header(parms, aper, -1, iwvl);
        }
        ccellfree(psf2s);
        writebin(evlpsfdl, "evlpsfdl.fits");
        dcellfree(evlpsfdl);
        dfree(iopdevl);
    }
    info2("PARALLEL=%d\n", PARALLEL);
    if(simstart>=simend) return;
    /* running totals for the averaged residual printed at the end */
    double restot=0; long rescount=0;
    for(int iseed=0; iseed<parms->sim.nseed; iseed++){
        SIM_T *simu=0;
        /* maos_iseed returning NULL skips this seed (e.g. already done) */
        while(!(simu=maos_iseed(iseed))){
            iseed++;
        }
#ifdef HAVE_NUMA_H
        /* allocate simulation data from the local NUMA node */
        numa_set_localalloc();
#endif
        for(int isim=simstart; isim<simend; isim++){
            maos_isim(isim);
            if(parms->sim.pause){
                mypause();
            }
        }/*isim */
        {
            /*Compute average performance*/
            /* skip the initial transient: first ~10% (min 50) of steps in
               closed loop, capped at 20 for short runs; 0 in open loop */
            int isim0;
            if(parms->sim.closeloop){
                if(parms->sim.end>100){
                    isim0=MAX(50,parms->sim.end/10);
                }else{
                    isim0=MIN(20, parms->sim.end/2);
                }
            }else{
                isim0=0;
            }
            double sum=0;
            for(int i=isim0; i<parms->sim.end; i++){
                /* first mode of the closed-loop error for each step */
                sum+=simu->cle->p[i*parms->evl.nmod];
            }
            restot+=sum/(parms->sim.end-isim0);
            rescount++;
        }
        free_simu(simu); global->simu=0;
    }/*seed */
    /* RMS over seeds, converted to nm (factor 1e9) */
    printf("%g\n", sqrt(restot/rescount)*1e9);
}
/*
 * numactl entry point: parses policy/affinity options, applying each one
 * immediately as it is seen (order on the command line matters).  Options
 * either act on an attached shared-memory segment (shmfd >= 0) or set the
 * policy for the process, which then exec()s the remaining arguments.
 */
int main(int ac, char **av)
{
	int c, i, nnodes=0;
	long node=-1;
	char *end;
	char shortopts[array_len(opts)*2 + 1];
	struct bitmask *mask = NULL;

	get_short_opts(opts,shortopts);
	while ((c = getopt_long(ac, av, shortopts, opts, NULL)) != -1) {
		switch (c) {
		case 's': /* --show */
			show();
			exit(0);
		case 'H': /* --hardware */
			nopolicy();
			hardware();
			exit(0);
		case 'i': /* --interleave */
			checknuma();
			mask = numactl_parse_nodestring(optarg);
			if (!mask) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			/* errno cleared so checkerror() only sees this call's failure */
			errno = 0;
			setpolicy(MPOL_INTERLEAVE);
			if (shmfd >= 0)
				numa_interleave_memory(shmptr, shmlen, mask);
			else
				numa_set_interleave_mask(mask);
			checkerror("setting interleave mask");
			break;
		case 'N': /* --cpunodebind */
		case 'c': /* --cpubind */
			dontshm("-c/--cpubind/--cpunodebind");
			checknuma();
			mask = numactl_parse_nodestring(optarg);
			if (!mask) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			errno = 0;
			check_cpubind(do_shm);
			did_cpubind = 1;
			numa_run_on_node_mask(mask);
			checkerror("sched_setaffinity");
			break;
		case 'C': /* --physcpubind */
		{
			struct bitmask *cpubuf;
			dontshm("-C/--physcpubind");
			cpubuf = numa_parse_cpustring(optarg);
			if (!cpubuf) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			errno = 0;
			check_cpubind(do_shm);
			did_cpubind = 1;
			numa_sched_setaffinity(0, cpubuf);
			checkerror("sched_setaffinity");
			free(cpubuf);
			break;
		}
		case 'm': /* --membind */
			checknuma();
			setpolicy(MPOL_BIND);
			mask = numactl_parse_nodestring(optarg);
			if (!mask) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			errno = 0;
			/* strict binding only for the duration of this call */
			numa_set_bind_policy(1);
			if (shmfd >= 0) {
				numa_tonodemask_memory(shmptr, shmlen, mask);
			} else {
				numa_set_membind(mask);
			}
			numa_set_bind_policy(0);
			checkerror("setting membind");
			break;
		case 'p': /* --preferred */
			checknuma();
			setpolicy(MPOL_PREFERRED);
			mask = numactl_parse_nodestring(optarg);
			if (!mask) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			/* --preferred takes exactly one node; count the set bits */
			for (i=0; i<mask->size; i++) {
				if (numa_bitmask_isbitset(mask, i)) {
					node = i;
					nnodes++;
				}
			}
			if (nnodes != 1)
				usage();
			numa_bitmask_free(mask);
			errno = 0;
			numa_set_bind_policy(0);
			if (shmfd >= 0)
				numa_tonode_memory(shmptr, shmlen, node);
			else
				numa_set_preferred(node);
			checkerror("setting preferred node");
			break;
		case 'l': /* --local */
			checknuma();
			setpolicy(MPOL_DEFAULT);
			errno = 0;
			if (shmfd >= 0)
				numa_setlocal_memory(shmptr, shmlen);
			else
				numa_set_localalloc();
			checkerror("local allocation");
			break;
		case 'S': /* --shm */
			check_cpubind(did_cpubind);
			nopolicy();
			attach_sysvshm(optarg, "--shm");
			shmattached = 1;
			break;
		case 'f': /* --file */
			check_cpubind(did_cpubind);
			nopolicy();
			attach_shared(optarg, "--file");
			shmattached = 1;
			break;
		case 'L': /* --length */
			noshm("--length");
			shmlen = memsize(optarg);
			break;
		case 'M': /* --shmmode */
			noshm("--shmmode");
			/* octal permission bits, per shmget(2) convention */
			shmmode = strtoul(optarg, &end, 8);
			if (end == optarg || *end)
				usage();
			break;
		case 'd': /* --dump */
			if (shmfd < 0)
				complain(
				"Cannot do --dump without shared memory.\n");
			dump_shm();
			do_dump = 1;
			break;
		case 'D': /* --dump-nodes */
			if (shmfd < 0)
				complain(
				"Cannot do --dump-nodes without shared memory.\n");
			dump_shm_nodes();
			do_dump = 1;
			break;
		case 't': /* --strict */
			did_strict = 1;
			numa_set_strict(1);
			break;
		case 'I': /* --shmid */
			shmid = strtoul(optarg, &end, 0);
			if (end == optarg || *end)
				usage();
			break;
		case 'u': /* --huge */
			noshm("--huge");
			shmflags |= SHM_HUGETLB;
			break;
		case 'o': /* --offset */
			noshm("--offset");
			shmoffset = memsize(optarg);
			break;
		case 'T': /* --touch */
			needshm("--touch");
			check_shmbeyond("--touch");
			/* fault in every page so the policy actually takes effect */
			numa_police_memory(shmptr, shmlen);
			break;
		case 'V': /* --verify */
			needshm("--verify");
			if (set_policy < 0)
				complain("Need a policy first to verify");
			check_shmbeyond("--verify");
			numa_police_memory(shmptr, shmlen);
			if (!mask)
				complain("Need a mask to verify");
			else
				verify_shm(set_policy, mask);
			break;
		default:
			usage();
		}
	}

	/* remaining arguments are the command to execute */
	av += optind;
	ac -= optind;

	/* shared-memory mode: all work was done in the option loop */
	if (shmfd >= 0) {
		if (*av)
			usage();
		exit(exitcode);
	}

	if (did_strict)
		fprintf(stderr,
			"numactl: warning. Strict flag for process ignored.\n");

	if (do_dump)
		usage_msg("cannot do --dump|--dump-shm for process");

	if (shmoption)
		usage_msg("shm related option %s for process", shmoption);

	if (*av == NULL)
		usage();
	/* replace this process with the target command under the chosen policy */
	execvp(*av, av);
	complain("execution of `%s': %s\n", av[0], strerror(errno));
	return 0; /* not reached */
}
int main(int argc, const char **argv) { int num_cpus = numa_num_task_cpus(); printf("num cpus: %d\n", num_cpus); printf("numa available: %d\n", numa_available()); numa_set_localalloc(); struct bitmask *bm = numa_bitmask_alloc(num_cpus); for (int i=0; i<=numa_max_node(); ++i) { numa_node_to_cpus(i, bm); printf("numa node %d ", i); print_bitmask(bm); printf(" - %g GiB\n", numa_node_size(i, 0) / (1024.*1024*1024.)); } numa_bitmask_free(bm); puts(""); char *x; const size_t cache_line_size = 64; const size_t array_size = 100*1000*1000; size_t ntrips = 2; #pragma omp parallel { assert(omp_get_num_threads() == num_cpus); int tid = omp_get_thread_num(); pin_to_core(tid); if(tid == 0) x = (char *) numa_alloc_local(array_size); // {{{ single access #pragma omp barrier for (size_t i = 0; i<num_cpus; ++i) { if (tid == i) { double t = measure_access(x, array_size, ntrips); printf("sequential core %d -> core 0 : BW %g MB/s\n", i, array_size*ntrips*cache_line_size / t / 1e6); } #pragma omp barrier } // }}} // {{{ everybody contends for one { if (tid == 0) puts(""); #pragma omp barrier double t = measure_access(x, array_size, ntrips); #pragma omp barrier for (size_t i = 0; i<num_cpus; ++i) { if (tid == i) printf("all-contention core %d -> core 0 : BW %g MB/s\n", tid, array_size*ntrips*cache_line_size / t / 1e6); #pragma omp barrier } } // }}} // {{{ zero and someone else contending if (tid == 0) puts(""); #pragma omp barrier for (size_t i = 1; i<num_cpus; ++i) { double t; if (tid == i || tid == 0) t = measure_access(x, array_size, ntrips); #pragma omp barrier if (tid == 0) { printf("two-contention core %d -> core 0 : BW %g MB/s\n", tid, array_size*ntrips*cache_line_size / t / 1e6); } #pragma omp barrier if (tid == i) { printf("two-contention core %d -> core 0 : BW %g MB/s\n\n", tid, array_size*ntrips*cache_line_size / t / 1e6); } #pragma omp barrier } } numa_free(x, array_size); return 0; }