Example #1
dd::GibbsSampling::GibbsSampling(FactorGraph * const _p_fg, 
  CmdParser * const _p_cmd_parser, int n_datacopy, bool sample_evidence,
  int burn_in, bool learn_non_evidence) 
  : p_fg(_p_fg), p_cmd_parser(_p_cmd_parser), sample_evidence(sample_evidence),
    burn_in(burn_in), learn_non_evidence(learn_non_evidence) {
    // the highest node number available
    n_numa_nodes = numa_max_node(); 

    // if n_datacopy is valid (1 to #NUMA nodes), use it; otherwise keep one copy per NUMA node
    if (n_datacopy >= 1 && n_datacopy <= n_numa_nodes + 1) {
      n_numa_nodes = n_datacopy - 1;
    }

    // max possible threads per NUMA node
    n_thread_per_numa = (sysconf(_SC_NPROCESSORS_CONF))/(n_numa_nodes+1);

    this->factorgraphs.push_back(*p_fg);

    // copy factor graphs
    for(int i=1;i<=n_numa_nodes;i++){

      numa_run_on_node(i);
      numa_set_localalloc();

      std::cout << "CREATE FG ON NODE ..." <<  i << std::endl;
      dd::FactorGraph fg(p_fg->n_var, p_fg->n_factor, p_fg->n_weight, p_fg->n_edge);
      
      fg.copy_from(p_fg);

      this->factorgraphs.push_back(fg);
    }
  };
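A minimal, standalone sketch (not part of the project above) of the pin-then-allocate pattern this constructor relies on: run the calling thread on a node with numa_run_on_node(), switch the allocation policy to node-local with numa_set_localalloc(), then allocate and touch the memory so the copy is actually placed on that node. Build with -lnuma; the sizes and output are illustrative assumptions.

#include <numa.h>
#include <cstdio>
#include <cstring>

int main() {
  if (numa_available() < 0) {            // libnuma unusable on this system
    std::fprintf(stderr, "NUMA not available\n");
    return 1;
  }
  const size_t bytes = 1 << 20;
  for (int node = 0; node <= numa_max_node(); ++node) {
    numa_run_on_node(node);              // pin this thread to the node's CPUs
    numa_set_localalloc();               // subsequent allocations prefer the local node
    void *copy = numa_alloc_local(bytes);
    std::memset(copy, 0, bytes);         // touch the pages so they are actually placed
    std::printf("placed %zu bytes on node %d\n", bytes, node);
    numa_free(copy, bytes);
  }
  return 0;
}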
Example #2
void INTERNAL qt_affinity_set(qthread_worker_t *me,
                              unsigned int      Q_UNUSED(nw))
{                                      /*{{{ */
    assert(me);

    qthread_shepherd_t *const myshep = me->shepherd;

    /* It would be nice if we could do something more specific than
     * "numa_run_on_node", but because sched_etaffinity() is so dangerous, we
     * really can't, in good conscience. */
    qthread_debug(AFFINITY_FUNCTIONS, "calling numa_run_on_node(%i) for worker %i\n", myshep->node, me->packed_worker_id);
    int ret = numa_run_on_node(myshep->node);
    if (ret != 0) {
        numa_error("setting thread affinity");
        abort();
    }
    numa_set_localalloc();
}                                      /*}}} */
Example #3
/* static */
void ThreadPool::setThreadNodeAffinity(int numaNode)
{
#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
    GROUP_AFFINITY groupAffinity;
    if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
    {
        if (SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)groupAffinity.Mask))
            return;
    }
    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
#elif HAVE_LIBNUMA
    if (numa_available() >= 0)
    {
        numa_run_on_node(numaNode);
        numa_set_preferred(numaNode);
        numa_set_localalloc();
        return;
    }
    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
#else
    (void)numaNode;
#endif
}
Example #4
void gibbs(dd::CmdParser & cmd_parser){

  // number of NUMA nodes
  int n_numa_node = numa_max_node() + 1;
  // maximum number of threads per NUMA node
  int n_thread_per_numa = (sysconf(_SC_NPROCESSORS_CONF))/(n_numa_node);

  // get command line arguments
  std::string fg_file = cmd_parser.fg_file->getValue();

  std::string weight_file = cmd_parser.weight_file->getValue();
  std::string variable_file = cmd_parser.variable_file->getValue();
  std::string factor_file = cmd_parser.factor_file->getValue();
  std::string edge_file = cmd_parser.edge_file->getValue();
  std::string meta_file = cmd_parser.meta_file->getValue();

  std::string output_folder = cmd_parser.output_folder->getValue();

  int n_learning_epoch = cmd_parser.n_learning_epoch->getValue();
  int n_samples_per_learning_epoch = cmd_parser.n_samples_per_learning_epoch->getValue();
  int n_inference_epoch = cmd_parser.n_inference_epoch->getValue();

  double stepsize = cmd_parser.stepsize->getValue();
  double stepsize2 = cmd_parser.stepsize2->getValue();
  // hack to support two parameters to specify step size
  if (stepsize == 0.01) stepsize = stepsize2;
  double decay = cmd_parser.decay->getValue();

  int n_datacopy = cmd_parser.n_datacopy->getValue();
  double reg_param = cmd_parser.reg_param->getValue();
  double reg1_param = cmd_parser.reg1_param->getValue();
  bool is_quiet = cmd_parser.quiet->getValue();
  bool sample_evidence = cmd_parser.sample_evidence->getValue();
  int burn_in = cmd_parser.burn_in->getValue();
  bool learn_non_evidence = cmd_parser.learn_non_evidence->getValue();

  Meta meta = read_meta(fg_file); 

  if (is_quiet) {
    std::cout << "Running in quiet mode..." << std::endl;
  } else {
    std::cout << std::endl;
    std::cout << "#################MACHINE CONFIG#################" << std::endl;
    std::cout << "# # NUMA Node        : " << n_numa_node << std::endl;
    std::cout << "# # Thread/NUMA Node : " << n_thread_per_numa << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << std::endl;
    std::cout << "#################GIBBS SAMPLING#################" << std::endl;
    std::cout << "# fg_file            : " << fg_file << std::endl;
    std::cout << "# edge_file          : " << edge_file << std::endl;
    std::cout << "# weight_file        : " << weight_file << std::endl;
    std::cout << "# variable_file      : " << variable_file << std::endl;
    std::cout << "# factor_file        : " << factor_file << std::endl;
    std::cout << "# meta_file          : " << meta_file << std::endl;
    std::cout << "# output_folder      : " << output_folder << std::endl;
    std::cout << "# n_learning_epoch   : " << n_learning_epoch << std::endl;
    std::cout << "# n_samples/l. epoch : " << n_samples_per_learning_epoch << std::endl;
    std::cout << "# n_inference_epoch  : " << n_inference_epoch << std::endl;
    std::cout << "# stepsize           : " << stepsize << std::endl;
    std::cout << "# decay              : " << decay << std::endl;
    std::cout << "# regularization     : " << reg_param << std::endl;
    std::cout << "# l1 regularization     : " << reg1_param << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << "# IGNORE -s (n_samples/l. epoch). ALWAYS -s 1. #" << std::endl;
    std::cout << "# IGNORE -t (threads). ALWAYS USE ALL THREADS. #" << std::endl;
    std::cout << "################################################" << std::endl;


    std::cout << "# nvar               : " << meta.num_variables << std::endl;
    std::cout << "# nfac               : " << meta.num_factors << std::endl;
    std::cout << "# nweight            : " << meta.num_weights << std::endl;
    std::cout << "# nedge              : " << meta.num_edges << std::endl;
    std::cout << "################################################" << std::endl;
  }

  // run on NUMA node 0
  numa_run_on_node(0);
  numa_set_localalloc();

  // load factor graph
  dd::FactorGraph fg(meta.num_variables, meta.num_factors, meta.num_weights, meta.num_edges);
  fg.load(cmd_parser, is_quiet);
  dd::GibbsSampling gibbs(&fg, &cmd_parser, n_datacopy, sample_evidence, burn_in, learn_non_evidence);

  // number of learning epochs
  // the factor graph is copied on each NUMA node, so the total epochs =
  // epochs specified / number of NUMA nodes
  int numa_aware_n_learning_epoch = (int)(n_learning_epoch/n_numa_node) + 
                            (n_learning_epoch%n_numa_node==0?0:1);

  // learning
  /*gibbs.learn(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
    stepsize, decay, reg_param, reg1_param, meta_file, is_quiet);*/
  gibbs.learn(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
              stepsize, decay, reg_param, reg1_param, is_quiet);

  // dump weights
  gibbs.dump_weights(is_quiet);

  // number of inference epochs
  int numa_aware_n_epoch = (int)(n_inference_epoch/n_numa_node) + 
                            (n_inference_epoch%n_numa_node==0?0:1);

  // inference
  gibbs.inference(numa_aware_n_epoch, is_quiet);
  gibbs.aggregate_results_and_dump(is_quiet);


  // print weights from inference result
  for(long t=0;t<fg.n_weight;t++) {
      std::cout<<fg.infrs->weight_values[t]<<std::endl;
  }

  // print weights from factor graph
  std::cout<<"PRINTING WEIGHTS FROM WEIGHT VARIABLE"<<std::endl;
  for(long t=0;t<fg.n_weight;t++) {
      std::cout<<fg.weights[t].weight<<std::endl;
  }
}
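A tiny illustration (example values, not project code) of the epoch split computed above: each NUMA-node copy of the factor graph runs ceil(n_learning_epoch / n_numa_node) epochs, so the combined work across copies is at least the number of epochs requested.

#include <cstdio>

int main() {
  int n_learning_epoch = 10, n_numa_node = 3;                    // assumed example values
  int per_copy = n_learning_epoch / n_numa_node +
                 (n_learning_epoch % n_numa_node == 0 ? 0 : 1);  // ceiling division
  std::printf("%d epochs per NUMA-node copy\n", per_copy);       // prints 4
  return 0;
}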
Example #5
void em(dd::CmdParser & cmd_parser){

  // number of NUMA nodes
  int n_numa_node = numa_max_node() + 1;
  // maximum number of threads per NUMA node
  int n_thread_per_numa = (sysconf(_SC_NPROCESSORS_CONF))/(n_numa_node);

  // get command line arguments
  std::string fg_file = cmd_parser.fg_file->getValue();

  std::string weight_file = cmd_parser.weight_file->getValue();
  std::string variable_file = cmd_parser.variable_file->getValue();
  std::string factor_file = cmd_parser.factor_file->getValue();
  std::string edge_file = cmd_parser.edge_file->getValue();
  std::string meta_file = cmd_parser.meta_file->getValue();

  std::string output_folder = cmd_parser.output_folder->getValue();

  int n_learning_epoch = cmd_parser.n_learning_epoch->getValue();
  int n_samples_per_learning_epoch = cmd_parser.n_samples_per_learning_epoch->getValue();
  int n_inference_epoch = cmd_parser.n_inference_epoch->getValue();

  double stepsize = cmd_parser.stepsize->getValue();
  double stepsize2 = cmd_parser.stepsize2->getValue();
  // hack to support two parameters to specify step size
  if (stepsize == 0.01) stepsize = stepsize2;
  double decay = cmd_parser.decay->getValue();

  int n_datacopy = cmd_parser.n_datacopy->getValue();
  double reg_param = cmd_parser.reg_param->getValue();
  double reg1_param = cmd_parser.reg1_param->getValue();
  bool is_quiet = cmd_parser.quiet->getValue();
  bool check_convergence = cmd_parser.check_convergence->getValue();
  bool sample_evidence = cmd_parser.sample_evidence->getValue();
  int burn_in = cmd_parser.burn_in->getValue();
  int n_iter = cmd_parser.n_iter->getValue();
  int wl_conv = cmd_parser.wl_conv->getValue();
  int delta = cmd_parser.delta->getValue();
  bool learn_non_evidence = cmd_parser.learn_non_evidence->getValue();

  Meta meta = read_meta(fg_file);

  if (is_quiet) {
    std::cout << "Running in quiet mode..." << std::endl;
  } else {
    std::cout << std::endl;
    std::cout << "#################MACHINE CONFIG#################" << std::endl;
    std::cout << "# # NUMA Node        : " << n_numa_node << std::endl;
    std::cout << "# # Thread/NUMA Node : " << n_thread_per_numa << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << std::endl;
    std::cout << "#################GIBBS SAMPLING#################" << std::endl;
    std::cout << "# fg_file            : " << fg_file << std::endl;
    std::cout << "# edge_file          : " << edge_file << std::endl;
    std::cout << "# weight_file        : " << weight_file << std::endl;
    std::cout << "# variable_file      : " << variable_file << std::endl;
    std::cout << "# factor_file        : " << factor_file << std::endl;
    std::cout << "# meta_file          : " << meta_file << std::endl;
    std::cout << "# output_folder      : " << output_folder << std::endl;
    std::cout << "# n_learning_epoch   : " << n_learning_epoch << std::endl;
    std::cout << "# n_samples/l. epoch : " << n_samples_per_learning_epoch << std::endl;
    std::cout << "# n_inference_epoch  : " << n_inference_epoch << std::endl;
    std::cout << "# stepsize           : " << stepsize << std::endl;
    std::cout << "# decay              : " << decay << std::endl;
    std::cout << "# regularization     : " << reg_param << std::endl;
    std::cout << "# l1 regularization     : " << reg1_param << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << "# IGNORE -s (n_samples/l. epoch). ALWAYS -s 1. #" << std::endl;
    std::cout << "# IGNORE -t (threads). ALWAYS USE ALL THREADS. #" << std::endl;
    std::cout << "################################################" << std::endl;


    std::cout << "# nvar               : " << meta.num_variables << std::endl;
    std::cout << "# nfac               : " << meta.num_factors << std::endl;
    std::cout << "# nweight            : " << meta.num_weights << std::endl;
    std::cout << "# nedge              : " << meta.num_edges << std::endl;
    std::cout << "################################################" << std::endl;
  }

  // run on NUMA node 0
  numa_run_on_node(0);
  numa_set_localalloc();

  // load factor graph
  dd::FactorGraph fg(meta.num_variables, meta.num_factors, meta.num_weights, meta.num_edges);
  fg.load(cmd_parser, is_quiet);
  dd::GibbsSampling gibbs(&fg, &cmd_parser, n_datacopy, sample_evidence, burn_in, learn_non_evidence);

  // Initialize EM instance
  dd::ExpMax expMax(&fg, &gibbs, wl_conv, delta, check_convergence);

  // number of inference epochs
  int numa_aware_n_epoch;
  int numa_aware_n_learning_epoch;

  // EM init -- run Maximization (semi-supervised learning)

  // Maximization step
  numa_aware_n_learning_epoch = (int)(n_learning_epoch/n_numa_node) +
                                (n_learning_epoch%n_numa_node==0?0:1);
  expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
                      stepsize, decay, reg_param, reg1_param, is_quiet);
  /*expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
                      stepsize, decay, reg_param, reg1_param, meta_file, is_quiet);*/

  while (!expMax.hasConverged && n_iter > 0) {

    // Expectation step
    numa_aware_n_epoch = (int)(n_inference_epoch/n_numa_node) +
                         (n_inference_epoch%n_numa_node==0?0:1);
    expMax.expectation(numa_aware_n_epoch,is_quiet);


    // Maximization step
    numa_aware_n_learning_epoch = (int)(n_learning_epoch/n_numa_node) +
                                  (n_learning_epoch%n_numa_node==0?0:1);
    /*expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
                        stepsize, decay, reg_param, reg1_param, meta_file, is_quiet);*/
    expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
                        stepsize, decay, reg_param, reg1_param, is_quiet);

    //Decrement iteration counter
    n_iter--;
  }

  expMax.dump_weights(is_quiet);
  expMax.aggregate_results_and_dump(is_quiet);


}
Example #6
/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    setLocalAlloc
 * Signature: ()V
 */
JNIEXPORT void JNICALL Java_xerial_jnuma_NumaNative_setLocalAlloc
    (JNIEnv *env, jobject obj) {
  numa_set_localalloc();
}
Example #7
/**
   Closed loop simulation main loop. It calls init_simu() to initialize the
   simulation struct. Then calls genatm() to generate atmospheric turbulence
   screens. Then for every time step, it calls perfevl() to evaluate
   performance, wfsgrad() to do wfs measurement, reconstruct() to do tomography
   and DM fit, filter() to do DM command filtering. In MOAO mode, it calls
   moao_recon() for MOAO DM fitting.  \callgraph */
void maos_sim(){
    const PARMS_T *parms=global->parms;
    POWFS_T *powfs=global->powfs;
    RECON_T *recon=global->recon;
    APER_T *aper=global->aper;
    int simend=parms->sim.end;
    int simstart=parms->sim.start;
    if(parms->sim.skysim){
	save_skyc(powfs,recon,parms);
    }
    if(parms->evl.psfmean || parms->evl.psfhist){
	/*compute diffraction limited PSF. Save to output directory.*/
	dmat *iopdevl=dnew(aper->locs->nloc,1);
	ccell *psf2s=0;
	locfft_psf(&psf2s, aper->embed, iopdevl, parms->evl.psfsize, 0);
	const int nwvl=parms->evl.nwvl;
	dcell *evlpsfdl=dcellnew(nwvl,1);
	for(int iwvl=0; iwvl<nwvl; iwvl++){
	    cabs22d(&evlpsfdl->p[iwvl], 1, psf2s->p[iwvl], 1);
	    evlpsfdl->p[iwvl]->header=evl_header(parms, aper, -1, iwvl);
	}
	ccellfree(psf2s);
	writebin(evlpsfdl, "evlpsfdl.fits");
	dcellfree(evlpsfdl);
	dfree(iopdevl);
    }
    info2("PARALLEL=%d\n", PARALLEL);
    if(simstart>=simend) return;
    double restot=0; long rescount=0;
    for(int iseed=0; iseed<parms->sim.nseed; iseed++){
	SIM_T *simu=0;
	while(!(simu=maos_iseed(iseed))){
	    iseed++;
	}
#ifdef HAVE_NUMA_H
	numa_set_localalloc();
#endif
	for(int isim=simstart; isim<simend; isim++){
	    maos_isim(isim);
	    if(parms->sim.pause){
		mypause();
	    }
	}/*isim */
	{
	    /*Compute average performance*/
	    int isim0;
	    if(parms->sim.closeloop){
		if(parms->sim.end>100){
		    isim0=MAX(50,parms->sim.end/10);
		}else{
		    isim0=MIN(20, parms->sim.end/2);
		}
	    }else{
		isim0=0;
	    }
	    double sum=0;
	    for(int i=isim0; i<parms->sim.end; i++){
		sum+=simu->cle->p[i*parms->evl.nmod];
	    }
	    restot+=sum/(parms->sim.end-isim0);
	    rescount++;
	}
	free_simu(simu);
	global->simu=0;
    }/*seed */
    printf("%g\n", sqrt(restot/rescount)*1e9);
}
Example #8
int main(int ac, char **av)
{
	int c, i, nnodes=0;
	long node=-1;
	char *end;
	char shortopts[array_len(opts)*2 + 1];
	struct bitmask *mask = NULL;

	get_short_opts(opts,shortopts);
	while ((c = getopt_long(ac, av, shortopts, opts, NULL)) != -1) {
		switch (c) {
		case 's': /* --show */
			show();
			exit(0);
		case 'H': /* --hardware */
			nopolicy();
			hardware();
			exit(0);
		case 'i': /* --interleave */
			checknuma();
			mask = numactl_parse_nodestring(optarg);
			if (!mask) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}

			errno = 0;
			setpolicy(MPOL_INTERLEAVE);
			if (shmfd >= 0)
				numa_interleave_memory(shmptr, shmlen, mask);
			else
				numa_set_interleave_mask(mask);
			checkerror("setting interleave mask");
			break;
		case 'N': /* --cpunodebind */
		case 'c': /* --cpubind */
			dontshm("-c/--cpubind/--cpunodebind");
			checknuma();
			mask = numactl_parse_nodestring(optarg);
			if (!mask) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			errno = 0;
			check_cpubind(do_shm);
			did_cpubind = 1;
			numa_run_on_node_mask(mask);
			checkerror("sched_setaffinity");
			break;
		case 'C': /* --physcpubind */
		{
			struct bitmask *cpubuf;
			dontshm("-C/--physcpubind");
			cpubuf = numa_parse_cpustring(optarg);
			if (!cpubuf) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			errno = 0;
			check_cpubind(do_shm);
			did_cpubind = 1;
			numa_sched_setaffinity(0, cpubuf);
			checkerror("sched_setaffinity");
			free(cpubuf);
			break;
		}
		case 'm': /* --membind */
			checknuma();
			setpolicy(MPOL_BIND);
			mask = numactl_parse_nodestring(optarg);
			if (!mask) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			errno = 0;
			numa_set_bind_policy(1);
			if (shmfd >= 0) {
				numa_tonodemask_memory(shmptr, shmlen, mask);
			} else {
				numa_set_membind(mask);
			}
			numa_set_bind_policy(0);
			checkerror("setting membind");
			break;
		case 'p': /* --preferred */
			checknuma();
			setpolicy(MPOL_PREFERRED);
			mask = numactl_parse_nodestring(optarg);
			if (!mask) {
				printf ("<%s> is invalid\n", optarg);
				usage();
			}
			for (i=0; i<mask->size; i++) {
				if (numa_bitmask_isbitset(mask, i)) {
					node = i;
					nnodes++;
				}
			}
			if (nnodes != 1)
				usage();
			numa_bitmask_free(mask);
			errno = 0;
			numa_set_bind_policy(0);
			if (shmfd >= 0)
				numa_tonode_memory(shmptr, shmlen, node);
			else
				numa_set_preferred(node);
			checkerror("setting preferred node");
			break;
		case 'l': /* --local */
			checknuma();
			setpolicy(MPOL_DEFAULT);
			errno = 0;
			if (shmfd >= 0)
				numa_setlocal_memory(shmptr, shmlen);
			else
				numa_set_localalloc();
			checkerror("local allocation");
			break;
		case 'S': /* --shm */
			check_cpubind(did_cpubind);
			nopolicy();
			attach_sysvshm(optarg, "--shm");
			shmattached = 1;
			break;
		case 'f': /* --file */
			check_cpubind(did_cpubind);
			nopolicy();
			attach_shared(optarg, "--file");
			shmattached = 1;
			break;
		case 'L': /* --length */
			noshm("--length");
			shmlen = memsize(optarg);
			break;
		case 'M': /* --shmmode */
			noshm("--shmmode");
			shmmode = strtoul(optarg, &end, 8);
			if (end == optarg || *end)
				usage();
			break;
		case 'd': /* --dump */
			if (shmfd < 0)
				complain(
				"Cannot do --dump without shared memory.\n");
			dump_shm();
			do_dump = 1;
			break;
		case 'D': /* --dump-nodes */
			if (shmfd < 0)
				complain(
			    "Cannot do --dump-nodes without shared memory.\n");
			dump_shm_nodes();
			do_dump = 1;
			break;
		case 't': /* --strict */
			did_strict = 1;
			numa_set_strict(1);
			break;
		case 'I': /* --shmid */
			shmid = strtoul(optarg, &end, 0);
			if (end == optarg || *end)
				usage();
			break;

		case 'u': /* --huge */
			noshm("--huge");
			shmflags |= SHM_HUGETLB;
			break;

		case 'o':  /* --offset */
			noshm("--offset");
			shmoffset = memsize(optarg);
			break;			

		case 'T': /* --touch */
			needshm("--touch");
			check_shmbeyond("--touch");
			numa_police_memory(shmptr, shmlen);
			break;

		case 'V': /* --verify */
			needshm("--verify");
			if (set_policy < 0)
				complain("Need a policy first to verify");
			check_shmbeyond("--verify");
			numa_police_memory(shmptr, shmlen);
			if (!mask)
				complain("Need a mask to verify");
			else
				verify_shm(set_policy, mask);
			break;

		default:
			usage();
		}
	}

	av += optind;
	ac -= optind;

	if (shmfd >= 0) {
		if (*av)
			usage();
		exit(exitcode);
	}

	if (did_strict)
		fprintf(stderr,
			"numactl: warning. Strict flag for process ignored.\n");

	if (do_dump)
		usage_msg("cannot do --dump|--dump-shm for process");

	if (shmoption)
		usage_msg("shm related option %s for process", shmoption);
	
	if (*av == NULL)
		usage();
	execvp(*av, av);
	complain("execution of `%s': %s\n", av[0], strerror(errno));
	return 0; /* not reached */
}
Example #9
int main(int argc, const char **argv)
{
  int num_cpus = numa_num_task_cpus();
  printf("num cpus: %d\n", num_cpus);

  printf("numa available: %d\n", numa_available());
  numa_set_localalloc();

  struct bitmask *bm = numa_bitmask_alloc(num_cpus);
  for (int i=0; i<=numa_max_node(); ++i)
  {
    numa_node_to_cpus(i, bm);
    printf("numa node %d ", i);
    print_bitmask(bm);
    printf(" - %g GiB\n", numa_node_size(i, 0) / (1024.*1024*1024.));
  }
  numa_bitmask_free(bm);

  puts("");

  char *x;
  const size_t cache_line_size = 64;
  const size_t array_size = 100*1000*1000;
  size_t ntrips = 2;

#pragma omp parallel
  {
    assert(omp_get_num_threads() == num_cpus);
    int tid = omp_get_thread_num();

    pin_to_core(tid);
    if(tid == 0)
      x = (char *) numa_alloc_local(array_size);

    // {{{ single access
#pragma omp barrier
    for (size_t i = 0; i<num_cpus; ++i)
    {
      if (tid == i)
      {
        double t = measure_access(x, array_size, ntrips);
        printf("sequential core %d -> core 0 : BW %g MB/s\n",
            i, array_size*ntrips*cache_line_size / t / 1e6);
      }
#pragma omp barrier
    }
    // }}}

    // {{{ everybody contends for one

    {
      if (tid == 0) puts("");

#pragma omp barrier
      double t = measure_access(x, array_size, ntrips);
#pragma omp barrier
      for (size_t i = 0; i<num_cpus; ++i)
      {
        if (tid == i)
          printf("all-contention core %d -> core 0 : BW %g MB/s\n",
              tid, array_size*ntrips*cache_line_size / t / 1e6);
#pragma omp barrier
      }
    }

    // }}}

    // {{{ zero and someone else contending

    if (tid == 0) puts("");

#pragma omp barrier
    for (size_t i = 1; i<num_cpus; ++i)
    {
      double t;
      if (tid == i || tid == 0)
        t = measure_access(x, array_size, ntrips);

#pragma omp barrier
      if (tid == 0)
      {
        printf("two-contention core %d -> core 0 : BW %g MB/s\n",
            tid, array_size*ntrips*cache_line_size / t / 1e6);
      }
#pragma omp barrier
      if (tid == i)
      {
        printf("two-contention core %d -> core 0 : BW %g MB/s\n\n",
            tid, array_size*ntrips*cache_line_size / t / 1e6);
      }
#pragma omp barrier
    }
  }
  numa_free(x, array_size);

  return 0;
}
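The benchmark above assumes helpers (pin_to_core(), measure_access(), print_bitmask()) and headers that are defined elsewhere in its original source. A hypothetical sketch of the pin_to_core() helper, assuming a Linux/glibc environment where pthread_setaffinity_np() is available, could look like the following; it is not the benchmark's actual implementation.

#include <pthread.h>
#include <sched.h>

// Bind the calling thread to a single CPU core (hypothetical helper).
// cpu_set_t and the CPU_* macros require _GNU_SOURCE when compiling as C;
// g++ on Linux defines it by default.
static void pin_to_core(int core) {
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(core, &set);
  pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &set);
}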