Example #1
0
  int EnterTaskManager ()
  {
    if (task_manager)
      {
        // no task manager started
        return 0;
      }

    task_manager = new TaskManager();

    cout << IM(3) << "task-based parallelization (C++11 threads) using "<< task_manager->GetNumThreads() << " threads" << endl;

#ifdef USE_NUMA
    numa_run_on_node (0);
#endif

#ifndef WIN32
    // master has maximal priority !
    int policy;
    struct sched_param param;
    pthread_getschedparam(pthread_self(), &policy, &param);
    param.sched_priority = sched_get_priority_max(policy);
    pthread_setschedparam(pthread_self(), policy, &param);
#endif // WIN32

    
    task_manager->StartWorkers();

    ParallelFor (Range(100), [&] (int i) { ; });    // startup
    return task_manager->GetNumThreads();
  }
int main(int argc, char** argv)
{
	if(argc < 3)
	{	
		std::cout << "Usage:" << std::endl
			  << "gland <localNodeId> <remoteNodeId> [--human]" <<std::endl;
		exit(EXIT_FAILURE);
	}
	if(argc == 4 && strncmp(argv[3],"--human",7) == 0)
	{
		std::cout << "enabling human readable output" << std::endl;
		humanreadable = true;
	}

	size_t localNode = std::stoi(argv[1]);
	size_t remoteNode = std::stoi(argv[2]);
	
	numa_run_on_node(localNode);
	
	double *a = (double*) numa_alloc_onnode( N * sizeof(double), localNode );
	double *b = (double*) numa_alloc_onnode( N * sizeof(double), remoteNode );

	for(int i = 0; i<N ; i++)
	{
		a[i]=(double)i;
		b[i]=(double)-i;
	}

	while(1) memcpy_task(a, b);
}
Example #3
0
dd::GibbsSampling::GibbsSampling(FactorGraph * const _p_fg, 
  CmdParser * const _p_cmd_parser, int n_datacopy, bool sample_evidence,
  int burn_in, bool learn_non_evidence) 
  : p_fg(_p_fg), p_cmd_parser(_p_cmd_parser), sample_evidence(sample_evidence),
    burn_in(burn_in), learn_non_evidence(learn_non_evidence) {
    // the highest node number available
    n_numa_nodes = numa_max_node(); 

    // if n_datacopy is valid, use it, otherwise, use numa_max_node
    if (n_datacopy >= 1 && n_datacopy <= n_numa_nodes + 1) {
      n_numa_nodes = n_datacopy - 1;
    }

    // max possible threads per NUMA node
    n_thread_per_numa = (sysconf(_SC_NPROCESSORS_CONF))/(n_numa_nodes+1);

    this->factorgraphs.push_back(*p_fg);

    // copy factor graphs
    for(int i=1;i<=n_numa_nodes;i++){

      numa_run_on_node(i);
      numa_set_localalloc();

      std::cout << "CREATE FG ON NODE ..." <<  i << std::endl;
      dd::FactorGraph fg(p_fg->n_var, p_fg->n_factor, p_fg->n_weight, p_fg->n_edge);
      
      fg.copy_from(p_fg);

      this->factorgraphs.push_back(fg);
    }
  };
Example #4
0
static void send_from_node(int node_id, int family, int proto)
{
	struct sockaddr_storage saddr, daddr;
	struct sockaddr_in  *saddr4, *daddr4;
	struct sockaddr_in6 *saddr6, *daddr6;
	int fd;

	switch (family) {
	case AF_INET:
		saddr4 = (struct sockaddr_in *)&saddr;
		saddr4->sin_family = AF_INET;
		saddr4->sin_addr.s_addr = htonl(INADDR_ANY);
		saddr4->sin_port = 0;

		daddr4 = (struct sockaddr_in *)&daddr;
		daddr4->sin_family = AF_INET;
		daddr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		daddr4->sin_port = htons(PORT);
		break;
	case AF_INET6:
		saddr6 = (struct sockaddr_in6 *)&saddr;
		saddr6->sin6_family = AF_INET6;
		saddr6->sin6_addr = in6addr_any;
		saddr6->sin6_port = 0;

		daddr6 = (struct sockaddr_in6 *)&daddr;
		daddr6->sin6_family = AF_INET6;
		daddr6->sin6_addr = in6addr_loopback;
		daddr6->sin6_port = htons(PORT);
		break;
	default:
		error(1, 0, "Unsupported family %d", family);
	}

	if (numa_run_on_node(node_id) < 0)
		error(1, errno, "failed to pin to node");

	fd = socket(family, proto, 0);
	if (fd < 0)
		error(1, errno, "failed to create send socket");

	if (bind(fd, (struct sockaddr *)&saddr, sizeof(saddr)))
		error(1, errno, "failed to bind send socket");

	if (connect(fd, (struct sockaddr *)&daddr, sizeof(daddr)))
		error(1, errno, "failed to connect send socket");

	if (send(fd, "a", 1, 0) < 0)
		error(1, errno, "failed to send message");

	close(fd);
}
Example #5
0
  void ConfigureTableThread() {
    int32_t idx = ThreadContext::get_id() - GlobalContext::get_head_table_thread_id();
    int32_t node_id = idx % num_mem_nodes_;
    CHECK_EQ(numa_run_on_node(node_id), 0);

    struct bitmask *mask = numa_allocate_nodemask();
    mask = numa_bitmask_setbit(mask, node_id);

    // set NUMA zone binding to be prefer
    numa_set_bind_policy(0);
    numa_set_membind(mask);
    numa_free_nodemask(mask);
  }
Example #6
0
// worker thread code
void* ThreadFunction( void* p )
{
    struct TArgs * pargs = (struct TArgs*)(p);
    
    // bind to specific NUMA node
    numa_run_on_node( pargs->node );
    
    // create a local random list
    char * rand_list = (char*)mai_malloc(MAJOR_COUNT);
    if( rand_list == NULL )
    {
        printf( "Unable to allocated random list!\n" );
        return NULL;
    }
    memcpy( rand_list, rand_nums + ( MAJOR_COUNT * pargs->no ), MAJOR_COUNT );

    // get the start tick count
    unsigned long long tstart = get_rdtsc_sys();
    
    // repeat the tests
    int repcount = 0;
    for(repcount = 0; repcount < MAJOR_COUNT; repcount++ )
    {
        // allocate random size memory block
        int sizeclass = rand_list[repcount];
        size_t block_size = 1 << sizeclass;
        char * pdata = (char *)mai_malloc( block_size );
        if( pdata == NULL )
        {
            printf( "thread %d failed to allocate %d bytes of memory!\n", pargs->no, block_size );
            continue;
        }
        
        // access the memory
         int q;
        for(q = 0; q < MINOR_COUNT; q++ )
            AccessMemory( pdata, block_size );

        // free the memory
       mai_free( pdata );
    }
    
    // get the end tick count
    unsigned long long tend = get_rdtsc_sys();
    
    // calculate the tick count needed to complete the tests
    pargs->tdiff = tend - tstart;
    
    // free the random list
   mai_free( rand_list );
}
Example #7
0
    void numaLoop(int numaNode, int numberOfThreads)
    {
        m_timers.start(omp_get_max_threads());

#pragma omp parallel num_threads(numberOfThreads)
        {
            // tie thread to a NUMA node
            numa_run_on_node(numaNode);
            numa_set_preferred(numaNode);

            executeThreadWork();
        }   // end omp parallel

        m_timers.stop();

        assert(m_queue.unsafe_size() == 0);
    }
Example #8
0
void INTERNAL qt_affinity_set(qthread_worker_t *me,
                              unsigned int      Q_UNUSED(nw))
{                                      /*{{{ */
    assert(me);

    qthread_shepherd_t *const myshep = me->shepherd;

    /* It would be nice if we could do something more specific than
     * "numa_run_on_node", but because sched_etaffinity() is so dangerous, we
     * really can't, in good conscience. */
    qthread_debug(AFFINITY_FUNCTIONS, "calling numa_run_on_node(%i) for worker %i\n", myshep->node, me->packed_worker_id);
    int ret = numa_run_on_node(myshep->node);
    if (ret != 0) {
        numa_error("setting thread affinity");
	abort();
    }
    numa_set_localalloc();
}                                      /*}}} */
Example #9
0
    void loop()
    {
        m_timers.start(omp_get_max_threads());
        m_idle_count = 0;

#pragma omp parallel
        {
            if (gopt_memory_type == "mmap_node0")
            {
                // tie thread to first NUMA node
                numa_run_on_node(0);
                numa_set_preferred(0);
            }

            executeThreadWork();
        }   // end omp parallel

        m_timers.stop();

        assert(m_queue.unsafe_size() == 0);
    }
Example #10
0
static void set_affinity(pid_t tid, int cpu_id) {
    if(!get_shm()->active)
        return;

    if(!get_shm()->per_node) {
        cpu_set_t mask;
        CPU_ZERO(&mask);
        CPU_SET(cpu_id, &mask);
        VERBOSE("--> Setting tid %d on core %d\n", tid, cpu_id);
        int r = old_sched_setaffinity(tid, sizeof(mask), &mask);
        if (r < 0) {
            fprintf(stderr, "couldn't set affinity on %d\n", cpu_id);
            exit(1);
        }
    } else {
        int r = numa_run_on_node(numa_node_of_cpu(cpu_id));
        if(r < 0) {
            fprintf(stderr, "couldn't set affinity on node of cpu %d\n", cpu_id);
            exit(1);
        }
    }
}
Example #11
0
/* static */
void ThreadPool::setThreadNodeAffinity(int numaNode)
{
#if defined(_WIN32_WINNT) && _WIN32_WINNT >= _WIN32_WINNT_WIN7 
    GROUP_AFFINITY groupAffinity;
    if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity))
    {
        if (SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)groupAffinity.Mask))
            return;
    }
    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
#elif HAVE_LIBNUMA
    if (numa_available() >= 0)
    {
        numa_run_on_node(numaNode);
        numa_set_preferred(numaNode);
        numa_set_localalloc();
        return;
    }
    x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode);
#else
    (void)numaNode;
#endif
}
Example #12
0
void gibbs(dd::CmdParser & cmd_parser){

  // number of NUMA nodes
  int n_numa_node = numa_max_node() + 1;
  // number of max threads per NUMA node
  int n_thread_per_numa = (sysconf(_SC_NPROCESSORS_CONF))/(n_numa_node);

  // get command line arguments
  std::string fg_file = cmd_parser.fg_file->getValue();

  std::string weight_file = cmd_parser.weight_file->getValue();
  std::string variable_file = cmd_parser.variable_file->getValue();
  std::string factor_file = cmd_parser.factor_file->getValue();
  std::string edge_file = cmd_parser.edge_file->getValue();
  std::string meta_file = cmd_parser.meta_file->getValue();

  std::string output_folder = cmd_parser.output_folder->getValue();

  int n_learning_epoch = cmd_parser.n_learning_epoch->getValue();
  int n_samples_per_learning_epoch = cmd_parser.n_samples_per_learning_epoch->getValue();
  int n_inference_epoch = cmd_parser.n_inference_epoch->getValue();

  double stepsize = cmd_parser.stepsize->getValue();
  double stepsize2 = cmd_parser.stepsize2->getValue();
  // hack to support two parameters to specify step size
  if (stepsize == 0.01) stepsize = stepsize2;
  double decay = cmd_parser.decay->getValue();

  int n_datacopy = cmd_parser.n_datacopy->getValue();
  double reg_param = cmd_parser.reg_param->getValue();
  double reg1_param = cmd_parser.reg1_param->getValue();
  bool is_quiet = cmd_parser.quiet->getValue();
  bool sample_evidence = cmd_parser.sample_evidence->getValue();
  int burn_in = cmd_parser.burn_in->getValue();
  bool learn_non_evidence = cmd_parser.learn_non_evidence->getValue();

  Meta meta = read_meta(fg_file); 

  if (is_quiet) {
    std::cout << "Running in quiet mode..." << std::endl;
  } else {
    std::cout << std::endl;
    std::cout << "#################MACHINE CONFIG#################" << std::endl;
    std::cout << "# # NUMA Node        : " << n_numa_node << std::endl;
    std::cout << "# # Thread/NUMA Node : " << n_thread_per_numa << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << std::endl;
    std::cout << "#################GIBBS SAMPLING#################" << std::endl;
    std::cout << "# fg_file            : " << fg_file << std::endl;
    std::cout << "# edge_file          : " << edge_file << std::endl;
    std::cout << "# weight_file        : " << weight_file << std::endl;
    std::cout << "# variable_file      : " << variable_file << std::endl;
    std::cout << "# factor_file        : " << factor_file << std::endl;
    std::cout << "# meta_file          : " << meta_file << std::endl;
    std::cout << "# output_folder      : " << output_folder << std::endl;
    std::cout << "# n_learning_epoch   : " << n_learning_epoch << std::endl;
    std::cout << "# n_samples/l. epoch : " << n_samples_per_learning_epoch << std::endl;
    std::cout << "# n_inference_epoch  : " << n_inference_epoch << std::endl;
    std::cout << "# stepsize           : " << stepsize << std::endl;
    std::cout << "# decay              : " << decay << std::endl;
    std::cout << "# regularization     : " << reg_param << std::endl;
    std::cout << "# l1 regularization     : " << reg1_param << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << "# IGNORE -s (n_samples/l. epoch). ALWAYS -s 1. #" << std::endl;
    std::cout << "# IGNORE -t (threads). ALWAYS USE ALL THREADS. #" << std::endl;
    std::cout << "################################################" << std::endl;


    std::cout << "# nvar               : " << meta.num_variables << std::endl;
    std::cout << "# nfac               : " << meta.num_factors << std::endl;
    std::cout << "# nweight            : " << meta.num_weights << std::endl;
    std::cout << "# nedge              : " << meta.num_edges << std::endl;
    std::cout << "################################################" << std::endl;
  }

  // run on NUMA node 0
  numa_run_on_node(0);
  numa_set_localalloc();

  // load factor graph
  dd::FactorGraph fg(meta.num_variables, meta.num_factors, meta.num_weights, meta.num_edges);
  fg.load(cmd_parser, is_quiet);
  dd::GibbsSampling gibbs(&fg, &cmd_parser, n_datacopy, sample_evidence, burn_in, learn_non_evidence);

  // number of learning epochs
  // the factor graph is copied on each NUMA node, so the total epochs =
  // epochs specified / number of NUMA nodes
  int numa_aware_n_learning_epoch = (int)(n_learning_epoch/n_numa_node) + 
                            (n_learning_epoch%n_numa_node==0?0:1);

  // learning
  /*gibbs.learn(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
    stepsize, decay, reg_param, reg1_param, meta_file, is_quiet);*/
  gibbs.learn(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
              stepsize, decay, reg_param, reg1_param, is_quiet);

  // dump weights
  gibbs.dump_weights(is_quiet);

  // number of inference epochs
  int numa_aware_n_epoch = (int)(n_inference_epoch/n_numa_node) + 
                            (n_inference_epoch%n_numa_node==0?0:1);

  // inference
  gibbs.inference(numa_aware_n_epoch, is_quiet);
  gibbs.aggregate_results_and_dump(is_quiet);


  // print weights from inference result
  for(long t=0;t<fg.n_weight;t++) {
      std::cout<<fg.infrs->weight_values[t]<<std::endl;
  }

  // print weights from factor graph
  std::cout<<"PRINTING WEIGHTS FROM WEIGHT VARIABLE"<<std::endl;
  for(long t=0;t<fg.n_weight;t++) {
      std::cout<<fg.weights[t].weight<<std::endl;
  }
}
Example #13
0
void em(dd::CmdParser & cmd_parser){

  // number of NUMA nodes
  int n_numa_node = numa_max_node() + 1;
  // number of max threads per NUMA node
  int n_thread_per_numa = (sysconf(_SC_NPROCESSORS_CONF))/(n_numa_node);

  // get command line arguments
  std::string fg_file = cmd_parser.fg_file->getValue();

  std::string weight_file = cmd_parser.weight_file->getValue();
  std::string variable_file = cmd_parser.variable_file->getValue();
  std::string factor_file = cmd_parser.factor_file->getValue();
  std::string edge_file = cmd_parser.edge_file->getValue();
  std::string meta_file = cmd_parser.meta_file->getValue();

  std::string output_folder = cmd_parser.output_folder->getValue();

  int n_learning_epoch = cmd_parser.n_learning_epoch->getValue();
  int n_samples_per_learning_epoch = cmd_parser.n_samples_per_learning_epoch->getValue();
  int n_inference_epoch = cmd_parser.n_inference_epoch->getValue();

  double stepsize = cmd_parser.stepsize->getValue();
  double stepsize2 = cmd_parser.stepsize2->getValue();
  // hack to support two parameters to specify step size
  if (stepsize == 0.01) stepsize = stepsize2;
  double decay = cmd_parser.decay->getValue();

  int n_datacopy = cmd_parser.n_datacopy->getValue();
  double reg_param = cmd_parser.reg_param->getValue();
  double reg1_param = cmd_parser.reg1_param->getValue();
  bool is_quiet = cmd_parser.quiet->getValue();
  bool check_convergence = cmd_parser.check_convergence->getValue();
  bool sample_evidence = cmd_parser.sample_evidence->getValue();
  int burn_in = cmd_parser.burn_in->getValue();
  int n_iter = cmd_parser.n_iter->getValue();
  int wl_conv = cmd_parser.wl_conv->getValue();
  int delta = cmd_parser.delta->getValue();
  bool learn_non_evidence = cmd_parser.learn_non_evidence->getValue();

  Meta meta = read_meta(fg_file);

  if (is_quiet) {
    std::cout << "Running in quiet mode..." << std::endl;
  } else {
    std::cout << std::endl;
    std::cout << "#################MACHINE CONFIG#################" << std::endl;
    std::cout << "# # NUMA Node        : " << n_numa_node << std::endl;
    std::cout << "# # Thread/NUMA Node : " << n_thread_per_numa << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << std::endl;
    std::cout << "#################GIBBS SAMPLING#################" << std::endl;
    std::cout << "# fg_file            : " << fg_file << std::endl;
    std::cout << "# edge_file          : " << edge_file << std::endl;
    std::cout << "# weight_file        : " << weight_file << std::endl;
    std::cout << "# variable_file      : " << variable_file << std::endl;
    std::cout << "# factor_file        : " << factor_file << std::endl;
    std::cout << "# meta_file          : " << meta_file << std::endl;
    std::cout << "# output_folder      : " << output_folder << std::endl;
    std::cout << "# n_learning_epoch   : " << n_learning_epoch << std::endl;
    std::cout << "# n_samples/l. epoch : " << n_samples_per_learning_epoch << std::endl;
    std::cout << "# n_inference_epoch  : " << n_inference_epoch << std::endl;
    std::cout << "# stepsize           : " << stepsize << std::endl;
    std::cout << "# decay              : " << decay << std::endl;
    std::cout << "# regularization     : " << reg_param << std::endl;
    std::cout << "# l1 regularization     : " << reg1_param << std::endl;
    std::cout << "################################################" << std::endl;
    std::cout << "# IGNORE -s (n_samples/l. epoch). ALWAYS -s 1. #" << std::endl;
    std::cout << "# IGNORE -t (threads). ALWAYS USE ALL THREADS. #" << std::endl;
    std::cout << "################################################" << std::endl;


    std::cout << "# nvar               : " << meta.num_variables << std::endl;
    std::cout << "# nfac               : " << meta.num_factors << std::endl;
    std::cout << "# nweight            : " << meta.num_weights << std::endl;
    std::cout << "# nedge              : " << meta.num_edges << std::endl;
    std::cout << "################################################" << std::endl;
  }

  // run on NUMA node 0
  numa_run_on_node(0);
  numa_set_localalloc();

  // load factor graph
  dd::FactorGraph fg(meta.num_variables, meta.num_factors, meta.num_weights, meta.num_edges);
  fg.load(cmd_parser, is_quiet);
  dd::GibbsSampling gibbs(&fg, &cmd_parser, n_datacopy, sample_evidence, burn_in, learn_non_evidence);

  // Initialize EM instance
  dd::ExpMax expMax(&fg, &gibbs, wl_conv, delta, check_convergence);

  // number of inference epochs
  int numa_aware_n_epoch;
  int numa_aware_n_learning_epoch;

  // EM init -- run Maximzation (semi-supervised learning)

  // Maximization step
  numa_aware_n_learning_epoch = (int)(n_learning_epoch/n_numa_node) +
                                (n_learning_epoch%n_numa_node==0?0:1);
  expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
                      stepsize, decay, reg_param, reg1_param, is_quiet);
  /*expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
                      stepsize, decay, reg_param, reg1_param, meta_file, is_quiet);*/

  while (!expMax.hasConverged && n_iter > 0) {

    // Expectation step
    numa_aware_n_epoch = (int)(n_inference_epoch/n_numa_node) +
                         (n_inference_epoch%n_numa_node==0?0:1);
    expMax.expectation(numa_aware_n_epoch,is_quiet);


    // Maximization step
    numa_aware_n_learning_epoch = (int)(n_learning_epoch/n_numa_node) +
                                  (n_learning_epoch%n_numa_node==0?0:1);
    /*expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
                        stepsize, decay, reg_param, reg1_param, meta_file, is_quiet);*/
    expMax.maximization(numa_aware_n_learning_epoch, n_samples_per_learning_epoch,
                        stepsize, decay, reg_param, reg1_param, is_quiet);

    //Decrement iteration counter
    n_iter--;
  }

  expMax.dump_weights(is_quiet);
  expMax.aggregate_results_and_dump(is_quiet);


}
Example #14
0
/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    runOnNode
 * Signature: (I)V
 */
JNIEXPORT void JNICALL Java_xerial_jnuma_NumaNative_runOnNode
    (JNIEnv *env, jobject obj, jint node) {
  numa_run_on_node((int) node);
}
Example #15
0
void initQuda(int dev)
{
  static int initialized = 0;
  if (initialized) {
    return;
  }
  initialized = 1;

#if (CUDA_VERSION >= 4000) && defined(MULTI_GPU)
  //check if CUDA_NIC_INTEROP is set to 1 in the enviroment
  char* cni_str = getenv("CUDA_NIC_INTEROP");
  if(cni_str == NULL){
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set\n");
  }
  int cni_int = atoi(cni_str);
  if (cni_int != 1){
    errorQuda("Environment variable CUDA_NIC_INTEROP is not set to 1\n");    
  }
#endif

  int deviceCount;
  cudaGetDeviceCount(&deviceCount);
  if (deviceCount == 0) {
    errorQuda("No devices supporting CUDA");
  }

  for(int i=0; i<deviceCount; i++) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, i);
    printfQuda("QUDA: Found device %d: %s\n", i, deviceProp.name);
  }

#ifdef QMP_COMMS
  int ndim;
  const int *dim;

  if ( QMP_is_initialized() != QMP_TRUE ) {
    errorQuda("QMP is not initialized");
  }
  num_QMP=QMP_get_number_of_nodes();
  rank_QMP=QMP_get_node_number();
  
  dev += rank_QMP % deviceCount;
  ndim = QMP_get_logical_number_of_dimensions();
  dim = QMP_get_logical_dimensions();

#elif defined(MPI_COMMS)

  comm_init();
  dev=comm_gpuid();

#else
  if (dev < 0) dev = deviceCount - 1;
#endif
  
  // Used for applying the gauge field boundary condition
  if( commCoords(3) == 0 ) qudaPt0=true;
  else qudaPt0=false;

  if( commCoords(3) == commDim(3)-1 ) qudaPtNm1=true;
  else qudaPtNm1=false;

  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, dev);
  if (deviceProp.major < 1) {
    errorQuda("Device %d does not support CUDA", dev);
  }

  
  printfQuda("QUDA: Using device %d: %s\n", dev, deviceProp.name);

  cudaSetDevice(dev);
#ifdef HAVE_NUMA
  if(numa_config_set){
    if(gpu_affinity[dev] >=0){
      printfQuda("Numa setting to cpu node %d\n", gpu_affinity[dev]);
      if(numa_run_on_node(gpu_affinity[dev]) != 0){
        printfQuda("Warning: Setting numa to cpu node %d failed\n", gpu_affinity[dev]);
      }
    }

  }
#endif

  initCache();
  quda::initBlas();
}
Example #16
0
/*
 * Class:     xerial_jnuma_NumaNative
 * Method:    runOnNode
 * Signature: (I)V
 */
JNIEXPORT void JNICALL Java_xerial_jnuma_NumaNative_runOnNode
(JNIEnv *env, jobject obj, jint node) {
    int ret = numa_run_on_node((int) node);
    if(ret != 0)
        throwException(env, obj, errno);
}
Example #17
0
  void TaskManager :: Loop(int thd)
  {
    /*
    static Timer tADD("add entry counter");
    static Timer tCASready1("spin-CAS ready tick1");
    static Timer tCASready2("spin-CAS ready tick2");
    static Timer tCASyield("spin-CAS yield");
    static Timer tCAS1("spin-CAS wait");
    static Timer texit("exit zone");
    static Timer tdec("decrement");
    */
    thread_id = thd;

    int thds = GetNumThreads();

    int mynode = num_nodes * thd/thds;

    NodeData & mynode_data = *(nodedata[mynode]);



    TaskInfo ti;
    ti.nthreads = thds;
    ti.thread_nr = thd;
    // ti.nnodes = num_nodes;
    // ti.node_nr = mynode;

      
#ifdef USE_NUMA
    numa_run_on_node (mynode);
#endif
    active_workers++;
    workers_on_node[mynode]++;
    int jobdone = 0;


#ifdef USE_MKL
    auto mkl_max = mkl_get_max_threads();
    mkl_set_num_threads_local(1);
#endif

    
    while (!done)
      {
        if (complete[mynode] > jobdone)
          jobdone = complete[mynode];

        if (jobnr == jobdone)
          {
            // RegionTracer t(ti.thread_nr, tCASyield, ti.task_nr);            
            if(sleep)
              this_thread::sleep_for(chrono::microseconds(sleep_usecs));
            else
              {
#ifdef WIN32
                this_thread::yield();
#else  // WIN32
                sched_yield();
#endif // WIN32
              }
            continue;
          }

        {
          // RegionTracer t(ti.thread_nr, tADD, ti.task_nr);

          // non-atomic fast check ...
          if ( (mynode_data.participate & 1) == 0) continue;

          int oldval = mynode_data.participate += 2;
          if ( (oldval & 1) == 0)
            { // job not active, going out again
              mynode_data.participate -= 2;
              continue;
            }
        }

        if (startup_function) (*startup_function)();
        
        IntRange mytasks = Range(int(ntasks)).Split (mynode, num_nodes);
          
        try
          {
            
            while (1)
              {
                if (mynode_data.start_cnt >= mytasks.Size()) break;
		int mytask = mynode_data.start_cnt.fetch_add(1, memory_order_relaxed);
                if (mytask >= mytasks.Size()) break;
                
                ti.task_nr = mytasks.First()+mytask;
                ti.ntasks = ntasks;
                
                {
                  RegionTracer t(ti.thread_nr, jobnr, RegionTracer::ID_JOB, ti.task_nr);
                  (*func)(ti);
                }
              }

          }
        catch (Exception e)
          {
            {
              // cout << "got exception in TM" << endl; 
              lock_guard<mutex> guard(copyex_mutex);
              delete ex;
              ex = new Exception (e);
              mynode_data.start_cnt = mytasks.Size();
            }
          }

#ifndef __MIC__
        atomic_thread_fence (memory_order_release);     
#endif // __MIC__

        if (cleanup_function) (*cleanup_function)();

        jobdone = jobnr;

        mynode_data.participate-=2;

	{
	  int oldpart = 1;
	  if (mynode_data.participate.compare_exchange_strong (oldpart, 0))
	    {
              if (jobdone < jobnr.load())
                { // reopen gate
                  mynode_data.participate |= 1;                  
                }
              else
                {
                  if (mynode != 0)
                    mynode_data.start_cnt = 0;
                  complete[mynode] = jobnr.load(); 
                }
	    }	      
	}
      }
    

#ifdef USE_MKL
    mkl_set_num_threads_local(mkl_max);
#endif

    workers_on_node[mynode]--;
    active_workers--;
  }