Example #1
static void check(unsigned nbnodes, unsigned nbcores, unsigned nbpus)
{
  unsigned depth;
  unsigned nb;
  unsigned long long total_memory;

  /* sanity checks */
  depth = hwloc_topology_get_depth(topology);
  assert(depth == 4);
  depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NODE);
  assert(depth == 1);
  depth = hwloc_get_type_depth(topology, HWLOC_OBJ_CORE);
  assert(depth == 2);
  depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
  assert(depth == 3);

  /* actual checks */
  nb = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE);
  assert(nb == nbnodes);
  nb = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE);
  assert(nb == nbcores);
  nb = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
  assert(nb == nbpus);
  total_memory = hwloc_get_root_obj(topology)->memory.total_memory;
  assert(total_memory == nbnodes * 1024ULL*1024*1024); /* synthetic topology puts 1GB per node */
}
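A minimal driver sketch, not part of the original test: it assumes the file-scope `topology` variable read by check() is declared earlier in the file, and uses a hypothetical synthetic description ("node:2 core:3 pu:4") that yields exactly the four-level Machine/NUMANode/Core/PU hierarchy and the 1GB-per-node memory the assertions expect.

static void test_synthetic(void)
{
  /* build a synthetic topology matching the shape check() expects */
  hwloc_topology_init(&topology);
  hwloc_topology_set_synthetic(topology, "node:2 core:3 pu:4");
  hwloc_topology_load(topology);
  check(2, 2*3, 2*3*4); /* 2 nodes, 6 cores, 24 PUs */
  hwloc_topology_destroy(topology);
}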
Example #2
int main(void)
{
  hwloc_topology_t topology;
  hwloc_obj_t obj;

  hwloc_topology_init(&topology);
  hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL);
  hwloc_topology_load(topology);

  printf("Found %d bridges\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_BRIDGE));
  obj = NULL;
  while ((obj = hwloc_get_next_bridge(topology, obj)) != NULL) {
    assert(obj->type == HWLOC_OBJ_BRIDGE);
    /* only host->pci and pci->pci bridge supported so far */
    if (obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST) {
      assert(obj->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI);
      printf(" Found host->PCI bridge for domain %04x bus %02x-%02x\n",
	     obj->attr->bridge.downstream.pci.domain,
	     obj->attr->bridge.downstream.pci.secondary_bus,
	     obj->attr->bridge.downstream.pci.subordinate_bus);
    } else {
      assert(obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
      assert(obj->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI);
      printf(" Found PCI->PCI bridge [%04x:%04x] for domain %04x bus %02x-%02x\n",
	     obj->attr->bridge.upstream.pci.vendor_id,
	     obj->attr->bridge.upstream.pci.device_id,
	     obj->attr->bridge.downstream.pci.domain,
	     obj->attr->bridge.downstream.pci.secondary_bus,
	     obj->attr->bridge.downstream.pci.subordinate_bus);
    }
  }

  printf("Found %d PCI devices\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PCI_DEVICE));
  obj = NULL;
  while ((obj = hwloc_get_next_pcidev(topology, obj)) != NULL) {
    assert(obj->type == HWLOC_OBJ_PCI_DEVICE);
    printf(" Found PCI device class %04x vendor %04x model %04x\n",
	   obj->attr->pcidev.class_id, obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id);
  }

  printf("Found %d OS devices\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_OS_DEVICE));
  obj = NULL;
  while ((obj = hwloc_get_next_osdev(topology, obj)) != NULL) {
    assert(obj->type == HWLOC_OBJ_OS_DEVICE);
    printf(" Found OS device %s subtype %d\n", obj->name, obj->attr->osdev.type);
  }

  assert(HWLOC_TYPE_DEPTH_BRIDGE == hwloc_get_type_depth(topology, HWLOC_OBJ_BRIDGE));
  assert(HWLOC_TYPE_DEPTH_PCI_DEVICE == hwloc_get_type_depth(topology, HWLOC_OBJ_PCI_DEVICE));
  assert(HWLOC_TYPE_DEPTH_OS_DEVICE == hwloc_get_type_depth(topology, HWLOC_OBJ_OS_DEVICE));
  assert(hwloc_compare_types(HWLOC_OBJ_BRIDGE, HWLOC_OBJ_PCI_DEVICE) < 0);
  assert(hwloc_compare_types(HWLOC_OBJ_BRIDGE, HWLOC_OBJ_OS_DEVICE) < 0);
  assert(hwloc_compare_types(HWLOC_OBJ_PCI_DEVICE, HWLOC_OBJ_OS_DEVICE) < 0);

  hwloc_topology_destroy(topology);

  return 0;
}
Example #3
int
hwloc_type_sscanf_as_depth(const char *string, hwloc_obj_type_t *typep,
			   hwloc_topology_t topology, int *depthp)
{
  union hwloc_obj_attr_u attr;
  hwloc_obj_type_t type;
  int depth;
  int err;

  err = hwloc_type_sscanf(string, &type, &attr, sizeof(attr));
  if (err < 0)
    return err;

  depth = hwloc_get_type_depth(topology, type);
  if (type == HWLOC_OBJ_GROUP
      && depth == HWLOC_TYPE_DEPTH_MULTIPLE
      && attr.group.depth != (unsigned)-1) {
    unsigned l;
    depth = HWLOC_TYPE_DEPTH_UNKNOWN;
    for(l=0; l<topology->nb_levels; l++) {
      if (topology->levels[l][0]->type == HWLOC_OBJ_GROUP
	  && topology->levels[l][0]->attr->group.depth == attr.group.depth) {
	depth = l;
	break;
      }
    }
  }

  if (typep)
    *typep = type;
  *depthp = (unsigned) depth;
  return 0;
}
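A minimal usage sketch for the entry point above, assuming `topo` is an already-loaded topology; the "Core" string and the show_depth() wrapper are illustrative only.

static void show_depth(hwloc_topology_t topo)
{
  hwloc_obj_type_t type;
  int depth;

  /* parse a type name and resolve it to a level depth in this topology */
  if (hwloc_type_sscanf_as_depth("Core", &type, topo, &depth) == 0 && depth >= 0)
    printf("%s objects are at depth %d\n", hwloc_obj_type_string(type), depth);
}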
static int
hwloc_solaris_get_sth_membind(hwloc_topology_t topology, idtype_t idtype, id_t id, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
{
  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
  int n;
  int i;

  if (depth < 0) {
    errno = ENOSYS;
    return -1;
  }

  hwloc_bitmap_zero(nodeset);
  n = hwloc_get_nbobjs_by_depth(topology, depth);

  for (i = 0; i < n; i++) {
    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i);
    lgrp_affinity_t aff = lgrp_affinity_get(idtype, id, obj->os_index);

    if (aff == LGRP_AFF_STRONG)
      hwloc_bitmap_set(nodeset, obj->os_index);
  }

  if (hwloc_bitmap_iszero(nodeset))
    hwloc_bitmap_copy(nodeset, hwloc_topology_get_complete_nodeset(topology));

  *policy = HWLOC_MEMBIND_BIND;
  return 0;
}
Example #5
static void
output_compute_pu_min_textwidth(struct lstopo_output *output)
{
  unsigned fontsize = output->fontsize;
  char text[64];
  int n;
  hwloc_topology_t topology = output->topology;
  hwloc_obj_t lastpu;

  if (!output->methods->textsize) {
    output->min_pu_textwidth = 0;
    return;
  }

  if (output->logical) {
    int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
    lastpu = hwloc_get_obj_by_depth(topology, depth, hwloc_get_nbobjs_by_depth(topology, depth)-1);
  } else {
    unsigned lastidx = hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(topology));
    lastpu = hwloc_get_pu_obj_by_os_index(topology, lastidx);
  }

  n = lstopo_obj_snprintf(output, text, sizeof(text), lastpu);
  output->min_pu_textwidth = get_textwidth(output, text, n, fontsize);
}
/*
 * Distribute cpus to the task using block distribution
 */
static int _task_cgroup_cpuset_dist_block(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, uint32_t nobj,
	slurmd_job_t *job, int bind_verbose, hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t i, pfirst,plast;
	uint32_t taskid = job->envtp->localid;
	int hwdepth;

	if (bind_verbose)
		info("task/cgroup: task[%u] using block distribution, "
		     "task_dist %u", taskid, job->task_dist);
	if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		pfirst = taskid *  job->cpus_per_task ;
		plast = pfirst + job->cpus_per_task - 1;
	} else {
		/* sockets or ldoms granularity */
		pfirst = taskid;
		plast = pfirst;
	}
	hwdepth = hwloc_get_type_depth(topology,hwtype);
	for (i = pfirst; i <= plast && i < nobj ; i++) {
		obj = hwloc_get_obj_by_depth(topology, hwdepth, (int)i);
		_add_cpuset(hwtype, req_hwtype, obj, taskid, bind_verbose,
			    cpuset);
	}
	return XCGROUP_SUCCESS;
}
    int PlatformTopology::num_domain(int domain_type) const
    {
        int result = 0;

        try {
            result = hwloc_get_nbobjs_by_type(m_topo, hwloc_domain(domain_type));
        }
        catch (const Exception &ex) {
            if (ex.err_value() != GEOPM_ERROR_INVALID) {
                throw ex;
            }
            if (domain_type == GEOPM_DOMAIN_TILE) {
                /// @todo  This assumes that tiles are just below
                ///        package in hwloc hierarchy.  If tiles are
                ///        at L2 cache, but processor has an L3 cache,
                ///        this may not be correct.
                int depth = hwloc_get_type_depth(m_topo, hwloc_domain(GEOPM_DOMAIN_PACKAGE)) + 1;
                result = hwloc_get_nbobjs_by_depth(m_topo, depth);
            }
            else {
                throw ex;
            }
            if (result == 0) {
                throw ex;
            }
        }
        return result;
    }
static int
hwloc_solaris_get_sth_cpubind(hwloc_topology_t topology, idtype_t idtype, id_t id, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
{
  processorid_t binding;
  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
  int n;
  int i;

  if (depth < 0) {
    errno = ENOSYS;
    return -1;
  }

  /* first check if processor_bind() was used to bind to a single processor rather than to an lgroup */
  if ( processor_bind(idtype, id, PBIND_QUERY, &binding) == 0 && binding != PBIND_NONE ) {
    hwloc_bitmap_only(hwloc_set, binding);
    return 0;
  }

  /* if not, check lgroups */
  hwloc_bitmap_zero(hwloc_set);
  n = hwloc_get_nbobjs_by_depth(topology, depth);
  for (i = 0; i < n; i++) {
    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i);
    lgrp_affinity_t aff = lgrp_affinity_get(idtype, id, obj->os_index);

    if (aff == LGRP_AFF_STRONG)
      hwloc_bitmap_or(hwloc_set, hwloc_set, obj->cpuset);
  }

  if (hwloc_bitmap_iszero(hwloc_set))
    hwloc_bitmap_copy(hwloc_set, hwloc_topology_get_complete_cpuset(topology));

  return 0;
}
int
hwloc_get_obj_depth_by_name(hwloc_topology_t topology, char * obj_name){
  /* find hwloc obj depth */
  hwloc_obj_type_t type;
  int depthattrp;
  hwloc_obj_cache_type_t cache_type;
  if(hwloc_obj_type_sscanf(obj_name,&type,&depthattrp,&cache_type,sizeof(cache_type))==-1){
    fprintf(stderr,"type \"%s\" was not recognized\n",obj_name);
    return -1;
  }
  int depth = hwloc_get_type_depth(topology,type);
  if(depth==HWLOC_TYPE_DEPTH_MULTIPLE){
    if(type==HWLOC_OBJ_CACHE){
      depth = hwloc_get_cache_type_depth(topology,depthattrp,cache_type);
      if(depth == HWLOC_TYPE_DEPTH_UNKNOWN){
	fprintf(stderr,"type %s cannot be found, level=%d\n",obj_name,depthattrp);
	return -1;
      }
      if(depth == HWLOC_TYPE_DEPTH_MULTIPLE){
	fprintf(stderr,"type %s multiple caches match for\n",obj_name);
	return -1;
      }
    }
    else{
      fprintf(stderr,"type \"%s\" isn't handled...\n",obj_name);
      return -1;
    }
  }
  return depth;
}
int compute_context_nbr(int * ctxnbr, int threadnbr, int verbose)
{
#if (defined WITH_STARPU && defined STARPU_CONTEXT)
  if (*ctxnbr == -1)
    {
      int depth;
      unsigned i, n;
      int ncpu_per_socket, nsocket, ncpu;
      hwloc_topology_t topology;
      hwloc_obj_t obj;
      /* Allocate and initialize topology object. */
      hwloc_topology_init(&topology);
      /* ... Optionally, put detection configuration here to ignore
         some objects types, define a synthetic topology, etc....
         The default is to detect all the objects of the machine that
         the caller is allowed to access.  See Configure Topology
         Detection. */
      /* Perform the topology detection. */
      hwloc_topology_load(topology);

      depth = hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET);
      if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) {
        /* number of sockets is unknown, let's say we have quad-cores... */
        ncpu_per_socket = 4;
      } else {
        ncpu    = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE);
        nsocket = hwloc_get_nbobjs_by_depth(topology, depth);
        ncpu_per_socket = ncpu/nsocket;
      }
      if (verbose > API_VERBOSE_NO)
        fprintf(stdout, "ncpu_per_socket %d\n", ncpu_per_socket);
      nsocket = threadnbr/ncpu_per_socket;
      if (threadnbr%ncpu_per_socket)
        nsocket ++;
      *ctxnbr = nsocket + 1;

      hwloc_topology_destroy(topology);
    }
  if (threadnbr + 1 < *ctxnbr)
    *ctxnbr = threadnbr+1;

  if (*ctxnbr == 1)
    *ctxnbr = 2;

  /* can't have more than STARPU_NMAX_SCHED_CTXS CTX */
  {
    PASTIX_INT nctx_bot = *ctxnbr - 1;
    while (*ctxnbr > STARPU_NMAX_SCHED_CTXS)
      {
        nctx_bot /= 2;
        *ctxnbr = nctx_bot + 1;
      }
  }
#endif /* WITH_STARPU */
  return 0;
}
magma_int_t magma_get_parallel_numthreads()
{
    // query number of cores
    magma_int_t ncores = 0;

#ifdef HAVE_HWLOC
    // hwloc gives physical cores, not hyperthreads
    // from http://stackoverflow.com/questions/12483399/getting-number-of-cores-not-ht-threads
    hwloc_topology_t topology;
    hwloc_topology_init( &topology );
    hwloc_topology_load( topology );
    magma_int_t depth = hwloc_get_type_depth( topology, HWLOC_OBJ_CORE );
    if (depth != HWLOC_TYPE_DEPTH_UNKNOWN) {
        ncores = hwloc_get_nbobjs_by_depth( topology, depth );
    }
    hwloc_topology_destroy( topology );
#endif

    if ( ncores == 0 ) {
        #ifdef _MSC_VER  // Windows
        SYSTEM_INFO sysinfo;
        GetSystemInfo( &sysinfo );
        ncores = sysinfo.dwNumberOfProcessors;
        #else
        ncores = sysconf( _SC_NPROCESSORS_ONLN );
        #endif
    }

    // query MAGMA_NUM_THREADS or OpenMP
    const char *threads_str = getenv("MAGMA_NUM_THREADS");
    magma_int_t threads = 0;
    if ( threads_str != NULL ) {
        char* endptr;
        threads = strtol( threads_str, &endptr, 10 );
        if ( threads < 1 || *endptr != '\0' ) {
            threads = 1;
            fprintf( stderr, "$MAGMA_NUM_THREADS='%s' is an invalid number; using %d thread.\n",
                     threads_str, (int) threads );
        }
    }
    else {
        #if defined(_OPENMP)
        #pragma omp parallel
        {
            threads = omp_get_num_threads();
        }
        #else
            threads = ncores;
        #endif
    }

    // limit to range [1, number of cores]
    threads = max( 1, min( ncores, threads ));
    return threads;
}
void 
check_map_event_obj(hwloc_topology_t topology, char * obj_name, char * event_name)
{
  PAPI_event_info_t info;
  int err, eventcode;
  hwloc_obj_t obj;
  unsigned depth = hwloc_get_obj_depth_by_name(topology,obj_name);
  if((err = PAPI_event_name_to_code(event_name,&eventcode))!=PAPI_OK){
    handle_error(err);
    fprintf(stderr,"could not get \"%s\" event infos\n",event_name);
  }
  
  PAPI_get_event_info(eventcode,&info);
  switch(info.location){
    case PAPI_LOCATION_UNCORE:
      if(depth >= hwloc_get_type_depth(topology, HWLOC_OBJ_CORE)){
	fprintf(stderr,"event %s is an UNCORE event and is actually mapped on %s which is under HWLOC_OBJ_CORE\n",
		event_name, obj_name);
      }
      break;      
    case PAPI_LOCATION_CPU:
      if(depth > hwloc_get_type_depth(topology, HWLOC_OBJ_MACHINE)){
	fprintf(stderr,"event %s is a CPU event and is actually mapped on %s which is strictly under HWLOC_OBJ_MACHINE\n",
		event_name, obj_name);
      }
      break;
    case PAPI_LOCATION_PACKAGE:
      if(depth > hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET)){
	fprintf(stderr,"event %s is a PACKAGE event and is actually mapped on %s which is strictly under HWLOC_OBJ_SOCKET\n",
		event_name, obj_name);
      }
      break;
    default:
      break;
    }
}
static int _get_ldom_sched_cpuset(hwloc_topology_t topology,
		hwloc_obj_type_t hwtype, hwloc_obj_type_t req_hwtype,
		uint32_t ldom, cpu_set_t *mask)
{
	hwloc_obj_t obj;
	hwloc_bitmap_t cpuset;
	int hwdepth;

	cpuset = hwloc_bitmap_alloc();
	hwdepth = hwloc_get_type_depth(topology, hwtype);
	obj = hwloc_get_obj_by_depth(topology, hwdepth, ldom);
	_add_hwloc_cpuset(hwtype, req_hwtype, obj, 0, 0, cpuset);
	hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset,
						 mask, sizeof(cpu_set_t));
	hwloc_bitmap_free(cpuset);
	return true;
}
int get_max_objs_inside_cpuset_by_type(hwloc_topology_t topology, hwloc_cpuset_t cpuset, hwloc_obj_type_t type){
    int depth = hwloc_get_type_depth(topology, type);
    if(depth == HWLOC_TYPE_DEPTH_UNKNOWN){
	fprintf(stderr, "Cannot find depth %s\n", hwloc_type_name(type));
	return -1;
    }
    if(depth == HWLOC_TYPE_DEPTH_MULTIPLE){
	hwloc_obj_t deepest_of_type = hwloc_get_obj_inside_cpuset_by_type(topology, cpuset, HWLOC_OBJ_PU,0);
	while(deepest_of_type !=NULL && deepest_of_type->type != type)
	    deepest_of_type = deepest_of_type->parent;
	if(deepest_of_type == NULL)
	    return -1;
	else 
	    depth = deepest_of_type->depth;
    }
    return hwloc_get_nbobjs_inside_cpuset_by_depth(topology, cpuset, depth);
}
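A hypothetical caller, not from the original project: assuming `topology` is already loaded, it counts how many cores the current process binding spans, letting the helper above resolve the depth (including the HWLOC_TYPE_DEPTH_MULTIPLE case) before counting.

static void count_bound_cores(hwloc_topology_t topology)
{
    hwloc_cpuset_t set = hwloc_bitmap_alloc();
    if (hwloc_get_cpubind(topology, set, HWLOC_CPUBIND_PROCESS) == 0) {
	int ncores = get_max_objs_inside_cpuset_by_type(topology, set, HWLOC_OBJ_CORE);
	if (ncores > 0)
	    printf("current binding spans %d core(s)\n", ncores);
    }
    hwloc_bitmap_free(set);
}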
Example #15
static int
hwloc_aix_get_sth_membind(hwloc_topology_t topology, rstype_t what, rsid_t who, hwloc_bitmap_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
{
  hwloc_bitmap_t hwloc_set;
  rsethandle_t rset;
  unsigned cpu, maxcpus;
  int res = -1;
  int depth, n, i;

  depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
  if (depth < 0) {
    errno = EXDEV;
    return -1;
  }
  n = hwloc_get_nbobjs_by_depth(topology, depth);

  rset = rs_alloc(RS_EMPTY);

  if (ra_getrset(what, who, 0, rset) == -1)
    goto out;

  hwloc_set = hwloc_bitmap_alloc();

  maxcpus = rs_getinfo(rset, R_MAXPROCS, 0);
  for (cpu = 0; cpu < maxcpus; cpu++)
    if (rs_op(RS_TESTRESOURCE, rset, NULL, R_PROCS, cpu) == 1)
      hwloc_bitmap_set(hwloc_set, cpu);
  hwloc_bitmap_and(hwloc_set, hwloc_set, hwloc_topology_get_complete_cpuset(topology));

  hwloc_bitmap_zero(nodeset);
  for (i = 0; i < n; i++) {
    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i);
    if (hwloc_bitmap_isincluded(obj->cpuset, hwloc_set))
      hwloc_bitmap_set(nodeset, obj->os_index);
  }

  hwloc_bitmap_free(hwloc_set);

  *policy = HWLOC_MEMBIND_BIND;
  res = 0;

out:
  rs_free(rset);
  return res;
}
Example #16
static int
dss_topo_init()
{
	hwloc_topology_init(&dss_topo);
	hwloc_topology_load(dss_topo);

	dss_core_depth = hwloc_get_type_depth(dss_topo, HWLOC_OBJ_CORE);
	dss_core_nr = hwloc_get_nbobjs_by_type(dss_topo, HWLOC_OBJ_CORE);
	dss_tgt_nr = dss_tgt_nr_get(dss_core_nr, nr_threads);

	if (dss_core_offset < 0 || dss_core_offset >= dss_core_nr) {
		D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), "
			"should within range [0, %d]", dss_core_offset,
			dss_core_nr - 1);
		return -DER_INVAL;
	}

	return 0;
}
Example #17
static int
hwloc_solaris_set_sth_membind(hwloc_topology_t topology, idtype_t idtype, id_t id, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
{
  int depth;
  int n, i;

  switch (policy) {
    case HWLOC_MEMBIND_DEFAULT:
    case HWLOC_MEMBIND_BIND:
      break;
    default:
      errno = ENOSYS;
      return -1;
  }

  if (flags & HWLOC_MEMBIND_NOCPUBIND) {
    errno = ENOSYS;
    return -1;
  }

  depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
  if (depth < 0) {
    errno = EXDEV;
    return -1;
  }
  n = hwloc_get_nbobjs_by_depth(topology, depth);

  for (i = 0; i < n; i++) {
    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i);
    if (hwloc_bitmap_isset(nodeset, obj->os_index)) {
      lgrp_affinity_set(idtype, id, obj->os_index, LGRP_AFF_STRONG);
    } else {
      if (flags & HWLOC_CPUBIND_STRICT)
	lgrp_affinity_set(idtype, id, obj->os_index, LGRP_AFF_NONE);
      else
	lgrp_affinity_set(idtype, id, obj->os_index, LGRP_AFF_WEAK);
    }
  }

  return 0;
}
Example #18
void Hwloc::getNumSockets(unsigned int &allowedNodes, int &numSockets, unsigned int &hwThreads) {
#ifdef HWLOC
   numSockets = 0;
   // Nodes that can be seen by hwloc
   allowedNodes = 0;
   // Hardware threads
   hwThreads = 0;

   int depth = hwloc_get_type_depth( _hwlocTopology, HWLOC_OBJ_NODE );
   // If there are NUMA nodes in this machine
   if ( depth != HWLOC_TYPE_DEPTH_UNKNOWN ) {
      //hwloc_const_cpuset_t cpuset = hwloc_topology_get_online_cpuset( _hwlocTopology );
      //allowedNodes = hwloc_get_nbobjs_inside_cpuset_by_type( _hwlocTopology, cpuset, HWLOC_OBJ_NODE );
      //hwThreads = hwloc_get_nbobjs_inside_cpuset_by_type( _hwlocTopology, cpuset, HWLOC_OBJ_PU );
      unsigned nodes = hwloc_get_nbobjs_by_depth( _hwlocTopology, depth );
      //hwloc_cpuset_t set = i

      // For each node, count how many hardware threads there are below.
      for ( unsigned nodeIdx = 0; nodeIdx < nodes; ++nodeIdx )
      {
         hwloc_obj_t node = hwloc_get_obj_by_depth( _hwlocTopology, depth, nodeIdx );
         int localThreads = hwloc_get_nbobjs_inside_cpuset_by_type( _hwlocTopology, node->cpuset, HWLOC_OBJ_PU );
         // Increase hw thread count
         hwThreads += localThreads;
         // If this node has hw threads beneath, increase the number of viewable nodes
         if ( localThreads > 0 ) ++allowedNodes;
      }
      numSockets = nodes;
   }
   // Otherwise, set it to 1
   else {
      allowedNodes = 1; 
      numSockets = 1;
   }
#else
   numSockets = 0;
   allowedNodes = 0;
#endif
}
Example #19
int main(void)
{
  hwloc_topology_t topology;
  hwloc_obj_t obj;
  unsigned indexes[5];
  float distances[5*5];
  unsigned depth;
  unsigned width;

  /* group 3 numa nodes as 1 group of 2 and 1 on the side */
  hwloc_topology_init(&topology);
  hwloc_topology_set_synthetic(topology, "node:3 pu:1");
  indexes[0] = 0; indexes[1] = 1; indexes[2] = 2;
  distances[0] = 1; distances[1] = 4; distances[2] = 4;
  distances[3] = 4; distances[4] = 1; distances[5] = 2;
  distances[6] = 4; distances[7] = 2; distances[8] = 1;
  hwloc_topology_set_distance_matrix(topology, HWLOC_OBJ_PU, 3, indexes, distances);
  hwloc_topology_load(topology);
  /* 1 group at depth 1 */
  depth = hwloc_get_type_depth(topology, HWLOC_OBJ_GROUP);
  assert(depth == 1);
  width = hwloc_get_nbobjs_by_depth(topology, depth);
  assert(width == 1);
  /* 3 nodes at depth 2 */
  depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
  assert(depth == 2);
  width = hwloc_get_nbobjs_by_depth(topology, depth);
  assert(width == 3);
  /* find the root obj */
  obj = hwloc_get_root_obj(topology);
  assert(obj->arity == 2);
  /* check its children */
  assert(obj->children[0]->type == HWLOC_OBJ_NUMANODE);
  assert(obj->children[0]->depth == 2);
  assert(obj->children[0]->arity == 1);
  assert(obj->children[1]->type == HWLOC_OBJ_GROUP);
  assert(obj->children[1]->depth == 1);
  assert(obj->children[1]->arity == 2);
  hwloc_topology_destroy(topology);

  /* group 5 packages as 2 groups of 2 and 1 on the side, all of them below a common node object */
  hwloc_topology_init(&topology);
  hwloc_topology_set_synthetic(topology, "node:1 pack:5 pu:1");
  indexes[0] = 0; indexes[1] = 1; indexes[2] = 2; indexes[3] = 3; indexes[4] = 4;
  distances[ 0] = 1; distances[ 1] = 2; distances[ 2] = 4; distances[ 3] = 4; distances[ 4] = 4;
  distances[ 5] = 2; distances[ 6] = 1; distances[ 7] = 4; distances[ 8] = 4; distances[ 9] = 4;
  distances[10] = 4; distances[11] = 4; distances[12] = 1; distances[13] = 4; distances[14] = 4;
  distances[15] = 4; distances[16] = 4; distances[17] = 4; distances[18] = 1; distances[19] = 2;
  distances[20] = 4; distances[21] = 4; distances[22] = 4; distances[23] = 2; distances[24] = 1;
  hwloc_topology_set_distance_matrix(topology, HWLOC_OBJ_PACKAGE, 5, indexes, distances);
  hwloc_topology_load(topology);
  /* 1 node at depth 1 */
  depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
  assert(depth == 1);
  width = hwloc_get_nbobjs_by_depth(topology, depth);
  assert(width == 1);
  /* 2 groups at depth 2 */
  depth = hwloc_get_type_depth(topology, HWLOC_OBJ_GROUP);
  assert(depth == 2);
  width = hwloc_get_nbobjs_by_depth(topology, depth);
  assert(width == 2);
  /* 5 packages at depth 3 */
  depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PACKAGE);
  assert(depth == 3);
  width = hwloc_get_nbobjs_by_depth(topology, depth);
  assert(width == 5);
  /* find the node obj */
  obj = hwloc_get_root_obj(topology);
  assert(obj->arity == 1);
  obj = obj->children[0];
  assert(obj->type == HWLOC_OBJ_NUMANODE);
  assert(obj->arity == 3);
  /* check its children */
  assert(obj->children[0]->type == HWLOC_OBJ_GROUP);
  assert(obj->children[0]->depth == 2);
  assert(obj->children[0]->arity == 2);
  assert(obj->children[1]->type == HWLOC_OBJ_PACKAGE);
  assert(obj->children[1]->depth == 3);
  assert(obj->children[1]->arity == 1);
  assert(obj->children[2]->type == HWLOC_OBJ_GROUP);
  assert(obj->children[2]->depth == 2);
  assert(obj->children[2]->arity == 2);
  hwloc_topology_destroy(topology);

  return 0;
}
int orte_rmaps_base_compute_bindings(orte_job_t *jdata)
{
    hwloc_obj_type_t hwb, hwm;
    unsigned clvl=0, clvm=0;
    opal_binding_policy_t bind;
    orte_mapping_policy_t map;
    orte_node_t *node;
    int i, rc;
    struct hwloc_topology_support *support;
    bool force_down = false;
    hwloc_cpuset_t totalcpuset;
    int bind_depth, map_depth;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: compute bindings for job %s with policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));

    map = ORTE_GET_MAPPING_POLICY(jdata->map->mapping);
    bind = OPAL_GET_BINDING_POLICY(jdata->map->binding);

    if (ORTE_MAPPING_BYUSER == map) {
        /* user specified binding by rankfile - nothing for us to do */
        return ORTE_SUCCESS;
    }

    if (OPAL_BIND_TO_CPUSET == bind) {
        int rc;
        /* cpuset was given - setup the bindings */
        if (ORTE_SUCCESS != (rc = bind_to_cpuset(jdata))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    if (OPAL_BIND_TO_NONE == bind) {
        /* no binding requested */
        return ORTE_SUCCESS;
    }

    if (OPAL_BIND_TO_BOARD == bind) {
        /* doesn't do anything at this time */
        return ORTE_SUCCESS;
    }

    /* binding requested - convert the binding level to the hwloc obj type */
    switch (bind) {
    case OPAL_BIND_TO_NUMA:
        hwb = HWLOC_OBJ_NODE;
        break;
    case OPAL_BIND_TO_SOCKET:
        hwb = HWLOC_OBJ_SOCKET;
        break;
    case OPAL_BIND_TO_L3CACHE:
        hwb = HWLOC_OBJ_CACHE;
        clvl = 3;
        break;
    case OPAL_BIND_TO_L2CACHE:
        hwb = HWLOC_OBJ_CACHE;
        clvl = 2;
        break;
    case OPAL_BIND_TO_L1CACHE:
        hwb = HWLOC_OBJ_CACHE;
        clvl = 1;
        break;
    case OPAL_BIND_TO_CORE:
        hwb = HWLOC_OBJ_CORE;
        break;
    case OPAL_BIND_TO_HWTHREAD:
        hwb = HWLOC_OBJ_PU;
        break;
    default:
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* do the same for the mapping policy */
    switch (map) {
    case ORTE_MAPPING_BYNODE:
    case ORTE_MAPPING_BYSLOT:
    case ORTE_MAPPING_SEQ:
        hwm = HWLOC_OBJ_MACHINE;
        break;
    case ORTE_MAPPING_BYDIST:
    case ORTE_MAPPING_BYNUMA:
        hwm = HWLOC_OBJ_NODE;
        break;
    case ORTE_MAPPING_BYSOCKET:
        hwm = HWLOC_OBJ_SOCKET;
        break;
    case ORTE_MAPPING_BYL3CACHE:
        hwm = HWLOC_OBJ_CACHE;
        clvm = 3;
        break;
    case ORTE_MAPPING_BYL2CACHE:
        hwm = HWLOC_OBJ_CACHE;
        clvm = 2;
        break;
    case ORTE_MAPPING_BYL1CACHE:
        hwm = HWLOC_OBJ_CACHE;
        clvm = 1;
        break;
    case ORTE_MAPPING_BYCORE:
        hwm = HWLOC_OBJ_CORE;
        break;
    case ORTE_MAPPING_BYHWTHREAD:
        hwm = HWLOC_OBJ_PU;
        break;
    default:
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    /* if the job was mapped by the corresponding target, then
     * we bind in place
     *
     * otherwise, we have to bind either up or down the hwloc
     * tree. If we are binding upwards (e.g., mapped to hwthread
     * but binding to core), then we just climb the tree to find
     * the first matching object.
     *
     * if we are binding downwards (e.g., mapped to node and bind
     * to core), then we have to do a round-robin assignment of
     * procs to the resources below.
     */

    if (ORTE_MAPPING_BYDIST == map) {
        int rc = ORTE_SUCCESS;
        if (OPAL_BIND_TO_NUMA == bind) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps: bindings for job %s - dist to numa",
                                ORTE_JOBID_PRINT(jdata->jobid));
            if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) {
                ORTE_ERROR_LOG(rc);
            }
        } else if (OPAL_BIND_TO_NUMA < bind) {
            /* bind every proc downwards */
            force_down = true;
            goto execute;
        }
        /* if the binding policy is less than numa, then we are unbound - so
         * just ignore this and return (should have been caught in prior
         * tests anyway as only options meeting that criteria are "none"
         * and "board")
         */
        return rc;
    }

    /* now deal with the remaining binding policies based on hardware */
    if (bind == map) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps: bindings for job %s - bind in place",
                            ORTE_JOBID_PRINT(jdata->jobid));
        if (ORTE_SUCCESS != (rc = bind_in_place(jdata, hwb, clvl))) {
            ORTE_ERROR_LOG(rc);
        }
        return rc;
    }

    /* we need to handle the remaining binding options on a per-node
     * basis because different nodes could potentially have different
     * topologies, with different relative depths for the two levels
     */
 execute:
    /* initialize */
    totalcpuset = hwloc_bitmap_alloc();

    for (i=0; i < jdata->map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(jdata->map->binding) ||
                    !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                hwloc_bitmap_free(totalcpuset);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability - don't warn if the user didn't
             * specifically request binding
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind &&
                OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    hwloc_bitmap_free(totalcpuset);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* some systems do not report cores, and so we can get a situation where our
         * default binding policy will fail for no necessary reason. So if we are
         * computing a binding due to our default policy, and no cores are found
         * on this node, just silently skip it - we will not bind
         */
        if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) &&
            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "Unable to bind-to core by default on node %s as no cores detected",
                                node->name);
            continue;
        }

        /* we share topologies in order
         * to save space, so we need to reset the usage info to reflect
         * our own current state
         */
        reset_usage(node, jdata->jobid);

        if (force_down) {
            if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        } else {
            /* determine the relative depth on this node */
            if (HWLOC_OBJ_CACHE == hwb) {
                /* must use a unique function because blasted hwloc
                 * just doesn't deal with caches very well...sigh
                 */
                bind_depth = hwloc_get_cache_type_depth(node->topology, clvl, -1);
            } else {
                bind_depth = hwloc_get_type_depth(node->topology, hwb);
            }
            if (0 > bind_depth) {
                /* didn't find such an object */
                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
                               true, hwloc_obj_type_string(hwb), node->name);
                return ORTE_ERR_SILENT;
            }
            if (HWLOC_OBJ_CACHE == hwm) {
                /* must use a unique function because blasted hwloc
                 * just doesn't deal with caches very well...sigh
                 */
                map_depth = hwloc_get_cache_type_depth(node->topology, clvm, -1);
            } else {
                map_depth = hwloc_get_type_depth(node->topology, hwm);
            }
            if (0 > map_depth) {
                /* didn't find such an object */
                orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects",
                               true, hwloc_obj_type_string(hwm), node->name);
                return ORTE_ERR_SILENT;
            }
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s bind_depth: %d map_depth %d",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                bind_depth, map_depth);
            if (bind_depth > map_depth) {
                if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            } else {
                if (ORTE_SUCCESS != (rc = bind_upwards(jdata, node, hwb, clvl))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            }
        }
    }

    return ORTE_SUCCESS;
}
Example #21
int
pocl_topology_detect_device_info(cl_device_id device)
{
  hwloc_topology_t pocl_topology;
  int ret = 0;

#ifdef HWLOC_API_2
  if (hwloc_get_api_version () < 0x20000)
    POCL_MSG_ERR ("pocl was compiled against libhwloc 2.x but is"
                  "actually running against libhwloc 1.x \n");
#else
  if (hwloc_get_api_version () >= 0x20000)
    POCL_MSG_ERR ("pocl was compiled against libhwloc 1.x but is"
                  "actually running against libhwloc 2.x \n");
#endif

  /*
   * hwloc's OpenCL backend causes problems at the initialization stage
   * because it reloads libpocl.so via the ICD loader.
   *
   * See: https://github.com/pocl/pocl/issues/261
   *
   * The only trick to stop hwloc from initializing the OpenCL plugin
   * I could find is to point the plugin search path to a place where there
   * are no plugins to be found.
   */
  setenv ("HWLOC_PLUGINS_PATH", "/dev/null", 1);

  ret = hwloc_topology_init (&pocl_topology);
  if (ret == -1)
  {
    POCL_MSG_ERR ("Cannot initialize the topology.\n");
    return ret;
  }

#ifdef HWLOC_API_2
  hwloc_topology_set_io_types_filter(pocl_topology, HWLOC_TYPE_FILTER_KEEP_NONE);
  hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_SYSTEM, HWLOC_TYPE_FILTER_KEEP_NONE);
  hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_GROUP, HWLOC_TYPE_FILTER_KEEP_NONE);
  hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_BRIDGE, HWLOC_TYPE_FILTER_KEEP_NONE);
  hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_MISC, HWLOC_TYPE_FILTER_KEEP_NONE);
  hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_NONE);
  hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_OS_DEVICE, HWLOC_TYPE_FILTER_KEEP_NONE);
#else
  hwloc_topology_ignore_type (pocl_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_IO);
  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_SYSTEM);
  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_GROUP);
  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_BRIDGE);
  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_MISC);
  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_PCI_DEVICE);
  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_OS_DEVICE);
#endif

  ret = hwloc_topology_load (pocl_topology);
  if (ret == -1)
  {
    POCL_MSG_ERR ("Cannot load the topology.\n");
    goto exit_destroy;
  }

#ifdef HWLOC_API_2
  device->global_mem_size =
      hwloc_get_root_obj(pocl_topology)->total_memory;
#else
  device->global_mem_size =
      hwloc_get_root_obj(pocl_topology)->memory.total_memory;
#endif

  // Try to get the number of CPU cores from topology
  int depth = hwloc_get_type_depth(pocl_topology, HWLOC_OBJ_PU);
  if(depth != HWLOC_TYPE_DEPTH_UNKNOWN)
    device->max_compute_units = hwloc_get_nbobjs_by_depth(pocl_topology, depth);

  /* Find information about global memory cache by looking at the first
   * cache covering the first PU */
  do {
      size_t cache_size = 0, cacheline_size = 0;

      hwloc_obj_t core
          = hwloc_get_next_obj_by_type (pocl_topology, HWLOC_OBJ_CORE, NULL);
      if (core)
        {
          hwloc_obj_t cache
              = hwloc_get_shared_cache_covering_obj (pocl_topology, core);
          if ((cache) && (cache->attr))
            {
              cacheline_size = cache->attr->cache.linesize;
              cache_size = cache->attr->cache.size;
            }
          else
            core = NULL; /* fallback to L1 cache size */
        }

      hwloc_obj_t pu
          = hwloc_get_next_obj_by_type (pocl_topology, HWLOC_OBJ_PU, NULL);
      if (!core && pu)
        {
          hwloc_obj_t cache
              = hwloc_get_shared_cache_covering_obj (pocl_topology, pu);
          if ((cache) && (cache->attr))
            {
              cacheline_size = cache->attr->cache.linesize;
              cache_size = cache->attr->cache.size;
            }
        }

      if (!cache_size || !cacheline_size)
        break;

      device->global_mem_cache_type
          = 0x2; // CL_READ_WRITE_CACHE, without including all of CL/cl.h
      device->global_mem_cacheline_size = cacheline_size;
      device->global_mem_cache_size = cache_size;
  } while (0);

  // Destroy topology object and return
exit_destroy:
  hwloc_topology_destroy (pocl_topology);
  return ret;

}
/* user to have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(slurmd_job_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	uint32_t i;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;

	uint32_t pfirst,plast;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus = jntasks * job->cpus_per_task;
	pid_t    pid = job->envtp->task_pid;

	cpu_bind_type_t bind_type;
	int verbose = 0;

	hwloc_topology_t topology;
#if HWLOC_API_VERSION <= 0x00010000
	hwloc_cpuset_t cpuset,ct;
#else
	hwloc_bitmap_t cpuset,ct;
#endif
	hwloc_obj_t obj;
	struct hwloc_obj *pobj;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;
	int hwdepth;

	size_t tssize;
	cpu_set_t ts;

	bind_type = job->cpu_bind_type ;
	if (conf->task_plugin_param & CPU_BIND_VERBOSE ||
	    bind_type & CPU_BIND_VERBOSE)
		verbose = 1 ;

	if (bind_type & CPU_BIND_NONE) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding",taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding",taskid);
		req_hwtype = HWLOC_OBJ_SOCKET;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding",taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);
#if HWLOC_API_VERSION <= 0x00010000
	cpuset = hwloc_cpuset_alloc() ;
#else
	cpuset = hwloc_bitmap_alloc() ;
#endif

	/*
	 * Perform the topology detection. It will only get allowed PUs.
	 * Detect at the same time the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available as with hyperthread support, ntasks-per-core
	 * param can let us have access to more threads per core for each
	 * task
	 * Revert back to machine granularity if no finer-grained granularity
	 * matching the request is found. This will result in no affinity
	 * applied.
	 * The detected granularity will be used to find where to best place
	 * the task, then the cpu_bind option will be used to relax the
	 * affinity constraint and use more PUs. (i.e. use a core granularity
	 * to dispatch the tasks across the sockets and then provide access
	 * to each task to the cores of its socket.)
	 */
	hwloc_topology_load(topology);
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_NODE);
	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks &&
	     bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = HWLOC_OBJ_SOCKET;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources). So there is no guarantee that each ldom will be populated
	 * with usable sockets. So add a simple check that at least ensure that
	 * we have as many sockets as ldoms before moving to ldoms granularity
	 */
	if (nldoms >= jntasks &&
	     nsockets >= nldoms &&
	     bind_type & CPU_BIND_TO_LDOMS) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * Perform a block binding on the detected object respecting the
	 * granularity.
	 * If not enough objects to do the job, revert to no affinity mode
	 */
	if (hwloc_compare_types(hwtype,HWLOC_OBJ_MACHINE) == 0) {

		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity",taskid,hwloc_obj_type_string(hwtype));

	} else if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0 &&
		    jnpus > nobj) {

		info("task/cgroup: task[%u] not enough %s objects, disabling "
		     "affinity",taskid,hwloc_obj_type_string(hwtype));

	} else {

		if (verbose) {
			info("task/cgroup: task[%u] using %s granularity",
			     taskid,hwloc_obj_type_string(hwtype));
		}
		if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) {
			/* cores or threads granularity */
			pfirst = taskid *  job->cpus_per_task ;
			plast = pfirst + job->cpus_per_task - 1;
		} else {
			/* sockets or ldoms granularity */
			pfirst = taskid;
			plast = pfirst;
		}

		hwdepth = hwloc_get_type_depth(topology,hwtype);
		for (i = pfirst; i <= plast && i < nobj ; i++) {
			obj = hwloc_get_obj_by_depth(topology,hwdepth,(int)i);

			/* if the requested binding overlaps the granularity */
			/* use the ancestor cpuset instead of the object one */
			if (hwloc_compare_types(hwtype,req_hwtype) > 0) {

				/* Get the parent object of req_hwtype or the */
				/* one just above if not found (meaning of >0)*/
				/* (useful for ldoms binding with !NUMA nodes)*/
				pobj = obj->parent;
				while (pobj != NULL &&
					hwloc_compare_types(pobj->type,
							    req_hwtype) > 0)
					pobj = pobj->parent;

				if (pobj != NULL) {
					if (verbose)
						info("task/cgroup: task[%u] "
						     "higher level %s found",
						     taskid,
						     hwloc_obj_type_string(
							     pobj->type));
#if HWLOC_API_VERSION <= 0x00010000
					ct = hwloc_cpuset_dup(pobj->
							      allowed_cpuset);
					hwloc_cpuset_or(cpuset,cpuset,ct);
					hwloc_cpuset_free(ct);
#else
					ct = hwloc_bitmap_dup(pobj->
							      allowed_cpuset);
					hwloc_bitmap_or(cpuset,cpuset,ct);
					hwloc_bitmap_free(ct);
#endif
				} else {
					/* should not be executed */
					if (verbose)
						info("task/cgroup: task[%u] "
						     "no higher level found",
						     taskid);
#if HWLOC_API_VERSION <= 0x00010000
					ct = hwloc_cpuset_dup(obj->
							      allowed_cpuset);
					hwloc_cpuset_or(cpuset,cpuset,ct);
					hwloc_cpuset_free(ct);
#else
					ct = hwloc_bitmap_dup(obj->
							      allowed_cpuset);
					hwloc_bitmap_or(cpuset,cpuset,ct);
					hwloc_bitmap_free(ct);
#endif
				}

			} else {
#if HWLOC_API_VERSION <= 0x00010000
				ct = hwloc_cpuset_dup(obj->allowed_cpuset);
				hwloc_cpuset_or(cpuset,cpuset,ct);
				hwloc_cpuset_free(ct);
#else
				ct = hwloc_bitmap_dup(obj->allowed_cpuset);
				hwloc_bitmap_or(cpuset,cpuset,ct);
				hwloc_bitmap_free(ct);
#endif
			}
		}

		char *str;
#if HWLOC_API_VERSION <= 0x00010000
		hwloc_cpuset_asprintf(&str,cpuset);
#else
		hwloc_bitmap_asprintf(&str,cpuset);
#endif
		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology,cpuset,
							  &ts,tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if (sched_setaffinity(pid,tssize,&ts)) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'",taskid,str);
				fstatus = SLURM_ERROR;
			} else if (verbose) {
				info("task/cgroup: task[%u] taskset '%s' is set"
				     ,taskid,str);
			}
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'",taskid,str);
			fstatus = SLURM_ERROR;
		}
		free(str);

	}

	/* Destroy hwloc objects */
#if HWLOC_API_VERSION <= 0x00010000
	hwloc_cpuset_free(cpuset);
#else
	hwloc_bitmap_free(cpuset);
#endif
	hwloc_topology_destroy(topology);

	return fstatus;
#endif

}
static int bind_in_place(orte_job_t *jdata,
                         hwloc_obj_type_t target,
                         unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node downwards
     * until we find an unused object of type target - and then bind
     * the process to that target
     */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind in place for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(map->binding) ||
                    !OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability - don't warn if the user didn't
             * specifically request binding
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind &&
                OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* some systems do not report cores, and so we can get a situation where our
         * default binding policy will fail for no necessary reason. So if we are
         * computing a binding due to our default policy, and no cores are found
         * on this node, just silently skip it - we will not bind
         */
        if (!OPAL_BINDING_POLICY_IS_SET(map->binding) &&
            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "Unable to bind-to core by default on node %s as no cores detected",
                                node->name);
            continue;
        }

        /* we share topologies in order
         * to save space, so we need to reset the usage info to reflect
         * our own current state
         */
        reset_usage(node, jdata->jobid);

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* ignore procs that have already been bound - should
             * never happen, but safer
             */
            if (NULL != proc->cpu_bitmap) {
                continue;
            }
            /* get the index of this location */
            if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, proc->locale, OPAL_HWLOC_AVAILABLE))) {
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                return ORTE_ERR_SILENT;
            }
            /* track the number bound */
            data = (opal_hwloc_obj_data_t*)proc->locale->userdata;
            data->num_bound++;
             opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "BINDING PROC %s TO %s NUMBER %u",
                                ORTE_NAME_PRINT(&proc->name),
                                hwloc_obj_type_string(proc->locale->type), idx);
            /* get the number of cpus under this location */
            if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, proc->locale))) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* error out if adding a proc would cause overload and that wasn't allowed,
             * and it wasn't a default binding policy (i.e., the user requested it)
             */
            if (ncpus < data->num_bound &&
                !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) &&
                OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                               opal_hwloc_base_print_binding(map->binding), node->name,
                               data->num_bound, ncpus);
                return ORTE_ERR_SILENT;
            }
            /* bind the proc here */
            cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale);
            hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus);
            /* record the location */
            proc->bind_location = proc->locale;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name),
                                proc->cpu_bitmap,
                                hwloc_obj_type_string(proc->locale->type),
                                idx, node->name);
        }
    }

    return ORTE_SUCCESS;
}
Example #24
static int bind_in_place(orte_job_t *jdata,
                         hwloc_obj_type_t target,
                         unsigned cache_level)
{
    /* traverse the hwloc topology tree on each node downwards
     * until we find an unused object of type target - and then bind
     * the process to that target
     */
    int i, j;
    orte_job_map_t *map;
    orte_node_t *node;
    orte_proc_t *proc;
    hwloc_cpuset_t cpus;
    unsigned int idx, ncpus;
    struct hwloc_topology_support *support;
    opal_hwloc_obj_data_t *data;
    hwloc_obj_t locale, sib;
    char *cpu_bitmap;
    bool found;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps: bind in place for job %s with bindings %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        opal_hwloc_base_print_binding(jdata->map->binding));
    /* initialize */
    map = jdata->map;

    for (i=0; i < map->nodes->size; i++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            continue;
        }
        if (!orte_do_not_launch) {
            /* if we don't want to launch, then we are just testing the system,
             * so ignore questions about support capabilities
             */
            support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology);
            /* check if topology supports cpubind - have to be careful here
             * as Linux doesn't currently support thread-level binding. This
             * may change in the future, though, and it isn't clear how hwloc
             * interprets the current behavior. So check both flags to be sure.
             */
            if (!support->cpubind->set_thisproc_cpubind &&
                !support->cpubind->set_thisthread_cpubind) {
                if (!OPAL_BINDING_REQUIRED(map->binding) ||
                    !OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                    /* we are not required to bind, so ignore this */
                    continue;
                }
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* check if topology supports membind - have to be careful here
             * as hwloc treats this differently than I (at least) would have
             * expected. Per hwloc, Linux memory binding is at the thread,
             * and not process, level. Thus, hwloc sets the "thisproc" flag
             * to "false" on all Linux systems, and uses the "thisthread" flag
             * to indicate binding capability - don't warn if the user didn't
             * specifically request binding
             */
            if (!support->membind->set_thisproc_membind &&
                !support->membind->set_thisthread_membind &&
                OPAL_BINDING_POLICY_IS_SET(map->binding)) {
                if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name);
                    membind_warned = true;
                } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) {
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name);
                    return ORTE_ERR_SILENT;
                }
            }
        }

        /* some systems do not report cores, and so we can get a situation where our
         * default binding policy will fail for no necessary reason. So if we are
         * computing a binding due to our default policy, and no cores are found
         * on this node, just silently skip it - we will not bind
         */
        if (!OPAL_BINDING_POLICY_IS_SET(map->binding) &&
            HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "Unable to bind-to core by default on node %s as no cores detected",
                                node->name);
            continue;
        }

        /* we share topologies in order
         * to save space, so we need to reset the usage info to reflect
         * our own current state
         */
        reset_usage(node, jdata->jobid);

        /* cycle thru the procs */
        for (j=0; j < node->procs->size; j++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                continue;
            }
            /* ignore procs from other jobs */
            if (proc->name.jobid != jdata->jobid) {
                continue;
            }
            /* bozo check */
            if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name));
                return ORTE_ERR_SILENT;
            }
            /* get the index of this location */
            if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, locale, OPAL_HWLOC_AVAILABLE))) {
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                return ORTE_ERR_SILENT;
            }
            data = (opal_hwloc_obj_data_t*)locale->userdata;
            /* get the number of cpus under this location */
            if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, locale))) {
                orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name);
                return ORTE_ERR_SILENT;
            }
            /* if we don't have enough cpus to support this additional proc, try
             * shifting the location to a cousin that can support it - the important
             * thing is that we maintain the same level in the topology */
            if (ncpus < (data->num_bound+1)) {
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "%s bind_in_place: searching right",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                sib = locale;
                found = false;
                while (NULL != (sib = sib->next_cousin)) {
                    data = (opal_hwloc_obj_data_t*)sib->userdata;
                    ncpus = opal_hwloc_base_get_npus(node->topology, sib);
                    if (data->num_bound < ncpus) {
                        found = true;
                        locale = sib;
                        break;
                    }
                }
                if (!found) {
                    /* try the other direction */
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "%s bind_in_place: searching left",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    sib = locale;
                    while (NULL != (sib = sib->prev_cousin)) {
                        data = (opal_hwloc_obj_data_t*)sib->userdata;
                        ncpus = opal_hwloc_base_get_npus(node->topology, sib);
                        if (data->num_bound < ncpus) {
                            found = true;
                            locale = sib;
                            break;
                        }
                    }
                }
                if (!found) {
                    /* no place to put this - see if overload is allowed */
                    if (!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) {
                        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
                            /* if the user specified a binding policy, then we cannot meet
                             * it since overload isn't allowed, so error out - have the
                             * message indicate that setting overload allowed will remove
                             * this restriction */
                            orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true,
                                           opal_hwloc_base_print_binding(map->binding), node->name,
                                           data->num_bound, ncpus);
                            return ORTE_ERR_SILENT;
                        } else {
                            /* if we have the default binding policy, then just don't bind */
                            OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE);
                            unbind_procs(jdata);
                            return ORTE_SUCCESS;
                        }
                    }
                }
            }
            /* track the number bound */
            data = (opal_hwloc_obj_data_t*)locale->userdata;  // just in case it changed
            data->num_bound++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "BINDING PROC %s TO %s NUMBER %u",
                                ORTE_NAME_PRINT(&proc->name),
                                hwloc_obj_type_string(locale->type), idx);
            /* bind the proc here */
            cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
            hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus);
            orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
            /* update the location, in case it changed */
            orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, locale, OPAL_PTR);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "%s BOUND PROC %s TO %s[%s:%u] on node %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&proc->name),
                                cpu_bitmap, hwloc_obj_type_string(locale->type),
                                idx, node->name);
            if (NULL != cpu_bitmap) {
                free(cpu_bitmap);
            }
        }
    }

    return ORTE_SUCCESS;
}
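
bind_in_place() above stays at a single topology level by following hwloc's next_cousin/prev_cousin links. A minimal standalone sketch of that kind of traversal (not the ORTE code; it simply walks the core level and prints how many PUs each object holds):

#include <stdio.h>
#include <hwloc.h>

int main(void)
{
    hwloc_topology_t topology;
    hwloc_obj_t obj;
    int depth;

    hwloc_topology_init(&topology);
    hwloc_topology_load(topology);

    /* cores, or the closest level below if cores are not reported */
    depth = hwloc_get_type_or_below_depth(topology, HWLOC_OBJ_CORE);

    /* walk every object of that level through the cousin links */
    for (obj = hwloc_get_obj_by_depth(topology, depth, 0);
         obj != NULL;
         obj = obj->next_cousin) {
        printf("%s L#%u holds %d PU(s)\n",
               hwloc_obj_type_string(obj->type), obj->logical_index,
               hwloc_get_nbobjs_inside_cpuset_by_type(topology, obj->cpuset,
                                                      HWLOC_OBJ_PU));
    }

    hwloc_topology_destroy(topology);
    return 0;
}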
Exemple #25
0
/* set affinity with sched_setaffinity so that the user does not
 * have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(slurmd_job_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	hwloc_obj_type_t socket_or_node;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus = jntasks * job->cpus_per_task;
	pid_t    pid = job->envtp->task_pid;

	cpu_bind_type_t bind_type;
	int bind_verbose = 0;

	hwloc_topology_t topology;
	hwloc_bitmap_t cpuset;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;

	size_t tssize;
	cpu_set_t ts;

	bind_type = job->cpu_bind_type ;
	if (conf->task_plugin_param & CPU_BIND_VERBOSE ||
	    bind_type & CPU_BIND_VERBOSE)
		bind_verbose = 1 ;

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);

	cpuset = hwloc_bitmap_alloc();

	hwloc_topology_load(topology);
	if ( hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
	     hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET) ) {
		/* One socket contains multiple NUMA-nodes
		 * like AMD Opteron 6000 series etc.
		 * In such case, use NUMA-node instead of socket. */
		socket_or_node = HWLOC_OBJ_NODE;
	} else {
		socket_or_node = HWLOC_OBJ_SOCKET;
	}

	if (bind_type & CPU_BIND_NONE) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		/* release hwloc resources before the early return */
		hwloc_bitmap_free(cpuset);
		hwloc_topology_destroy(topology);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding",taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding",taskid);
		req_hwtype = socket_or_node;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding",taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (bind_verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/*
	 * Perform the topology detection. It will only see the allowed PUs.
	 * At the same time, detect the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available: with hyperthread support, the ntasks-per-core
	 * parameter can give each task access to more threads per core.
	 * Revert to machine granularity if no finer granularity matching the
	 * request is found; this results in no affinity being applied.
	 * The detected granularity will be used to find where to best place
	 * the task, then the cpu_bind option will be used to relax the
	 * affinity constraint and use more PUs (i.e. use a core granularity
	 * to dispatch the tasks across the sockets and then give each task
	 * access to the cores of its socket).
	 */
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       socket_or_node);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_NODE);

	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks &&
	    bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = socket_or_node;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources). So there is no guarantee that each ldom will be populated
	 * with usable sockets. Add a simple check to at least ensure that we
	 * have as many sockets as ldoms before moving to ldom granularity.
	 */
	if (nldoms >= jntasks &&
	    nsockets >= nldoms &&
	    bind_type & CPU_BIND_TO_LDOMS) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * Bind the detected object to the taskid, respecting the
	 * granularity, using the designated or default distribution
	 * method (block or cyclic).
	 * If not enough objects to do the job, revert to no affinity mode
	 */
	if (hwloc_compare_types(hwtype,HWLOC_OBJ_MACHINE) == 0) {

		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity",taskid,hwloc_obj_type_string(hwtype));

	} else if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0 &&
		   jnpus > nobj) {

		info("task/cgroup: task[%u] not enough %s objects, disabling "
		     "affinity",taskid,hwloc_obj_type_string(hwtype));

	} else {
		char *str;

		if (bind_verbose) {
			info("task/cgroup: task[%u] using %s granularity",
			     taskid,hwloc_obj_type_string(hwtype));
		}

		/* There are two "distributions,"  controlled by the
		 * -m option of srun and friends. The first is the
		 * distribution of tasks to nodes.  The second is the
		 * distribution of allocated cpus to tasks for
		 * binding.  This code is handling the second
		 * distribution.  Here's how the values get set, based
		 * on the value of -m
		 *
		 * SLURM_DIST_CYCLIC = srun -m cyclic
		 * SLURM_DIST_BLOCK = srun -m block
		 * SLURM_DIST_CYCLIC_CYCLIC = srun -m cyclic:cyclic
		 * SLURM_DIST_BLOCK_CYCLIC = srun -m block:cyclic
		 *
		 * In the first two cases, the user only specified the
		 * first distribution.  The second distribution
		 * defaults to cyclic.  In the second two cases, the
		 * user explicitly requested a second distribution of
		 * cyclic.  So all these four cases correspond to a
		 * second distribution of cyclic.   So we want to call
		 * _task_cgroup_cpuset_dist_cyclic.
		 *
		 * If the user explicitly specifies a second
		 * distribution of block, or if
		 * CR_CORE_DEFAULT_DIST_BLOCK is configured and the
		 * user does not explicitly specify a second
		 * distribution of cyclic, the second distribution is
		 * block, and we need to call
		 * _task_cgroup_cpuset_dist_block. In these cases,
		 * task_dist would be set to SLURM_DIST_CYCLIC_BLOCK
		 * or SLURM_DIST_BLOCK_BLOCK.
		 *
		 * You can see the equivalent code for the
		 * task/affinity plugin in
		 * src/plugins/task/affinity/dist_tasks.c, around line 384.
		 */
		switch (job->task_dist) {
		case SLURM_DIST_CYCLIC:
		case SLURM_DIST_BLOCK:
		case SLURM_DIST_CYCLIC_CYCLIC:
		case SLURM_DIST_BLOCK_CYCLIC:
			_task_cgroup_cpuset_dist_cyclic(
				topology, hwtype, req_hwtype,
				job, bind_verbose, cpuset);
			break;
		default:
			_task_cgroup_cpuset_dist_block(
				topology, hwtype, req_hwtype,
				nobj, job, bind_verbose, cpuset);
		}

		hwloc_bitmap_asprintf(&str, cpuset);

		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology,cpuset,
							 &ts,tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if (sched_setaffinity(pid,tssize,&ts)) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'",taskid,str);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] taskset '%s' is set"
				     ,taskid,str);
			}
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'",taskid,str);
			fstatus = SLURM_ERROR;
		}
		free(str);
	}

	/* Destroy hwloc objects */
	hwloc_bitmap_free(cpuset);

	hwloc_topology_destroy(topology);

	return fstatus;
#endif

}
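
The binding step in the plugin goes through hwloc_cpuset_to_glibc_sched_affinity() followed by sched_setaffinity(). A minimal Linux-only sketch of that conversion, binding the calling process to its first PU (not the Slurm code):

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <hwloc.h>
#include <hwloc/glibc-sched.h>

int main(void)
{
	hwloc_topology_t topology;
	hwloc_obj_t pu;
	cpu_set_t set;

	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);

	/* first processing unit of the machine */
	pu = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0);

	/* translate the hwloc cpuset into a glibc cpu_set_t ... */
	if (pu && hwloc_cpuset_to_glibc_sched_affinity(topology, pu->cpuset,
						       &set, sizeof(set)) == 0) {
		/* ... and apply it to the calling process (pid 0 = self) */
		if (sched_setaffinity(0, sizeof(set), &set))
			perror("sched_setaffinity");
	}

	hwloc_topology_destroy(topology);
	return 0;
}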
static int _task_cgroup_cpuset_dist_block(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, uint32_t nobj,
	stepd_step_rec_t *job, int bind_verbose, hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t core_loop, ntskip, npdist;
	uint32_t i, j, pfirst, plast;
	uint32_t taskid = job->envtp->localid;
	int hwdepth;
	uint32_t npus, ncores, nsockets;
	int spec_thread_cnt = 0;
	bitstr_t *spec_threads = NULL;

	uint32_t *thread_idx;
	uint32_t core_idx;
	bool core_fcyclic, core_block;

	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);

	core_block = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_COREBLOCK ? true : false;
	core_fcyclic = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_CORECFULL ? true : false;

	if (bind_verbose) {
		info("task/cgroup: task[%u] using block distribution, "
		     "task_dist 0x%x", taskid, job->task_dist);
	}

	if ((hwloc_compare_types(hwtype, HWLOC_OBJ_PU) == 0) && !core_block) {
		thread_idx = xmalloc(ncores * sizeof(uint32_t));
		ntskip = taskid;
		npdist = job->cpus_per_task;

		i = 0; j = 0;
		core_idx = 0;
		core_loop = 0;
		while (i < ntskip + 1 && core_loop < npdist + 1) {
			while ((core_idx < ncores) && (j < npdist)) {
				obj = hwloc_get_obj_below_by_type(
					topology, HWLOC_OBJ_CORE, core_idx,
					hwtype, thread_idx[core_idx]);
				if (obj != NULL) {
					thread_idx[core_idx]++;
					j++;
					if (i == ntskip)
						_add_hwloc_cpuset(hwtype,
							req_hwtype, obj, taskid,
							bind_verbose, cpuset);
					if ((j < npdist) && core_fcyclic)
						core_idx++;
				} else {
					core_idx++;
				}
			}
			if (j == npdist) {
				i++; j = 0;
				core_idx++; // no validity check, handled by the while
				core_loop = 0;
			} else {
				core_loop++;
				core_idx = 0;
			}
		}
		xfree(thread_idx);

		/* should never happen in normal scenario */
		if (core_loop > npdist) {
			error("task/cgroup: task[%u] infinite loop broken while "
			      "trying to provision compute elements using %s",
			      taskid, format_task_dist_states(job->task_dist));
			return XCGROUP_ERROR;
		} else
			return XCGROUP_SUCCESS;
	}

	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		pfirst = taskid * job->cpus_per_task ;
		plast = pfirst + job->cpus_per_task - 1;
	} else {
		/* sockets or ldoms granularity */
		pfirst = taskid;
		plast = pfirst;
	}

	hwdepth = hwloc_get_type_depth(topology, hwtype);
	if ((job->job_core_spec != (uint16_t) NO_VAL) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)  &&
	    (nsockets != 0)) {
		/* Skip specialized threads as needed */
		int i, t, c, s;
		int cores = MAX(1, (ncores / nsockets));
		int threads = npus / cores;
		spec_thread_cnt = job->job_core_spec & (~CORE_SPEC_THREAD);
		spec_threads = bit_alloc(npus);
		for (t = threads - 1;
		     ((t >= 0) && (spec_thread_cnt > 0)); t--) {
			for (c = cores - 1;
			     ((c >= 0) && (spec_thread_cnt > 0)); c--) {
				for (s = nsockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
					i = s * cores + c;
					i = (i * threads) + t;
					bit_set(spec_threads, i);
					spec_thread_cnt--;
				}
			}
		}
		if (hwtype == HWLOC_OBJ_PU) {
			for (i = 0; i <= pfirst && i < npus; i++) {
				if (bit_test(spec_threads, i))
					pfirst++;
			};
		}
	}

	for (i = pfirst; i <= plast && i < nobj ; i++) {
		obj = hwloc_get_obj_by_depth(topology, hwdepth, (int)i);
		_add_hwloc_cpuset(hwtype, req_hwtype, obj, taskid,
			    bind_verbose, cpuset);
	}

	if (spec_threads) {
		for (i = 0; i < npus; i++) {
			if (bit_test(spec_threads, i)) {
				hwloc_bitmap_clr(cpuset, i);
			}
		};
		FREE_NULL_BITMAP(spec_threads);
	}

	return XCGROUP_SUCCESS;
}
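
The block distribution above picks threads under a given core with hwloc_get_obj_below_by_type(). A minimal standalone sketch of that lookup (not the Slurm code; it lists the PUs found below each core):

#include <stdio.h>
#include <hwloc.h>

int main(void)
{
	hwloc_topology_t topology;
	hwloc_obj_t core, pu;
	unsigned c, t;

	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);

	for (c = 0;
	     (core = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, c)) != NULL;
	     c++) {
		printf("core L#%u:", core->logical_index);
		/* enumerate the PUs contained in this core */
		for (t = 0;
		     (pu = hwloc_get_obj_below_by_type(topology, HWLOC_OBJ_CORE, c,
						       HWLOC_OBJ_PU, t)) != NULL;
		     t++)
			printf(" PU L#%u", pu->logical_index);
		printf("\n");
	}

	hwloc_topology_destroy(topology);
	return 0;
}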
int main(void)
{
    int depth;
    unsigned i, n;
    unsigned long size;
    int levels;
    char string[128];
    int topodepth;
    void *m;
    hwloc_topology_t topology;
    hwloc_cpuset_t cpuset;
    hwloc_obj_t obj;

    /* Allocate and initialize topology object. */
    hwloc_topology_init(&topology);

    /* ... Optionally, put detection configuration here to ignore
       some object types, define a synthetic topology, etc.

       The default is to detect all the objects of the machine that
       the caller is allowed to access.  See Configure Topology
       Detection. */

    /* Perform the topology detection. */
    hwloc_topology_load(topology);

    /* Optionally, get some additional topology information
       in case we need the topology depth later. */
    topodepth = hwloc_topology_get_depth(topology);

    /*****************************************************************
     * First example:
     * Walk the topology with an array style, from level 0 (always
     * the system level) to the lowest level (always the proc level).
     *****************************************************************/
    for (depth = 0; depth < topodepth; depth++) {
        printf("*** Objects at level %d\n", depth);
        for (i = 0; i < hwloc_get_nbobjs_by_depth(topology, depth); 
             i++) {
            hwloc_obj_type_snprintf(string, sizeof(string),
				    hwloc_get_obj_by_depth(topology, depth, i), 0);
            printf("Index %u: %s\n", i, string);
        }
    }

    /*****************************************************************
     * Second example:
     * Walk the topology with a tree style.
     *****************************************************************/
    printf("*** Printing overall tree\n");
    print_children(topology, hwloc_get_root_obj(topology), 0);

    /*****************************************************************
     * Third example:
     * Print the number of packages.
     *****************************************************************/
    depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PACKAGE);
    if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) {
        printf("*** The number of packages is unknown\n");
    } else {
        printf("*** %u package(s)\n",
               hwloc_get_nbobjs_by_depth(topology, depth));
    }

    /*****************************************************************
     * Fourth example:
     * Compute the amount of cache that the first logical processor
     * has above it.
     *****************************************************************/
    levels = 0;
    size = 0;
    for (obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0);
         obj;
         obj = obj->parent)
      if (obj->type == HWLOC_OBJ_CACHE) {
        levels++;
        size += obj->attr->cache.size;
      }
    printf("*** Logical processor 0 has %d caches totaling %luKB\n", 
           levels, size / 1024);

    /*****************************************************************
     * Fifth example:
     * Bind to only one thread of the last core of the machine.
     *
     * First find out where cores are, or else smaller sets of CPUs if
     * the OS doesn't have the notion of a "core".
     *****************************************************************/
    depth = hwloc_get_type_or_below_depth(topology, HWLOC_OBJ_CORE);

    /* Get last core. */
    obj = hwloc_get_obj_by_depth(topology, depth,
                   hwloc_get_nbobjs_by_depth(topology, depth) - 1);
    if (obj) {
        /* Get a copy of its cpuset that we may modify. */
        cpuset = hwloc_bitmap_dup(obj->cpuset);

        /* Get only one logical processor (in case the core is
           SMT/hyper-threaded). */
        hwloc_bitmap_singlify(cpuset);

        /* And try to bind ourself there. */
        if (hwloc_set_cpubind(topology, cpuset, 0)) {
            char *str;
            int error = errno;
            hwloc_bitmap_asprintf(&str, obj->cpuset);
            printf("Couldn't bind to cpuset %s: %s\n", str, strerror(error));
            free(str);
        }

        /* Free our cpuset copy */
        hwloc_bitmap_free(cpuset);
    }

    /*****************************************************************
     * Sixth example:
     * Allocate some memory on the last NUMA node, bind some existing
     * memory to the last NUMA node.
     *****************************************************************/
    /* Get last node. There's always at least one. */
    n = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NUMANODE);
    obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, n - 1);

    size = 1024*1024;
    m = hwloc_alloc_membind_nodeset(topology, size, obj->nodeset,
                                    HWLOC_MEMBIND_BIND, 0);
    hwloc_free(topology, m, size);

    m = malloc(size);
    hwloc_set_area_membind_nodeset(topology, m, size, obj->nodeset,
                                   HWLOC_MEMBIND_BIND, 0);
    free(m);

    /* Destroy topology object. */
    hwloc_topology_destroy(topology);

    return 0;
}
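
The tree walk in the second example above calls print_children(), whose definition is not part of this snippet; the hwloc documentation's hello-world uses a short recursive helper along these lines:

static void print_children(hwloc_topology_t topology, hwloc_obj_t obj,
                           int depth)
{
    char type[32], attr[1024];
    unsigned i;

    hwloc_obj_type_snprintf(type, sizeof(type), obj, 0);
    printf("%*s%s", 2 * depth, "", type);
    if (obj->os_index != (unsigned) -1)
        printf("#%u", obj->os_index);
    hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, " ", 0);
    if (*attr)
        printf("(%s)", attr);
    printf("\n");
    /* recurse over the children of this object */
    for (i = 0; i < obj->arity; i++)
        print_children(topology, obj->children[i], depth + 1);
}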
/* set affinity with sched_setaffinity so that the user does not
 * have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	char mstr[1 + CPU_SETSIZE / 4];
	cpu_bind_type_t bind_type;
	cpu_set_t ts;
	hwloc_obj_t obj;
	hwloc_obj_type_t socket_or_node;
	hwloc_topology_t topology;
	hwloc_bitmap_t cpuset;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;
	int bind_verbose = 0;
	int rc = SLURM_SUCCESS, match;
	pid_t    pid = job->envtp->task_pid;
	size_t tssize;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus;

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);
	cpuset = hwloc_bitmap_alloc();

	int spec_threads = 0;

	if (job->batch) {
		jnpus = job->cpus;
		job->cpus_per_task = job->cpus;
	} else
		jnpus = jntasks * job->cpus_per_task;

	bind_type = job->cpu_bind_type;
	if ((conf->task_plugin_param & CPU_BIND_VERBOSE) ||
	    (bind_type & CPU_BIND_VERBOSE))
		bind_verbose = 1 ;

	if ( hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
	     hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET) ) {
		/* One socket contains multiple NUMA-nodes
		 * like AMD Opteron 6000 series etc.
		 * In such case, use NUMA-node instead of socket. */
		socket_or_node = HWLOC_OBJ_NODE;
	} else {
		socket_or_node = HWLOC_OBJ_SOCKET;
	}

	if (bind_type & CPU_BIND_NONE) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		/* release hwloc resources before the early return */
		hwloc_bitmap_free(cpuset);
		hwloc_topology_destroy(topology);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding",taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding",taskid);
		req_hwtype = socket_or_node;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding",taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else if (bind_type & CPU_BIND_TO_BOARDS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "board level binding",taskid);
		req_hwtype = HWLOC_OBJ_GROUP;
	} else if (bind_type & bind_mode_ldom) {
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (bind_verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/*
	 * Perform the topology detection. It will only see the allowed PUs.
	 * At the same time, detect the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available: with hyperthread support, the ntasks-per-core
	 * parameter can give each task access to more threads per core.
	 * Revert to machine granularity if no finer granularity matching the
	 * request is found; this results in no affinity being applied.
	 * The detected granularity will be used to find where to best place
	 * the task, then the cpu_bind option will be used to relax the
	 * affinity constraint and use more PUs (i.e. use a core granularity
	 * to dispatch the tasks across the sockets and then give each task
	 * access to the cores of its socket).
	 */
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       socket_or_node);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_NODE);
	//info("PU:%d CORE:%d SOCK:%d LDOM:%d", npus, ncores, nsockets, nldoms);

	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if ((job->job_core_spec != (uint16_t) NO_VAL) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)) {
		spec_threads = job->job_core_spec & (~CORE_SPEC_THREAD);
	}
	if (npus >= (jnpus + spec_threads) || bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks &&
	    bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = socket_or_node;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources). So there is no guarantee that each ldom will be populated
	 * with usable sockets. Add a simple check to at least ensure that we
	 * have as many sockets as ldoms before moving to ldom granularity.
	 */
	if (nldoms >= jntasks &&
	    nsockets >= nldoms &&
	    bind_type & (CPU_BIND_TO_LDOMS | bind_mode_ldom)) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * If not enough objects to do the job, revert to no affinity mode
	 */
	if (hwloc_compare_types(hwtype, HWLOC_OBJ_MACHINE) == 0) {
		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity",taskid, hwloc_obj_type_string(hwtype));

	} else if ((hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) &&
		   (nobj < jnpus)) {
		info("task/cgroup: task[%u] not enough %s objects (%d < %d), "
		     "disabling affinity",
		     taskid, hwloc_obj_type_string(hwtype), nobj, jnpus);

	} else if (bind_type & bind_mode) {
		/* Explicit binding mode specified by the user
		 * Bind the taskid in accordance with the specified mode
		 */
		obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_MACHINE, 0);
		match = hwloc_bitmap_isequal(obj->complete_cpuset,
					     obj->allowed_cpuset);
		if ((job->job_core_spec == (uint16_t) NO_VAL) && !match) {
			info("task/cgroup: entire node must be allocated, "
			     "disabling affinity, task[%u]", taskid);
			fprintf(stderr, "Requested cpu_bind option requires "
				"entire node to be allocated; disabling "
				"affinity\n");
		} else {
			if (bind_verbose) {
				info("task/cgroup: task[%u] is requesting "
				     "explicit binding mode", taskid);
			}
			_get_sched_cpuset(topology, hwtype, req_hwtype, &ts,
					  job);
			tssize = sizeof(cpu_set_t);
			fstatus = SLURM_SUCCESS;
			if (job->job_core_spec != (uint16_t) NO_VAL)
				_validate_mask(taskid, obj, &ts);
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "mask 0x%s", taskid,
				      cpuset_to_str(&ts, mstr));
				error("sched_setaffinity rc = %d", rc);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] mask 0x%s",
				     taskid, cpuset_to_str(&ts, mstr));
			}
			_slurm_chkaffinity(&ts, job, rc);
		}
	} else {
		/* Bind the detected object to the taskid, respecting the
		 * granularity, using the designated or default distribution
		 * method (block or cyclic). */
		char *str;

		if (bind_verbose) {
			info("task/cgroup: task[%u] using %s granularity dist %u",
			     taskid, hwloc_obj_type_string(hwtype),
			     job->task_dist);
		}

		/* See srun man page for detailed information on --distribution
		 * option.
		 *
		 * You can see the equivalent code for the
		 * task/affinity plugin in
		 * src/plugins/task/affinity/dist_tasks.c, around line 368
		 */
		switch (job->task_dist & SLURM_DIST_NODESOCKMASK) {
		case SLURM_DIST_BLOCK_BLOCK:
		case SLURM_DIST_CYCLIC_BLOCK:
		case SLURM_DIST_PLANE:
			/* tasks are distributed in blocks within a plane */
			_task_cgroup_cpuset_dist_block(topology,
				hwtype, req_hwtype,
				nobj, job, bind_verbose, cpuset);
			break;
		case SLURM_DIST_ARBITRARY:
		case SLURM_DIST_BLOCK:
		case SLURM_DIST_CYCLIC:
		case SLURM_DIST_UNKNOWN:
			if (slurm_get_select_type_param()
			    & CR_CORE_DEFAULT_DIST_BLOCK) {
				_task_cgroup_cpuset_dist_block(topology,
					hwtype, req_hwtype,
					nobj, job, bind_verbose, cpuset);
				break;
			}
			/* We want to fall through here if we aren't doing a
			   default dist block.
			*/
		default:
			_task_cgroup_cpuset_dist_cyclic(topology,
				hwtype, req_hwtype,
				job, bind_verbose, cpuset);
			break;
		}

		hwloc_bitmap_asprintf(&str, cpuset);

		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset,
							 &ts, tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'", taskid, str);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] set taskset '%s'",
				     taskid, str);
			}
			_slurm_chkaffinity(&ts, job, rc);
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'",taskid,str);
			fstatus = SLURM_ERROR;
		}
		free(str);
	}

	/* Destroy hwloc objects */
	hwloc_bitmap_free(cpuset);
	hwloc_topology_destroy(topology);

	return fstatus;
#endif

}
Exemple #29
0
//Initializes HWLOC and load the machine architecture
int hw_topology_init (struct arch_topology *topo)
{
  hwloc_obj_t obj, core1, core2;
  int count, i, j, error;


  //Create the machine representation
  error = hwloc_topology_init(&topology);
  //Go through the topology only if HWLOC is
  //successfully initialized
  if(!error)
  {
    hwloc_topology_load(topology);
    local_topo = malloc(sizeof(struct arch_topology));

#if defined  (__DBCSR_ACC) || defined (__PW_CUDA)
  int nDev;
  ma_get_ndevices_cu(&nDev);
#endif

    //Extract number of NUMA nodes
    //(hwloc_get_type_depth returns HWLOC_TYPE_DEPTH_UNKNOWN, not 0, when the level is absent)
    if (hwloc_get_type_depth (topology, HWLOC_OBJ_NODE) != HWLOC_TYPE_DEPTH_UNKNOWN)
     topo->nnodes = hwloc_get_nbobjs_by_depth (topology, hwloc_get_type_depth (topology, HWLOC_OBJ_NODE));
    else
     topo->nnodes = 0;

    //Get number of cores, sockets and processing units
    topo->ncores = hwloc_get_nbobjs_by_depth (topology, hwloc_get_type_depth (topology, HWLOC_OBJ_CORE));
    topo->nsockets = hwloc_get_nbobjs_by_depth (topology, hwloc_get_type_depth (topology, HWLOC_OBJ_SOCKET)); 
    topo->npus = hwloc_get_nbobjs_by_depth (topology, hwloc_get_type_depth (topology, HWLOC_OBJ_PU));

   //Compute the number of memory controllers per socket
   //(basically the number of NUMA nodes per socket)
   if (topo->nnodes > topo->nsockets)
    topo->nmemcontroller = topo->nnodes/topo->nsockets;
   else
    topo->nmemcontroller = 1;

   count = 0;
   topo->nshared_caches = 0;
   //Get derived information - number of caches above each PU
   for (obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0);
        obj; obj = obj->parent)
   {
     if (obj->type == HWLOC_OBJ_CACHE)
     {
       if (obj->arity > 1)
         topo->nshared_caches++;
       else
       { count++;  topo->ncaches = count; }
     }
   }

  //Number of direct siblings
  //Sibling cores are the ones that share at least one component
  //level of the architecture
  count = 0;
  core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 0);
  core2 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 1);
  obj   = hwloc_get_common_ancestor_obj(topology, core1, core2);
  if (obj) 
    topo->nsiblings = obj->arity;

  //Machine node and core representation
  machine_nodes = (struct node*) malloc (topo->nnodes*sizeof(struct node));
  machine_cores = (struct core*) malloc (topo->ncores*sizeof(struct core));

  phys_cpus = malloc (topo->ncores*sizeof(int));
  get_phys_id(topology, topo->ncores, 0);
 
  //Get the cache sizes and other information for each core
  for (i = 0; i < topo->ncores; i++)
  {
    machine_cores[i].caches = malloc (topo->ncaches*sizeof(size_t));
    machine_cores[i].shared_caches = malloc (topo->ncaches*sizeof(int));

    for (j = 0; j < topo->ncaches; j++)
      machine_cores[i].shared_caches[j] = 0;
    for (j = topo->ncaches; j > topo->ncaches - topo->nshared_caches; j--)
      machine_cores[i].shared_caches[j-1] = 1;

    machine_cores[i].nsiblings = topo->nsiblings;
    machine_cores[i].siblings_id = malloc (topo->nsiblings*sizeof(unsigned));
    if (topo->ncores == topo->npus) {
      core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, i);
      machine_cores[i].id = core1->os_index;
      count = 0;
      for (obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, i);
           obj; obj = obj->parent) {
        if (obj->type == HWLOC_OBJ_CACHE) {
          machine_cores[i].caches[count] = obj->attr->cache.size / 1024;
          count++;
        }
        if (obj->type == HWLOC_OBJ_NODE)
          machine_cores[i].numaNode = obj->logical_index;
      }
    }
    else {
      core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, i);
      machine_cores[i].id = core1->os_index;
      count = 0;
      for (obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, i);
           obj; obj = obj->parent) {
        if (obj->type == HWLOC_OBJ_CACHE) {
          machine_cores[i].caches[count] = obj->attr->cache.size / 1024;
          count++;
        }
        if (obj->type == HWLOC_OBJ_NODE)
          machine_cores[i].numaNode = obj->logical_index;
      }
    }
  }

  //Get siblings id - so each core knows its siblings
  for (i = 0; i < topo->ncores ; i++)
   {
        if(topo->ncores == topo->npus){
          core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, i);
          set_phys_siblings(i,machine_cores[i].id,core1,topo->ncores,topo->nsiblings,HWLOC_OBJ_PU);
        }
        else{
          core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, i);
          set_phys_siblings(i,machine_cores[i].id,core1,topo->ncores,topo->nsiblings,HWLOC_OBJ_CORE);
        }

  }

   //Guard against non-NUMA machines where nnodes can be 0
   int ncore_node = (topo->nnodes > 0) ? topo->ncores/topo->nnodes : topo->ncores;
   int count_cores;
  //Get the information for each NUMAnode
  for (i = 0; i < topo->nnodes ; i++)
   {
        obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NODE, i);
        
      	machine_nodes[i].id = obj->os_index;
      	machine_nodes[i].memory = obj->memory.total_memory;
        machine_nodes[i].ncores = ncore_node;
        machine_nodes[i].mycores = malloc (ncore_node*sizeof(unsigned)); 

        //Get the cores id of each NUMAnode
        count_cores = 0;
        set_node_cores(topology, obj, i, &count_cores);

       //GPU support
#if defined  (__DBCSR_ACC) || defined (__PW_CUDA)
       int *devIds;
       devIds = malloc (nDev*sizeof(int));
       topo->ngpus = nDev;
       ma_get_cu(i,devIds); 
       machine_nodes[i].mygpus = devIds;
#endif    	
   }    

  //counting network cards
  count = 0;
  hwloc_topology_t topo_net;
  error = hwloc_topology_init(&topo_net);
  if (!error){
      //only ask for I/O devices once the topology handle is valid
      hwloc_topology_set_flags(topo_net, HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
      hwloc_topology_load(topo_net);
      for (obj = hwloc_get_obj_by_type(topo_net, HWLOC_OBJ_OS_DEVICE, 0);
           obj;
           obj = hwloc_get_next_osdev(topo_net, obj))
        if (obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK ||
            obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS)
              count++;
      topo->nnetcards = count;
      hwloc_topology_destroy(topo_net);
  }
  else //if we cannot load I/O devices
   topo->nnetcards = 0;
  

 /*Local copy of the machine topology components*/  
 local_topo->nnodes = topo->nnodes;
 local_topo->nsockets = topo->nsockets;
 local_topo->ncores = topo->ncores;
 local_topo->npus = topo->npus;
 local_topo->ngpus = topo->ngpus;
 local_topo->ncaches = topo->ncaches;
 local_topo->nshared_caches = topo->nshared_caches;
 local_topo->nsiblings = topo->nsiblings;
 local_topo->nmemcontroller =  topo->nmemcontroller;
 local_topo->nnetcards = topo->nnetcards;
}

 return error;

}
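
The sibling count in hw_topology_init() relies on hwloc_get_common_ancestor_obj(). A minimal standalone sketch of what that helper returns for the first two cores (assuming the machine reports at least two cores):

#include <stdio.h>
#include <hwloc.h>

int main(void)
{
  hwloc_topology_t topology;
  hwloc_obj_t core0, core1, ancestor;

  hwloc_topology_init(&topology);
  hwloc_topology_load(topology);

  core0 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 0);
  core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 1);
  if (core0 && core1) {
    /* deepest object that contains both cores */
    ancestor = hwloc_get_common_ancestor_obj(topology, core0, core1);
    printf("cores 0 and 1 share a %s with %u children\n",
           hwloc_obj_type_string(ancestor->type), ancestor->arity);
  }

  hwloc_topology_destroy(topology);
  return 0;
}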
Exemple #30
0
extern int
get_cpuinfo(uint16_t *p_cpus, uint16_t *p_boards,
	    uint16_t *p_sockets, uint16_t *p_cores, uint16_t *p_threads,
	    uint16_t *p_block_map_size,
	    uint16_t **p_block_map, uint16_t **p_block_map_inv)
{
	enum { SOCKET=0, CORE=1, PU=2, LAST_OBJ=3 };
	hwloc_topology_t topology;
	hwloc_obj_t obj;
	hwloc_obj_type_t objtype[LAST_OBJ];
	unsigned idx[LAST_OBJ];
	int nobj[LAST_OBJ];
	int actual_cpus;
	int macid;
	int absid;
	int actual_boards = 1, depth;
	int i;

	debug2("hwloc_topology_init");
	if (hwloc_topology_init(&topology)) {
		/* error in initialize hwloc library */
		debug("hwloc_topology_init() failed.");
		return 1;
	}

	/* parse all system */
	hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM);

	/* ignores cache, misc */
	hwloc_topology_ignore_type (topology, HWLOC_OBJ_CACHE);
	hwloc_topology_ignore_type (topology, HWLOC_OBJ_MISC);

	/* load topology */
	debug2("hwloc_topology_load");
	if (hwloc_topology_load(topology)) {
		/* error in load hardware topology */
		debug("hwloc_topology_load() failed.");
		hwloc_topology_destroy(topology);
		return 2;
	}

	/* Some processors (e.g. AMD Opteron 6000 series) contain multiple
	 * NUMA nodes per socket. This is a configuration which does not map
	 * into the hardware entities that Slurm optimizes resource allocation
	 * for (PU/thread, core, socket, baseboard, node and network switch).
	 * In order to optimize resource allocations on such hardware, Slurm
	 * will consider each NUMA node within the socket as a separate socket.
	 * You can disable this by configuring "SchedulerParameters=Ignore_NUMA",
	 * in which case Slurm will report the correct socket count on the node,
	 * but not be able to optimize resource allocations on the NUMA nodes.
	 */
	objtype[SOCKET] = HWLOC_OBJ_SOCKET;
	objtype[CORE]   = HWLOC_OBJ_CORE;
	objtype[PU]     = HWLOC_OBJ_PU;
	if (hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
	    hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET)) {
		char *sched_params = slurm_get_sched_params();
		if (sched_params &&
		    strcasestr(sched_params, "Ignore_NUMA")) {
			info("Ignoring NUMA nodes within a socket");
		} else {
			info("Considering each NUMA node as a socket");
			objtype[SOCKET] = HWLOC_OBJ_NODE;
		}
		xfree(sched_params);
	}

	/* number of objects */
	depth = hwloc_get_type_depth(topology, HWLOC_OBJ_GROUP);
	if (depth != HWLOC_TYPE_DEPTH_UNKNOWN) {
		actual_boards = MAX(hwloc_get_nbobjs_by_depth(topology, depth),
				    1);
	}
	nobj[SOCKET] = hwloc_get_nbobjs_by_type(topology, objtype[SOCKET]);
	nobj[CORE]   = hwloc_get_nbobjs_by_type(topology, objtype[CORE]);
	/*
	 * Workaround for hwloc
	 * hwloc_get_nbobjs_by_type() returns 0 on some architectures.
	 */
	if ( nobj[SOCKET] == 0 ) {
		debug("get_cpuinfo() fudging nobj[SOCKET] from 0 to 1");
		nobj[SOCKET] = 1;
	}
	if ( nobj[CORE] == 0 ) {
		debug("get_cpuinfo() fudging nobj[CORE] from 0 to 1");
		nobj[CORE] = 1;
	}
	if ( nobj[SOCKET] == -1 )
		fatal("get_cpuinfo() can not handle nobj[SOCKET] = -1");
	if ( nobj[CORE] == -1 )
		fatal("get_cpuinfo() can not handle nobj[CORE] = -1");
	actual_cpus  = hwloc_get_nbobjs_by_type(topology, objtype[PU]);
#if 0
	/* Used to find workaround above */
	info("CORE = %d SOCKET = %d actual_cpus = %d nobj[CORE] = %d",
	     CORE, SOCKET, actual_cpus, nobj[CORE]);
#endif
	nobj[PU]     = actual_cpus/nobj[CORE];  /* threads per core */
	nobj[CORE]  /= nobj[SOCKET];            /* cores per socket */

	debug("CPUs:%d Boards:%u Sockets:%d CoresPerSocket:%d ThreadsPerCore:%d",
	      actual_cpus, actual_boards, nobj[SOCKET], nobj[CORE], nobj[PU]);

	/* allocate block_map */
	*p_block_map_size = (uint16_t)actual_cpus;
	if (p_block_map && p_block_map_inv) {
		*p_block_map     = xmalloc(actual_cpus * sizeof(uint16_t));
		*p_block_map_inv = xmalloc(actual_cpus * sizeof(uint16_t));

		/* initialize default as linear mapping */
		for (i = 0; i < actual_cpus; i++) {
			(*p_block_map)[i]     = i;
			(*p_block_map_inv)[i] = i;
		}
		/* create map with hwloc */
		for (idx[SOCKET]=0; idx[SOCKET]<nobj[SOCKET]; ++idx[SOCKET]) {
			for (idx[CORE]=0; idx[CORE]<nobj[CORE]; ++idx[CORE]) {
				for (idx[PU]=0; idx[PU]<nobj[PU]; ++idx[PU]) {
					/* get hwloc_obj by indexes */
					obj=hwloc_get_obj_below_array_by_type(
					            topology, 3, objtype, idx);
					if (!obj)
						continue;
					macid = obj->os_index;
					absid = idx[SOCKET]*nobj[CORE]*nobj[PU]
					      + idx[CORE]*nobj[PU]
					      + idx[PU];

					if ((macid >= actual_cpus) ||
					    (absid >= actual_cpus)) {
						/* physical or logical ID are
						 * out of range */
						continue;
					}
					debug4("CPU map[%d]=>%d", absid, macid);
					(*p_block_map)[absid]     = macid;
					(*p_block_map_inv)[macid] = absid;
				}
			 }
		}
	}

	hwloc_topology_destroy(topology);

	/* update output parameters */
	*p_cpus    = actual_cpus;
	*p_boards  = actual_boards;
	*p_sockets = nobj[SOCKET];
	*p_cores   = nobj[CORE];
	*p_threads = nobj[PU];

#if DEBUG_DETAIL
	/*** Display raw data ***/
	debug("CPUs:%u Boards:%u Sockets:%u CoresPerSocket:%u ThreadsPerCore:%u",
	      *p_cpus, *p_boards, *p_sockets, *p_cores, *p_threads);

	/* Display the mapping tables */
	if (p_block_map && p_block_map_inv) {
		debug("------");
		debug("Abstract -> Machine logical CPU ID block mapping:");
		debug("AbstractId PhysicalId Inverse");
		for (i = 0; i < *p_cpus; i++) {
			debug3("   %4d      %4u       %4u",
				i, (*p_block_map)[i], (*p_block_map_inv)[i]);
		}
		debug("------");
	}
#endif
	return 0;

}
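
The block map above resolves (socket, core, PU) index triples with hwloc_get_obj_below_array_by_type(). A minimal standalone sketch of that lookup, using the same pre-hwloc-2.0 type names as the code above (it prints the OS index of socket 0 / core 0 / PU 0, if present):

#include <stdio.h>
#include <hwloc.h>

int main(void)
{
	hwloc_topology_t topology;
	hwloc_obj_t obj;
	hwloc_obj_type_t types[3] = { HWLOC_OBJ_SOCKET, HWLOC_OBJ_CORE, HWLOC_OBJ_PU };
	unsigned idx[3] = { 0, 0, 0 };

	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);

	/* first PU of the first core of the first socket, if the machine has one */
	obj = hwloc_get_obj_below_array_by_type(topology, 3, types, idx);
	if (obj)
		printf("socket 0 / core 0 / PU 0 has OS index %u\n", obj->os_index);

	hwloc_topology_destroy(topology);
	return 0;
}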