static void check(unsigned nbnodes, unsigned nbcores, unsigned nbpus) { unsigned depth; unsigned nb; unsigned long long total_memory; /* sanity checks */ depth = hwloc_topology_get_depth(topology); assert(depth == 4); depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NODE); assert(depth == 1); depth = hwloc_get_type_depth(topology, HWLOC_OBJ_CORE); assert(depth == 2); depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU); assert(depth == 3); /* actual checks */ nb = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE); assert(nb == nbnodes); nb = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); assert(nb == nbcores); nb = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); assert(nb == nbpus); total_memory = hwloc_get_root_obj(topology)->memory.total_memory; assert(total_memory == nbnodes * 1024*1024*1024); /* synthetic topology puts 1GB per node */ }
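The check() helper above reads a file-scope topology handle, so a driver must build the topology before calling it. A minimal sketch of such a driver, assuming a synthetic description chosen purely for illustration (the shape and the expected counts are not taken from the original test):

static hwloc_topology_t topology;  /* assumed file-scope handle used by check() */

static void test_synthetic(void)
{
    hwloc_topology_init(&topology);
    /* illustrative shape: 2 NUMA nodes x 4 cores x 2 PUs;
     * the synthetic backend attaches 1GB of memory per node */
    hwloc_topology_set_synthetic(topology, "node:2 core:4 pu:2");
    hwloc_topology_load(topology);
    check(2, 8, 16);
    hwloc_topology_destroy(topology);
}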
int main(void) { hwloc_topology_t topology; hwloc_obj_t obj; hwloc_topology_init(&topology); hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL); hwloc_topology_load(topology); printf("Found %d bridges\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_BRIDGE)); obj = NULL; while ((obj = hwloc_get_next_bridge(topology, obj)) != NULL) { assert(obj->type == HWLOC_OBJ_BRIDGE); /* only host->pci and pci->pci bridge supported so far */ if (obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST) { assert(obj->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI); printf(" Found host->PCI bridge for domain %04x bus %02x-%02x\n", obj->attr->bridge.downstream.pci.domain, obj->attr->bridge.downstream.pci.secondary_bus, obj->attr->bridge.downstream.pci.subordinate_bus); } else { assert(obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI); assert(obj->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI); printf(" Found PCI->PCI bridge [%04x:%04x] for domain %04x bus %02x-%02x\n", obj->attr->bridge.upstream.pci.vendor_id, obj->attr->bridge.upstream.pci.device_id, obj->attr->bridge.downstream.pci.domain, obj->attr->bridge.downstream.pci.secondary_bus, obj->attr->bridge.downstream.pci.subordinate_bus); } } printf("Found %d PCI devices\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PCI_DEVICE)); obj = NULL; while ((obj = hwloc_get_next_pcidev(topology, obj)) != NULL) { assert(obj->type == HWLOC_OBJ_PCI_DEVICE); printf(" Found PCI device class %04x vendor %04x model %04x\n", obj->attr->pcidev.class_id, obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id); } printf("Found %d OS devices\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_OS_DEVICE)); obj = NULL; while ((obj = hwloc_get_next_osdev(topology, obj)) != NULL) { assert(obj->type == HWLOC_OBJ_OS_DEVICE); printf(" Found OS device %s subtype %d\n", obj->name, obj->attr->osdev.type); } assert(HWLOC_TYPE_DEPTH_BRIDGE == hwloc_get_type_depth(topology, HWLOC_OBJ_BRIDGE)); assert(HWLOC_TYPE_DEPTH_PCI_DEVICE == hwloc_get_type_depth(topology, HWLOC_OBJ_PCI_DEVICE)); assert(HWLOC_TYPE_DEPTH_OS_DEVICE == hwloc_get_type_depth(topology, HWLOC_OBJ_OS_DEVICE)); assert(hwloc_compare_types(HWLOC_OBJ_BRIDGE, HWLOC_OBJ_PCI_DEVICE) < 0); assert(hwloc_compare_types(HWLOC_OBJ_BRIDGE, HWLOC_OBJ_OS_DEVICE) < 0); assert(hwloc_compare_types(HWLOC_OBJ_PCI_DEVICE, HWLOC_OBJ_OS_DEVICE) < 0); hwloc_topology_destroy(topology); return 0; }
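An illustrative extension of the I/O walk above (not part of the original test): each OS device can be mapped back to its closest non-I/O ancestor, whose cpuset indicates which PUs are local to the device.

hwloc_obj_t osdev = NULL;
while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
    /* walk up to the first normal (non-I/O) ancestor, which carries a cpuset */
    hwloc_obj_t ancestor = hwloc_get_non_io_ancestor_obj(topology, osdev);
    if (ancestor && ancestor->cpuset)
        printf("  %s is close to %s\n", osdev->name, hwloc_obj_type_string(ancestor->type));
}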
int hwloc_type_sscanf_as_depth(const char *string, hwloc_obj_type_t *typep, hwloc_topology_t topology, int *depthp) { union hwloc_obj_attr_u attr; hwloc_obj_type_t type; int depth; int err; err = hwloc_type_sscanf(string, &type, &attr, sizeof(attr)); if (err < 0) return err; depth = hwloc_get_type_depth(topology, type); if (type == HWLOC_OBJ_GROUP && depth == HWLOC_TYPE_DEPTH_MULTIPLE && attr.group.depth != (unsigned)-1) { unsigned l; depth = HWLOC_TYPE_DEPTH_UNKNOWN; for(l=0; l<topology->nb_levels; l++) { if (topology->levels[l][0]->type == HWLOC_OBJ_GROUP && topology->levels[l][0]->attr->group.depth == attr.group.depth) { depth = l; break; } } } if (typep) *typep = type; *depthp = (unsigned) depth; return 0; }
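A minimal usage sketch for the function above: resolve a user-supplied type name to a depth, then enumerate that level. The string "Core" and the printf are illustrative only.

hwloc_obj_type_t type;
int depth;
if (hwloc_type_sscanf_as_depth("Core", &type, topology, &depth) == 0 && depth >= 0) {
    unsigned i, n = hwloc_get_nbobjs_by_depth(topology, depth);
    for (i = 0; i < n; i++) {
        hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i);
        printf("%s L#%u\n", hwloc_obj_type_string(obj->type), obj->logical_index);
    }
}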
static int hwloc_solaris_get_sth_membind(hwloc_topology_t topology, idtype_t idtype, id_t id, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused) { int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE); int n; int i; if (depth < 0) { errno = ENOSYS; return -1; } hwloc_bitmap_zero(nodeset); n = hwloc_get_nbobjs_by_depth(topology, depth); for (i = 0; i < n; i++) { hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i); lgrp_affinity_t aff = lgrp_affinity_get(idtype, id, obj->os_index); if (aff == LGRP_AFF_STRONG) hwloc_bitmap_set(nodeset, obj->os_index); } if (hwloc_bitmap_iszero(nodeset)) hwloc_bitmap_copy(nodeset, hwloc_topology_get_complete_nodeset(topology)); *policy = HWLOC_MEMBIND_BIND; return 0; }
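A hypothetical caller inside the same Solaris backend (P_PID and getpid() come from <sys/procset.h> and <unistd.h>): query the current process and convert the returned nodeset into a cpuset for comparison with the CPU binding.

hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
hwloc_cpuset_t cpuset = hwloc_bitmap_alloc();
hwloc_membind_policy_t policy;
if (hwloc_solaris_get_sth_membind(topology, P_PID, getpid(), nodeset, &policy, 0) == 0)
    hwloc_cpuset_from_nodeset(topology, cpuset, nodeset);  /* nodeset -> covering cpuset */
hwloc_bitmap_free(cpuset);
hwloc_bitmap_free(nodeset);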
static void output_compute_pu_min_textwidth(struct lstopo_output *output) { unsigned fontsize = output->fontsize; char text[64]; int n; hwloc_topology_t topology = output->topology; hwloc_obj_t lastpu; if (!output->methods->textsize) { output->min_pu_textwidth = 0; return; } if (output->logical) { int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU); lastpu = hwloc_get_obj_by_depth(topology, depth, hwloc_get_nbobjs_by_depth(topology, depth)-1); } else { unsigned lastidx = hwloc_bitmap_last(hwloc_topology_get_topology_cpuset(topology)); lastpu = hwloc_get_pu_obj_by_os_index(topology, lastidx); } n = lstopo_obj_snprintf(output, text, sizeof(text), lastpu); output->min_pu_textwidth = get_textwidth(output, text, n, fontsize); }
/* * Distribute cpus to the task using block distribution */ static int _task_cgroup_cpuset_dist_block( hwloc_topology_t topology, hwloc_obj_type_t hwtype, hwloc_obj_type_t req_hwtype, uint32_t nobj, slurmd_job_t *job, int bind_verbose, hwloc_bitmap_t cpuset) { hwloc_obj_t obj; uint32_t i, pfirst,plast; uint32_t taskid = job->envtp->localid; int hwdepth; if (bind_verbose) info("task/cgroup: task[%u] using block distribution, " "task_dist %u", taskid, job->task_dist); if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) { /* cores or threads granularity */ pfirst = taskid * job->cpus_per_task ; plast = pfirst + job->cpus_per_task - 1; } else { /* sockets or ldoms granularity */ pfirst = taskid; plast = pfirst; } hwdepth = hwloc_get_type_depth(topology,hwtype); for (i = pfirst; i <= plast && i < nobj ; i++) { obj = hwloc_get_obj_by_depth(topology, hwdepth, (int)i); _add_cpuset(hwtype, req_hwtype, obj, taskid, bind_verbose, cpuset); } return XCGROUP_SUCCESS; }
int PlatformTopology::num_domain(int domain_type) const { int result = 0; try { result = hwloc_get_nbobjs_by_type(m_topo, hwloc_domain(domain_type)); } catch (const Exception &ex) { if (ex.err_value() != GEOPM_ERROR_INVALID) { throw ex; } if (domain_type == GEOPM_DOMAIN_TILE) { /// @todo This assumes that tiles are just below /// package in hwloc hierarchy. If tiles are /// at L2 cache, but processor has an L3 cache, /// this may not be correct. int depth = hwloc_get_type_depth(m_topo, hwloc_domain(GEOPM_DOMAIN_PACKAGE)) + 1; result = hwloc_get_nbobjs_by_depth(m_topo, depth); } else { throw ex; } if (result == 0) { throw ex; } } return result; }
static int hwloc_solaris_get_sth_cpubind(hwloc_topology_t topology, idtype_t idtype, id_t id, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused) { processorid_t binding; int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE); int n; int i; if (depth < 0) { errno = ENOSYS; return -1; } /* first check if processor_bind() was used to bind to a single processor rather than to an lgroup */ if ( processor_bind(idtype, id, PBIND_QUERY, &binding) == 0 && binding != PBIND_NONE ) { hwloc_bitmap_only(hwloc_set, binding); return 0; } /* if not, check lgroups */ hwloc_bitmap_zero(hwloc_set); n = hwloc_get_nbobjs_by_depth(topology, depth); for (i = 0; i < n; i++) { hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i); lgrp_affinity_t aff = lgrp_affinity_get(idtype, id, obj->os_index); if (aff == LGRP_AFF_STRONG) hwloc_bitmap_or(hwloc_set, hwloc_set, obj->cpuset); } if (hwloc_bitmap_iszero(hwloc_set)) hwloc_bitmap_copy(hwloc_set, hwloc_topology_get_complete_cpuset(topology)); return 0; }
int hwloc_get_obj_depth_by_name(hwloc_topology_t topology, char * obj_name){ /* find hwloc obj depth */ hwloc_obj_type_t type; int depthattrp; hwloc_obj_cache_type_t cache_type; if(hwloc_obj_type_sscanf(obj_name,&type,&depthattrp,&cache_type,sizeof(cache_type))==-1){ fprintf(stderr,"type \"%s\" was not recognized\n",obj_name); return -1; } int depth = hwloc_get_type_depth(topology,type); if(depth==HWLOC_TYPE_DEPTH_MULTIPLE){ if(type==HWLOC_OBJ_CACHE){ depth = hwloc_get_cache_type_depth(topology,depthattrp,cache_type); if(depth == HWLOC_TYPE_DEPTH_UNKNOWN){ fprintf(stderr,"type %s cannot be found, level=%d\n",obj_name,depthattrp); return -1; } if(depth == HWLOC_TYPE_DEPTH_MULTIPLE){ fprintf(stderr,"type %s multiple caches match for\n",obj_name); return -1; } } else{ fprintf(stderr,"type \"%s\" isn't handled...\n",obj_name); return -1; } } return depth; }
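Hypothetical usage of the helper above: resolve "Core" (or e.g. "L2Cache") to a depth and enumerate the objects at that level.

int depth = hwloc_get_obj_depth_by_name(topology, "Core");
if (depth >= 0) {
    unsigned i, n = hwloc_get_nbobjs_by_depth(topology, depth);
    for (i = 0; i < n; i++)
        printf("core #%u: os_index %u\n", i, hwloc_get_obj_by_depth(topology, depth, i)->os_index);
}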
int compute_context_nbr(int * ctxnbr, int threadnbr, int verbose) { #if (defined WITH_STARPU && defined STARPU_CONTEXT) if (*ctxnbr == -1) { int depth; unsigned i, n; int ncpu_per_socket, nsocket, ncpu; hwloc_topology_t topology; hwloc_obj_t obj; /* Allocate and initialize topology object. */ hwloc_topology_init(&topology); /* ... Optionally, put detection configuration here to ignore some objects types, define a synthetic topology, etc.... The default is to detect all the objects of the machine that the caller is allowed to access. See Configure Topology Detection. */ /* Perform the topology detection. */ hwloc_topology_load(topology); depth = hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET); if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) { /* number of socket is unknow, let say we have quadcore... */ ncpu_per_socket = 4; } else { ncpu = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); nsocket = hwloc_get_nbobjs_by_depth(topology, depth); ncpu_per_socket = ncpu/nsocket; } if (verbose > API_VERBOSE_NO) fprintf(stdout, "ncpu_per_socket %d\n", ncpu_per_socket); nsocket = threadnbr/ncpu_per_socket; if (threadnbr%ncpu_per_socket) nsocket ++; *ctxnbr = nsocket + 1; hwloc_topology_destroy(topology); } if (threadnbr + 1 < *ctxnbr) *ctxnbr = threadnbr+1; if (*ctxnbr == 1) *ctxnbr = 2; /* can't have more than STARPU_NMAX_SCHED_CTXS CTX */ { PASTIX_INT nctx_bot = *ctxnbr - 1; while (*ctxnbr > STARPU_NMAX_SCHED_CTXS) { nctx_bot /= 2; *ctxnbr = nctx_bot + 1; } } #endif /* WITH_STARPU */ return 0; }
magma_int_t magma_get_parallel_numthreads() { // query number of cores magma_int_t ncores = 0; #ifdef HAVE_HWLOC // hwloc gives physical cores, not hyperthreads // from http://stackoverflow.com/questions/12483399/getting-number-of-cores-not-ht-threads hwloc_topology_t topology; hwloc_topology_init( &topology ); hwloc_topology_load( topology ); magma_int_t depth = hwloc_get_type_depth( topology, HWLOC_OBJ_CORE ); if (depth != HWLOC_TYPE_DEPTH_UNKNOWN) { ncores = hwloc_get_nbobjs_by_depth( topology, depth ); } hwloc_topology_destroy( topology ); #endif if ( ncores == 0 ) { #ifdef _MSC_VER // Windows SYSTEM_INFO sysinfo; GetSystemInfo( &sysinfo ); ncores = sysinfo.dwNumberOfProcessors; #else ncores = sysconf( _SC_NPROCESSORS_ONLN ); #endif } // query MAGMA_NUM_THREADS or OpenMP const char *threads_str = getenv("MAGMA_NUM_THREADS"); magma_int_t threads = 0; if ( threads_str != NULL ) { char* endptr; threads = strtol( threads_str, &endptr, 10 ); if ( threads < 1 || *endptr != '\0' ) { threads = 1; fprintf( stderr, "$MAGMA_NUM_THREADS='%s' is an invalid number; using %d thread.\n", threads_str, (int) threads ); } } else { #if defined(_OPENMP) #pragma omp parallel { threads = omp_get_num_threads(); } #else threads = ncores; #endif } // limit to range [1, number of cores] threads = max( 1, min( ncores, threads )); return threads; }
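A small illustration of the core-vs-thread distinction noted in the comment above, assuming a loaded topology handle named topology: HWLOC_OBJ_CORE counts physical cores while HWLOC_OBJ_PU counts hardware threads.

int ncores = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE);
int npus   = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
printf("%d cores, %d hardware threads (%dx SMT)\n", ncores, npus, ncores > 0 ? npus / ncores : 0);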
void check_map_event_obj(hwloc_topology_t topology, char * obj_name, char * event_name) { PAPI_event_info_t info; int err, eventcode; hwloc_obj_t obj; unsigned depth = hwloc_get_obj_depth_by_name(topology,obj_name); if((err = PAPI_event_name_to_code(event_name,&eventcode))!=PAPI_OK){ handle_error(err); fprintf(stderr,"could not get \"%s\" event infos\n",event_name); } PAPI_get_event_info(eventcode,&info); switch(info.location){ case PAPI_LOCATION_UNCORE: if(depth >= hwloc_get_type_depth(topology, HWLOC_OBJ_CORE)){ fprintf(stderr,"event %s is an UNCORE event and is actually mapped on %s which is under HWLOC_OBJ_CORE\n", event_name, obj_name); } break; case PAPI_LOCATION_CPU: if(depth > hwloc_get_type_depth(topology, HWLOC_OBJ_MACHINE)){ fprintf(stderr,"event %s is a CPU event and is actually mapped on %s which is strictly under HWLOC_OBJ_MACHINE\n", event_name, obj_name); } break; case PAPI_LOCATION_PACKAGE: if(depth > hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET)){ fprintf(stderr,"event %s is a PACKAGE event and is actually mapped on %s which is strictly under HWLOC_OBJ_SOCKET\n", event_name, obj_name); } break; default: break; } }
static int _get_ldom_sched_cpuset(hwloc_topology_t topology, hwloc_obj_type_t hwtype, hwloc_obj_type_t req_hwtype, uint32_t ldom, cpu_set_t *mask) { hwloc_obj_t obj; hwloc_bitmap_t cpuset; int hwdepth; cpuset = hwloc_bitmap_alloc(); hwdepth = hwloc_get_type_depth(topology, hwtype); obj = hwloc_get_obj_by_depth(topology, hwdepth, ldom); _add_hwloc_cpuset(hwtype, req_hwtype, obj, 0, 0, cpuset); hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset, mask, sizeof(cpu_set_t)); hwloc_bitmap_free(cpuset); return true; }
int get_max_objs_inside_cpuset_by_type(hwloc_topology_t topology, hwloc_cpuset_t cpuset, hwloc_obj_type_t type){ int depth = hwloc_get_type_depth(topology, type); if(depth == HWLOC_TYPE_DEPTH_UNKNOWN){ fprintf(stderr, "Cannot find the depth of type %s\n", hwloc_type_name(type)); return -1; } if(depth == HWLOC_TYPE_DEPTH_MULTIPLE){ hwloc_obj_t deepest_of_type = hwloc_get_obj_inside_cpuset_by_type(topology, cpuset, HWLOC_OBJ_PU,0); while(deepest_of_type !=NULL && deepest_of_type->type != type) deepest_of_type = deepest_of_type->parent; if(deepest_of_type == NULL) return -1; else depth = deepest_of_type->depth; } return hwloc_get_nbobjs_inside_cpuset_by_depth(topology, cpuset, depth); }
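Hypothetical usage (assumes hwloc >= 1.11, where HWLOC_OBJ_PACKAGE replaced HWLOC_OBJ_SOCKET): count the cores available inside the first package's cpuset.

hwloc_obj_t package = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PACKAGE, 0);
if (package != NULL) {
    int ncores = get_max_objs_inside_cpuset_by_type(topology, package->cpuset, HWLOC_OBJ_CORE);
    printf("package 0 has %d cores available\n", ncores);
}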
static int hwloc_aix_get_sth_membind(hwloc_topology_t topology, rstype_t what, rsid_t who, hwloc_bitmap_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused) { hwloc_bitmap_t hwloc_set; rsethandle_t rset; unsigned cpu, maxcpus; int res = -1; int depth, n, i; depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE); if (depth < 0) { errno = EXDEV; return -1; } n = hwloc_get_nbobjs_by_depth(topology, depth); rset = rs_alloc(RS_EMPTY); if (ra_getrset(what, who, 0, rset) == -1) goto out; hwloc_set = hwloc_bitmap_alloc(); maxcpus = rs_getinfo(rset, R_MAXPROCS, 0); for (cpu = 0; cpu < maxcpus; cpu++) if (rs_op(RS_TESTRESOURCE, rset, NULL, R_PROCS, cpu) == 1) hwloc_bitmap_set(hwloc_set, cpu); hwloc_bitmap_and(hwloc_set, hwloc_set, hwloc_topology_get_complete_cpuset(topology)); hwloc_bitmap_zero(nodeset); for (i = 0; i < n; i++) { hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i); if (hwloc_bitmap_isincluded(obj->cpuset, hwloc_set)) hwloc_bitmap_set(nodeset, obj->os_index); } hwloc_bitmap_free(hwloc_set); *policy = HWLOC_MEMBIND_BIND; res = 0; out: rs_free(rset); return res; }
static int dss_topo_init() { hwloc_topology_init(&dss_topo); hwloc_topology_load(dss_topo); dss_core_depth = hwloc_get_type_depth(dss_topo, HWLOC_OBJ_CORE); dss_core_nr = hwloc_get_nbobjs_by_type(dss_topo, HWLOC_OBJ_CORE); dss_tgt_nr = dss_tgt_nr_get(dss_core_nr, nr_threads); if (dss_core_offset < 0 || dss_core_offset >= dss_core_nr) { D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), " "should be within range [0, %d]\n", dss_core_offset, dss_core_nr - 1); return -DER_INVAL; } return 0; }
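An illustrative follow-up (not from the original file): once dss_core_depth is known, an execution stream for a hypothetical target index idx could be pinned to its core.

hwloc_obj_t core = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, (dss_core_offset + idx) % dss_core_nr);
if (core != NULL)
    hwloc_set_cpubind(dss_topo, core->cpuset, HWLOC_CPUBIND_THREAD);  /* bind the calling thread */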
static int hwloc_solaris_set_sth_membind(hwloc_topology_t topology, idtype_t idtype, id_t id, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) { int depth; int n, i; switch (policy) { case HWLOC_MEMBIND_DEFAULT: case HWLOC_MEMBIND_BIND: break; default: errno = ENOSYS; return -1; } if (flags & HWLOC_MEMBIND_NOCPUBIND) { errno = ENOSYS; return -1; } depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE); if (depth < 0) { errno = EXDEV; return -1; } n = hwloc_get_nbobjs_by_depth(topology, depth); for (i = 0; i < n; i++) { hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, i); if (hwloc_bitmap_isset(nodeset, obj->os_index)) { lgrp_affinity_set(idtype, id, obj->os_index, LGRP_AFF_STRONG); } else { if (flags & HWLOC_CPUBIND_STRICT) lgrp_affinity_set(idtype, id, obj->os_index, LGRP_AFF_NONE); else lgrp_affinity_set(idtype, id, obj->os_index, LGRP_AFF_WEAK); } } return 0; }
void Hwloc::getNumSockets(unsigned int &allowedNodes, int &numSockets, unsigned int &hwThreads) { #ifdef HWLOC numSockets = 0; // Nodes that can be seen by hwloc allowedNodes = 0; // Hardware threads hwThreads = 0; int depth = hwloc_get_type_depth( _hwlocTopology, HWLOC_OBJ_NODE ); // If there are NUMA nodes in this machine if ( depth != HWLOC_TYPE_DEPTH_UNKNOWN ) { //hwloc_const_cpuset_t cpuset = hwloc_topology_get_online_cpuset( _hwlocTopology ); //allowedNodes = hwloc_get_nbobjs_inside_cpuset_by_type( _hwlocTopology, cpuset, HWLOC_OBJ_NODE ); //hwThreads = hwloc_get_nbobjs_inside_cpuset_by_type( _hwlocTopology, cpuset, HWLOC_OBJ_PU ); unsigned nodes = hwloc_get_nbobjs_by_depth( _hwlocTopology, depth ); //hwloc_cpuset_t set = i // For each node, count how many hardware threads there are below. for ( unsigned nodeIdx = 0; nodeIdx < nodes; ++nodeIdx ) { hwloc_obj_t node = hwloc_get_obj_by_depth( _hwlocTopology, depth, nodeIdx ); int localThreads = hwloc_get_nbobjs_inside_cpuset_by_type( _hwlocTopology, node->cpuset, HWLOC_OBJ_PU ); // Increase hw thread count hwThreads += localThreads; // If this node has hw threads beneath, increase the number of viewable nodes if ( localThreads > 0 ) ++allowedNodes; } numSockets = nodes; } // Otherwise, set it to 1 else { allowedNodes = 1; numSockets = 1; } #else numSockets = 0; allowedNodes = 0; #endif }
int main(void) { hwloc_topology_t topology; hwloc_obj_t obj; unsigned indexes[5]; float distances[5*5]; unsigned depth; unsigned width; /* group 3 numa nodes as 1 group of 2 and 1 on the side */ hwloc_topology_init(&topology); hwloc_topology_set_synthetic(topology, "node:3 pu:1"); indexes[0] = 0; indexes[1] = 1; indexes[2] = 2; distances[0] = 1; distances[1] = 4; distances[2] = 4; distances[3] = 4; distances[4] = 1; distances[5] = 2; distances[6] = 4; distances[7] = 2; distances[8] = 1; hwloc_topology_set_distance_matrix(topology, HWLOC_OBJ_PU, 3, indexes, distances); hwloc_topology_load(topology); /* 2 groups at depth 1 */ depth = hwloc_get_type_depth(topology, HWLOC_OBJ_GROUP); assert(depth == 1); width = hwloc_get_nbobjs_by_depth(topology, depth); assert(width == 1); /* 3 node at depth 2 */ depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE); assert(depth == 2); width = hwloc_get_nbobjs_by_depth(topology, depth); assert(width == 3); /* find the root obj */ obj = hwloc_get_root_obj(topology); assert(obj->arity == 2); /* check its children */ assert(obj->children[0]->type == HWLOC_OBJ_NUMANODE); assert(obj->children[0]->depth == 2); assert(obj->children[0]->arity == 1); assert(obj->children[1]->type == HWLOC_OBJ_GROUP); assert(obj->children[1]->depth == 1); assert(obj->children[1]->arity == 2); hwloc_topology_destroy(topology); /* group 5 packages as 2 group of 2 and 1 on the side, all of them below a common node object */ hwloc_topology_init(&topology); hwloc_topology_set_synthetic(topology, "node:1 pack:5 pu:1"); indexes[0] = 0; indexes[1] = 1; indexes[2] = 2; indexes[3] = 3; indexes[4] = 4; distances[ 0] = 1; distances[ 1] = 2; distances[ 2] = 4; distances[ 3] = 4; distances[ 4] = 4; distances[ 5] = 2; distances[ 6] = 1; distances[ 7] = 4; distances[ 8] = 4; distances[ 9] = 4; distances[10] = 4; distances[11] = 4; distances[12] = 1; distances[13] = 4; distances[14] = 4; distances[15] = 4; distances[16] = 4; distances[17] = 4; distances[18] = 1; distances[19] = 2; distances[20] = 4; distances[21] = 4; distances[22] = 4; distances[23] = 2; distances[24] = 1; hwloc_topology_set_distance_matrix(topology, HWLOC_OBJ_PACKAGE, 5, indexes, distances); hwloc_topology_load(topology); /* 1 node at depth 1 */ depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE); assert(depth == 1); width = hwloc_get_nbobjs_by_depth(topology, depth); assert(width == 1); /* 2 groups at depth 2 */ depth = hwloc_get_type_depth(topology, HWLOC_OBJ_GROUP); assert(depth == 2); width = hwloc_get_nbobjs_by_depth(topology, depth); assert(width == 2); /* 5 packages at depth 3 */ depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PACKAGE); assert(depth == 3); width = hwloc_get_nbobjs_by_depth(topology, depth); assert(width == 5); /* find the node obj */ obj = hwloc_get_root_obj(topology); assert(obj->arity == 1); obj = obj->children[0]; assert(obj->type == HWLOC_OBJ_NUMANODE); assert(obj->arity == 3); /* check its children */ assert(obj->children[0]->type == HWLOC_OBJ_GROUP); assert(obj->children[0]->depth == 2); assert(obj->children[0]->arity == 2); assert(obj->children[1]->type == HWLOC_OBJ_PACKAGE); assert(obj->children[1]->depth == 3); assert(obj->children[1]->arity == 1); assert(obj->children[2]->type == HWLOC_OBJ_GROUP); assert(obj->children[2]->depth == 2); assert(obj->children[2]->arity == 2); hwloc_topology_destroy(topology); return 0; }
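A minimal read-back sketch using the same hwloc 1.x distances API as the test above (it would run before hwloc_topology_destroy()): the logical distance matrix can be retrieved per object type.

const struct hwloc_distances_s *dist = hwloc_get_whole_distance_matrix_by_type(topology, HWLOC_OBJ_NUMANODE);
if (dist != NULL) {
    unsigned i, j;
    for (i = 0; i < dist->nbobjs; i++) {
        for (j = 0; j < dist->nbobjs; j++)
            printf("%.1f ", dist->latency[i * dist->nbobjs + j]);
        printf("\n");
    }
}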
int orte_rmaps_base_compute_bindings(orte_job_t *jdata) { hwloc_obj_type_t hwb, hwm; unsigned clvl=0, clvm=0; opal_binding_policy_t bind; orte_mapping_policy_t map; orte_node_t *node; int i, rc; struct hwloc_topology_support *support; bool force_down = false; hwloc_cpuset_t totalcpuset; int bind_depth, map_depth; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: compute bindings for job %s with policy %s", ORTE_JOBID_PRINT(jdata->jobid), opal_hwloc_base_print_binding(jdata->map->binding)); map = ORTE_GET_MAPPING_POLICY(jdata->map->mapping); bind = OPAL_GET_BINDING_POLICY(jdata->map->binding); if (ORTE_MAPPING_BYUSER == map) { /* user specified binding by rankfile - nothing for us to do */ return ORTE_SUCCESS; } if (OPAL_BIND_TO_CPUSET == bind) { int rc; /* cpuset was given - setup the bindings */ if (ORTE_SUCCESS != (rc = bind_to_cpuset(jdata))) { ORTE_ERROR_LOG(rc); } return rc; } if (OPAL_BIND_TO_NONE == bind) { /* no binding requested */ return ORTE_SUCCESS; } if (OPAL_BIND_TO_BOARD == bind) { /* doesn't do anything at this time */ return ORTE_SUCCESS; } /* binding requested - convert the binding level to the hwloc obj type */ switch (bind) { case OPAL_BIND_TO_NUMA: hwb = HWLOC_OBJ_NODE; break; case OPAL_BIND_TO_SOCKET: hwb = HWLOC_OBJ_SOCKET; break; case OPAL_BIND_TO_L3CACHE: hwb = HWLOC_OBJ_CACHE; clvl = 3; break; case OPAL_BIND_TO_L2CACHE: hwb = HWLOC_OBJ_CACHE; clvl = 2; break; case OPAL_BIND_TO_L1CACHE: hwb = HWLOC_OBJ_CACHE; clvl = 1; break; case OPAL_BIND_TO_CORE: hwb = HWLOC_OBJ_CORE; break; case OPAL_BIND_TO_HWTHREAD: hwb = HWLOC_OBJ_PU; break; default: ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } /* do the same for the mapping policy */ switch (map) { case ORTE_MAPPING_BYNODE: case ORTE_MAPPING_BYSLOT: case ORTE_MAPPING_SEQ: hwm = HWLOC_OBJ_MACHINE; break; case ORTE_MAPPING_BYDIST: case ORTE_MAPPING_BYNUMA: hwm = HWLOC_OBJ_NODE; break; case ORTE_MAPPING_BYSOCKET: hwm = HWLOC_OBJ_SOCKET; break; case ORTE_MAPPING_BYL3CACHE: hwm = HWLOC_OBJ_CACHE; clvm = 3; break; case ORTE_MAPPING_BYL2CACHE: hwm = HWLOC_OBJ_CACHE; clvm = 2; break; case ORTE_MAPPING_BYL1CACHE: hwm = HWLOC_OBJ_CACHE; clvm = 1; break; case ORTE_MAPPING_BYCORE: hwm = HWLOC_OBJ_CORE; break; case ORTE_MAPPING_BYHWTHREAD: hwm = HWLOC_OBJ_PU; break; default: ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } /* if the job was mapped by the corresponding target, then * we bind in place * * otherwise, we have to bind either up or down the hwloc * tree. If we are binding upwards (e.g., mapped to hwthread * but binding to core), then we just climb the tree to find * the first matching object. * * if we are binding downwards (e.g., mapped to node and bind * to core), then we have to do a round-robin assigment of * procs to the resources below. 
*/ if (ORTE_MAPPING_BYDIST == map) { int rc = ORTE_SUCCESS; if (OPAL_BIND_TO_NUMA == bind) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - dist to numa", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) { ORTE_ERROR_LOG(rc); } } else if (OPAL_BIND_TO_NUMA < bind) { /* bind every proc downwards */ force_down = true; goto execute; } /* if the binding policy is less than numa, then we are unbound - so * just ignore this and return (should have been caught in prior * tests anyway as only options meeting that criteria are "none" * and "board") */ return rc; } /* now deal with the remaining binding policies based on hardware */ if (bind == map) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - bind in place", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, hwb, clvl))) { ORTE_ERROR_LOG(rc); } return rc; } /* we need to handle the remaining binding options on a per-node * basis because different nodes could potentially have different * topologies, with different relative depths for the two levels */ execute: /* initialize */ totalcpuset = hwloc_bitmap_alloc(); for (i=0; i < jdata->map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) { continue; } if (!orte_do_not_launch) { /* if we don't want to launch, then we are just testing the system, * so ignore questions about support capabilities */ support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology); /* check if topology supports cpubind - have to be careful here * as Linux doesn't currently support thread-level binding. This * may change in the future, though, and it isn't clear how hwloc * interprets the current behavior. So check both flags to be sure. */ if (!support->cpubind->set_thisproc_cpubind && !support->cpubind->set_thisthread_cpubind) { if (!OPAL_BINDING_REQUIRED(jdata->map->binding) || !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { /* we are not required to bind, so ignore this */ continue; } orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name); hwloc_bitmap_free(totalcpuset); return ORTE_ERR_SILENT; } /* check if topology supports membind - have to be careful here * as hwloc treats this differently than I (at least) would have * expected. Per hwloc, Linux memory binding is at the thread, * and not process, level. Thus, hwloc sets the "thisproc" flag * to "false" on all Linux systems, and uses the "thisthread" flag * to indicate binding capability - don't warn if the user didn't * specifically request binding */ if (!support->membind->set_thisproc_membind && !support->membind->set_thisthread_membind && OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name); membind_warned = true; } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name); hwloc_bitmap_free(totalcpuset); return ORTE_ERR_SILENT; } } } /* some systems do not report cores, and so we can get a situation where our * default binding policy will fail for no necessary reason. 
So if we are * computing a binding due to our default policy, and no cores are found * on this node, just silently skip it - we will not bind */ if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) && HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "Unable to bind-to core by default on node %s as no cores detected", node->name); continue; } /* we share topologies in order * to save space, so we need to reset the usage info to reflect * our own current state */ reset_usage(node, jdata->jobid); if (force_down) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) { ORTE_ERROR_LOG(rc); return rc; } } else { /* determine the relative depth on this node */ if (HWLOC_OBJ_CACHE == hwb) { /* must use a unique function because blasted hwloc * just doesn't deal with caches very well...sigh */ bind_depth = hwloc_get_cache_type_depth(node->topology, clvl, -1); } else { bind_depth = hwloc_get_type_depth(node->topology, hwb); } if (0 > bind_depth) { /* didn't find such an object */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects", true, hwloc_obj_type_string(hwb), node->name); return ORTE_ERR_SILENT; } if (HWLOC_OBJ_CACHE == hwm) { /* must use a unique function because blasted hwloc * just doesn't deal with caches very well...sigh */ map_depth = hwloc_get_cache_type_depth(node->topology, clvm, -1); } else { map_depth = hwloc_get_type_depth(node->topology, hwm); } if (0 > map_depth) { /* didn't find such an object */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects", true, hwloc_obj_type_string(hwm), node->name); return ORTE_ERR_SILENT; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s bind_depth: %d map_depth %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), bind_depth, map_depth); if (bind_depth > map_depth) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) { ORTE_ERROR_LOG(rc); return rc; } } else { if (ORTE_SUCCESS != (rc = bind_upwards(jdata, node, hwb, clvl))) { ORTE_ERROR_LOG(rc); return rc; } } } } return ORTE_SUCCESS; }
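A minimal sketch of the depth-resolution step above, under the hwloc 1.x API where every cache shares the single HWLOC_OBJ_CACHE type: a cache level must be resolved with hwloc_get_cache_type_depth() rather than hwloc_get_type_depth().

static int resolve_bind_depth(hwloc_topology_t topo, hwloc_obj_type_t type, unsigned cache_level)
{
    int depth;
    if (HWLOC_OBJ_CACHE == type)
        /* (hwloc_obj_cache_type_t) -1 means "any cache type" at that level */
        depth = hwloc_get_cache_type_depth(topo, cache_level, (hwloc_obj_cache_type_t) -1);
    else
        depth = hwloc_get_type_depth(topo, type);
    return depth;  /* may still be HWLOC_TYPE_DEPTH_UNKNOWN or _MULTIPLE */
}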
int pocl_topology_detect_device_info(cl_device_id device) { hwloc_topology_t pocl_topology; int ret = 0; #ifdef HWLOC_API_2 if (hwloc_get_api_version () < 0x20000) POCL_MSG_ERR ("pocl was compiled against libhwloc 2.x but is" "actually running against libhwloc 1.x \n"); #else if (hwloc_get_api_version () >= 0x20000) POCL_MSG_ERR ("pocl was compiled against libhwloc 1.x but is" "actually running against libhwloc 2.x \n"); #endif /* * hwloc's OpenCL backend causes problems at the initialization stage * because it reloads libpocl.so via the ICD loader. * * See: https://github.com/pocl/pocl/issues/261 * * The only trick to stop hwloc from initializing the OpenCL plugin * I could find is to point the plugin search path to a place where there * are no plugins to be found. */ setenv ("HWLOC_PLUGINS_PATH", "/dev/null", 1); ret = hwloc_topology_init (&pocl_topology); if (ret == -1) { POCL_MSG_ERR ("Cannot initialize the topology.\n"); return ret; } #ifdef HWLOC_API_2 hwloc_topology_set_io_types_filter(pocl_topology, HWLOC_TYPE_FILTER_KEEP_NONE); hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_SYSTEM, HWLOC_TYPE_FILTER_KEEP_NONE); hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_GROUP, HWLOC_TYPE_FILTER_KEEP_NONE); hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_BRIDGE, HWLOC_TYPE_FILTER_KEEP_NONE); hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_MISC, HWLOC_TYPE_FILTER_KEEP_NONE); hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_NONE); hwloc_topology_set_type_filter (pocl_topology, HWLOC_OBJ_OS_DEVICE, HWLOC_TYPE_FILTER_KEEP_NONE); #else hwloc_topology_ignore_type (pocl_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_IO); hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_SYSTEM); hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_GROUP); hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_BRIDGE); hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_MISC); hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_PCI_DEVICE); hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_OS_DEVICE); #endif ret = hwloc_topology_load (pocl_topology); if (ret == -1) { POCL_MSG_ERR ("Cannot load the topology.\n"); goto exit_destroy; } #ifdef HWLOC_API_2 device->global_mem_size = hwloc_get_root_obj(pocl_topology)->total_memory; #else device->global_mem_size = hwloc_get_root_obj(pocl_topology)->memory.total_memory; #endif // Try to get the number of CPU cores from topology int depth = hwloc_get_type_depth(pocl_topology, HWLOC_OBJ_PU); if(depth != HWLOC_TYPE_DEPTH_UNKNOWN) device->max_compute_units = hwloc_get_nbobjs_by_depth(pocl_topology, depth); /* Find information about global memory cache by looking at the first * cache covering the first PU */ do { size_t cache_size = 0, cacheline_size = 0; hwloc_obj_t core = hwloc_get_next_obj_by_type (pocl_topology, HWLOC_OBJ_CORE, NULL); if (core) { hwloc_obj_t cache = hwloc_get_shared_cache_covering_obj (pocl_topology, core); if ((cache) && (cache->attr)) { cacheline_size = cache->attr->cache.linesize; cache_size = cache->attr->cache.size; } else core = NULL; /* fallback to L1 cache size */ } hwloc_obj_t pu = hwloc_get_next_obj_by_type (pocl_topology, HWLOC_OBJ_PU, NULL); if (!core && pu) { hwloc_obj_t cache = hwloc_get_shared_cache_covering_obj (pocl_topology, pu); if ((cache) && (cache->attr)) { cacheline_size = cache->attr->cache.linesize; cache_size = cache->attr->cache.size; } } if (!cache_size || !cacheline_size) break; device->global_mem_cache_type = 0x2; // CL_READ_WRITE_CACHE, without including 
all of CL/cl.h device->global_mem_cacheline_size = cacheline_size; device->global_mem_cache_size = cache_size; } while (0); // Destroy topology object and return exit_destroy: hwloc_topology_destroy (pocl_topology); return ret; }
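A minimal standalone sketch of the version guard used at the top of the function above: HWLOC_API_VERSION is the version the code was compiled against, hwloc_get_api_version() is the one it runs against, and mixing hwloc 1.x and 2.x across that boundary is not ABI-safe.

#include <hwloc.h>
#include <stdio.h>

static int check_hwloc_abi(void)
{
#if HWLOC_API_VERSION >= 0x00020000
    if (hwloc_get_api_version() < 0x20000) {
        fprintf(stderr, "compiled against hwloc 2.x but running against 1.x\n");
        return -1;
    }
#else
    if (hwloc_get_api_version() >= 0x20000) {
        fprintf(stderr, "compiled against hwloc 1.x but running against 2.x\n");
        return -1;
    }
#endif
    return 0;
}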
/* user to have to play with the cgroup hierarchy to modify it */ extern int task_cgroup_cpuset_set_task_affinity(slurmd_job_t *job) { int fstatus = SLURM_ERROR; #ifndef HAVE_HWLOC error("task/cgroup: plugin not compiled with hwloc support, " "skipping affinity."); return fstatus; #else uint32_t i; uint32_t nldoms; uint32_t nsockets; uint32_t ncores; uint32_t npus; uint32_t nobj; uint32_t pfirst,plast; uint32_t taskid = job->envtp->localid; uint32_t jntasks = job->node_tasks; uint32_t jnpus = jntasks * job->cpus_per_task; pid_t pid = job->envtp->task_pid; cpu_bind_type_t bind_type; int verbose; hwloc_topology_t topology; #if HWLOC_API_VERSION <= 0x00010000 hwloc_cpuset_t cpuset,ct; #else hwloc_bitmap_t cpuset,ct; #endif hwloc_obj_t obj; struct hwloc_obj *pobj; hwloc_obj_type_t hwtype; hwloc_obj_type_t req_hwtype; int hwdepth; size_t tssize; cpu_set_t ts; bind_type = job->cpu_bind_type ; if (conf->task_plugin_param & CPU_BIND_VERBOSE || bind_type & CPU_BIND_VERBOSE) verbose = 1 ; if (bind_type & CPU_BIND_NONE) { if (verbose) info("task/cgroup: task[%u] is requesting no affinity", taskid); return 0; } else if (bind_type & CPU_BIND_TO_THREADS) { if (verbose) info("task/cgroup: task[%u] is requesting " "thread level binding",taskid); req_hwtype = HWLOC_OBJ_PU; } else if (bind_type & CPU_BIND_TO_CORES) { if (verbose) info("task/cgroup: task[%u] is requesting " "core level binding",taskid); req_hwtype = HWLOC_OBJ_CORE; } else if (bind_type & CPU_BIND_TO_SOCKETS) { if (verbose) info("task/cgroup: task[%u] is requesting " "socket level binding",taskid); req_hwtype = HWLOC_OBJ_SOCKET; } else if (bind_type & CPU_BIND_TO_LDOMS) { if (verbose) info("task/cgroup: task[%u] is requesting " "ldom level binding",taskid); req_hwtype = HWLOC_OBJ_NODE; } else { if (verbose) info("task/cgroup: task[%u] using core level binding" " by default",taskid); req_hwtype = HWLOC_OBJ_CORE; } /* Allocate and initialize hwloc objects */ hwloc_topology_init(&topology); #if HWLOC_API_VERSION <= 0x00010000 cpuset = hwloc_cpuset_alloc() ; #else cpuset = hwloc_bitmap_alloc() ; #endif /* * Perform the topology detection. It will only get allowed PUs. * Detect in the same time the granularity to use for binding. * The granularity can be relaxed from threads to cores if enough * cores are available as with hyperthread support, ntasks-per-core * param can let us have access to more threads per core for each * task * Revert back to machine granularity if no finer-grained granularity * matching the request is found. This will result in no affinity * applied. * The detected granularity will be used to find where to best place * the task, then the cpu_bind option will be used to relax the * affinity constraint and use more PUs. (i.e. use a core granularity * to dispatch the tasks across the sockets and then provide access * to each task to the cores of its socket.) 
*/ hwloc_topology_load(topology); npus = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_SOCKET); nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE); hwtype = HWLOC_OBJ_MACHINE; nobj = 1; if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) { hwtype = HWLOC_OBJ_PU; nobj = npus; } if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) { hwtype = HWLOC_OBJ_CORE; nobj = ncores; } if (nsockets >= jntasks && bind_type & CPU_BIND_TO_SOCKETS) { hwtype = HWLOC_OBJ_SOCKET; nobj = nsockets; } /* * HWLOC returns all the NUMA nodes available regardless of the * number of underlying sockets available (regardless of the allowed * resources). So there is no guarantee that each ldom will be populated * with usable sockets. So add a simple check that at least ensure that * we have as many sockets as ldoms before moving to ldoms granularity */ if (nldoms >= jntasks && nsockets >= nldoms && bind_type & CPU_BIND_TO_LDOMS) { hwtype = HWLOC_OBJ_NODE; nobj = nldoms; } /* * Perform a block binding on the detected object respecting the * granularity. * If not enough objects to do the job, revert to no affinity mode */ if (hwloc_compare_types(hwtype,HWLOC_OBJ_MACHINE) == 0) { info("task/cgroup: task[%u] disabling affinity because of %s " "granularity",taskid,hwloc_obj_type_string(hwtype)); } else if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0 && jnpus > nobj) { info("task/cgroup: task[%u] not enough %s objects, disabling " "affinity",taskid,hwloc_obj_type_string(hwtype)); } else { if (verbose) { info("task/cgroup: task[%u] using %s granularity", taskid,hwloc_obj_type_string(hwtype)); } if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) { /* cores or threads granularity */ pfirst = taskid * job->cpus_per_task ; plast = pfirst + job->cpus_per_task - 1; } else { /* sockets or ldoms granularity */ pfirst = taskid; plast = pfirst; } hwdepth = hwloc_get_type_depth(topology,hwtype); for (i = pfirst; i <= plast && i < nobj ; i++) { obj = hwloc_get_obj_by_depth(topology,hwdepth,(int)i); /* if requested binding overlap the granularity */ /* use the ancestor cpuset instead of the object one */ if (hwloc_compare_types(hwtype,req_hwtype) > 0) { /* Get the parent object of req_hwtype or the */ /* one just above if not found (meaning of >0)*/ /* (useful for ldoms binding with !NUMA nodes)*/ pobj = obj->parent; while (pobj != NULL && hwloc_compare_types(pobj->type, req_hwtype) > 0) pobj = pobj->parent; if (pobj != NULL) { if (verbose) info("task/cgroup: task[%u] " "higher level %s found", taskid, hwloc_obj_type_string( pobj->type)); #if HWLOC_API_VERSION <= 0x00010000 ct = hwloc_cpuset_dup(pobj-> allowed_cpuset); hwloc_cpuset_or(cpuset,cpuset,ct); hwloc_cpuset_free(ct); #else ct = hwloc_bitmap_dup(pobj-> allowed_cpuset); hwloc_bitmap_or(cpuset,cpuset,ct); hwloc_bitmap_free(ct); #endif } else { /* should not be executed */ if (verbose) info("task/cgroup: task[%u] " "no higher level found", taskid); #if HWLOC_API_VERSION <= 0x00010000 ct = hwloc_cpuset_dup(obj-> allowed_cpuset); hwloc_cpuset_or(cpuset,cpuset,ct); hwloc_cpuset_free(ct); #else ct = hwloc_bitmap_dup(obj-> allowed_cpuset); hwloc_bitmap_or(cpuset,cpuset,ct); hwloc_bitmap_free(ct); #endif } } else { #if HWLOC_API_VERSION <= 0x00010000 ct = hwloc_cpuset_dup(obj->allowed_cpuset); hwloc_cpuset_or(cpuset,cpuset,ct); hwloc_cpuset_free(ct); #else ct = hwloc_bitmap_dup(obj->allowed_cpuset); 
hwloc_bitmap_or(cpuset,cpuset,ct); hwloc_bitmap_free(ct); #endif } } char *str; #if HWLOC_API_VERSION <= 0x00010000 hwloc_cpuset_asprintf(&str,cpuset); #else hwloc_bitmap_asprintf(&str,cpuset); #endif tssize = sizeof(cpu_set_t); if (hwloc_cpuset_to_glibc_sched_affinity(topology,cpuset, &ts,tssize) == 0) { fstatus = SLURM_SUCCESS; if (sched_setaffinity(pid,tssize,&ts)) { error("task/cgroup: task[%u] unable to set " "taskset '%s'",taskid,str); fstatus = SLURM_ERROR; } else if (verbose) { info("task/cgroup: task[%u] taskset '%s' is set" ,taskid,str); } } else { error("task/cgroup: task[%u] unable to build " "taskset '%s'",taskid,str); fstatus = SLURM_ERROR; } free(str); } /* Destroy hwloc objects */ #if HWLOC_API_VERSION <= 0x00010000 hwloc_cpuset_free(cpuset); #else hwloc_bitmap_free(cpuset); #endif hwloc_topology_destroy(topology); return fstatus; #endif }
static int bind_in_place(orte_job_t *jdata, hwloc_obj_type_t target, unsigned cache_level) { /* traverse the hwloc topology tree on each node downwards * until we find an unused object of type target - and then bind * the process to that target */ int i, j; orte_job_map_t *map; orte_node_t *node; orte_proc_t *proc; hwloc_cpuset_t cpus; unsigned int idx, ncpus; struct hwloc_topology_support *support; opal_hwloc_obj_data_t *data; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bind in place for job %s with bindings %s", ORTE_JOBID_PRINT(jdata->jobid), opal_hwloc_base_print_binding(jdata->map->binding)); /* initialize */ map = jdata->map; for (i=0; i < map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { continue; } if (!orte_do_not_launch) { /* if we don't want to launch, then we are just testing the system, * so ignore questions about support capabilities */ support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology); /* check if topology supports cpubind - have to be careful here * as Linux doesn't currently support thread-level binding. This * may change in the future, though, and it isn't clear how hwloc * interprets the current behavior. So check both flags to be sure. */ if (!support->cpubind->set_thisproc_cpubind && !support->cpubind->set_thisthread_cpubind) { if (!OPAL_BINDING_REQUIRED(map->binding) || !OPAL_BINDING_POLICY_IS_SET(map->binding)) { /* we are not required to bind, so ignore this */ continue; } orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name); return ORTE_ERR_SILENT; } /* check if topology supports membind - have to be careful here * as hwloc treats this differently than I (at least) would have * expected. Per hwloc, Linux memory binding is at the thread, * and not process, level. Thus, hwloc sets the "thisproc" flag * to "false" on all Linux systems, and uses the "thisthread" flag * to indicate binding capability - don't warn if the user didn't * specifically request binding */ if (!support->membind->set_thisproc_membind && !support->membind->set_thisthread_membind && OPAL_BINDING_POLICY_IS_SET(map->binding)) { if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name); membind_warned = true; } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name); return ORTE_ERR_SILENT; } } } /* some systems do not report cores, and so we can get a situation where our * default binding policy will fail for no necessary reason. 
So if we are * computing a binding due to our default policy, and no cores are found * on this node, just silently skip it - we will not bind */ if (!OPAL_BINDING_POLICY_IS_SET(map->binding) && HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "Unable to bind-to core by default on node %s as no cores detected", node->name); continue; } /* we share topologies in order * to save space, so we need to reset the usage info to reflect * our own current state */ reset_usage(node, jdata->jobid); /* cycle thru the procs */ for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } /* ignore procs from other jobs */ if (proc->name.jobid != jdata->jobid) { continue; } /* ignore procs that have already been bound - should * never happen, but safer */ if (NULL != proc->cpu_bitmap) { continue; } /* get the index of this location */ if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, proc->locale, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_SILENT; } /* track the number bound */ data = (opal_hwloc_obj_data_t*)proc->locale->userdata; data->num_bound++; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "BINDING PROC %s TO %s NUMBER %u", ORTE_NAME_PRINT(&proc->name), hwloc_obj_type_string(proc->locale->type), idx); /* get the number of cpus under this location */ if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, proc->locale))) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name); return ORTE_ERR_SILENT; } /* error out if adding a proc would cause overload and that wasn't allowed, * and it wasn't a default binding policy (i.e., the user requested it) */ if (ncpus < data->num_bound && !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) && OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, opal_hwloc_base_print_binding(map->binding), node->name, data->num_bound, ncpus); return ORTE_ERR_SILENT; } /* bind the proc here */ cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale); hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus); /* record the location */ proc->bind_location = proc->locale; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s BOUND PROC %s TO %s[%s:%u] on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), proc->cpu_bitmap, hwloc_obj_type_string(proc->locale->type), idx, node->name); } } return ORTE_SUCCESS; }
static int bind_in_place(orte_job_t *jdata, hwloc_obj_type_t target, unsigned cache_level) { /* traverse the hwloc topology tree on each node downwards * until we find an unused object of type target - and then bind * the process to that target */ int i, j; orte_job_map_t *map; orte_node_t *node; orte_proc_t *proc; hwloc_cpuset_t cpus; unsigned int idx, ncpus; struct hwloc_topology_support *support; opal_hwloc_obj_data_t *data; hwloc_obj_t locale, sib; char *cpu_bitmap; bool found; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bind in place for job %s with bindings %s", ORTE_JOBID_PRINT(jdata->jobid), opal_hwloc_base_print_binding(jdata->map->binding)); /* initialize */ map = jdata->map; for (i=0; i < map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { continue; } if (!orte_do_not_launch) { /* if we don't want to launch, then we are just testing the system, * so ignore questions about support capabilities */ support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology); /* check if topology supports cpubind - have to be careful here * as Linux doesn't currently support thread-level binding. This * may change in the future, though, and it isn't clear how hwloc * interprets the current behavior. So check both flags to be sure. */ if (!support->cpubind->set_thisproc_cpubind && !support->cpubind->set_thisthread_cpubind) { if (!OPAL_BINDING_REQUIRED(map->binding) || !OPAL_BINDING_POLICY_IS_SET(map->binding)) { /* we are not required to bind, so ignore this */ continue; } orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name); return ORTE_ERR_SILENT; } /* check if topology supports membind - have to be careful here * as hwloc treats this differently than I (at least) would have * expected. Per hwloc, Linux memory binding is at the thread, * and not process, level. Thus, hwloc sets the "thisproc" flag * to "false" on all Linux systems, and uses the "thisthread" flag * to indicate binding capability - don't warn if the user didn't * specifically request binding */ if (!support->membind->set_thisproc_membind && !support->membind->set_thisthread_membind && OPAL_BINDING_POLICY_IS_SET(map->binding)) { if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name); membind_warned = true; } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name); return ORTE_ERR_SILENT; } } } /* some systems do not report cores, and so we can get a situation where our * default binding policy will fail for no necessary reason. 
So if we are * computing a binding due to our default policy, and no cores are found * on this node, just silently skip it - we will not bind */ if (!OPAL_BINDING_POLICY_IS_SET(map->binding) && HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "Unable to bind-to core by default on node %s as no cores detected", node->name); continue; } /* we share topologies in order * to save space, so we need to reset the usage info to reflect * our own current state */ reset_usage(node, jdata->jobid); /* cycle thru the procs */ for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } /* ignore procs from other jobs */ if (proc->name.jobid != jdata->jobid) { continue; } /* bozo check */ if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name)); return ORTE_ERR_SILENT; } /* get the index of this location */ if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, locale, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_SILENT; } data = (opal_hwloc_obj_data_t*)locale->userdata; /* get the number of cpus under this location */ if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, locale))) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name); return ORTE_ERR_SILENT; } /* if we don't have enough cpus to support this additional proc, try * shifting the location to a cousin that can support it - the important * thing is that we maintain the same level in the topology */ if (ncpus < (data->num_bound+1)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s bind_in_place: searching right", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); sib = locale; found = false; while (NULL != (sib = sib->next_cousin)) { data = (opal_hwloc_obj_data_t*)sib->userdata; ncpus = opal_hwloc_base_get_npus(node->topology, sib); if (data->num_bound < ncpus) { found = true; locale = sib; break; } } if (!found) { /* try the other direction */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s bind_in_place: searching left", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); sib = locale; while (NULL != (sib = sib->prev_cousin)) { data = (opal_hwloc_obj_data_t*)sib->userdata; ncpus = opal_hwloc_base_get_npus(node->topology, sib); if (data->num_bound < ncpus) { found = true; locale = sib; break; } } } if (!found) { /* no place to put this - see if overload is allowed */ if (!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) { if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { /* if the user specified a binding policy, then we cannot meet * it since overload isn't allowed, so error out - have the * message indicate that setting overload allowed will remove * this restriction */ orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, opal_hwloc_base_print_binding(map->binding), node->name, data->num_bound, ncpus); return ORTE_ERR_SILENT; } else { /* if we have the default binding policy, then just don't bind */ OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE); unbind_procs(jdata); return ORTE_SUCCESS; } } } } /* track the number bound */ data = (opal_hwloc_obj_data_t*)locale->userdata; // just in case it changed data->num_bound++; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "BINDING 
PROC %s TO %s NUMBER %u", ORTE_NAME_PRINT(&proc->name), hwloc_obj_type_string(locale->type), idx); /* bind the proc here */ cpus = opal_hwloc_base_get_available_cpus(node->topology, locale); hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus); orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING); /* update the location, in case it changed */ orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, locale, OPAL_PTR); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s BOUND PROC %s TO %s[%s:%u] on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), cpu_bitmap, hwloc_obj_type_string(locale->type), idx, node->name); if (NULL != cpu_bitmap) { free(cpu_bitmap); } } } return ORTE_SUCCESS; }
/* user to have to play with the cgroup hierarchy to modify it */ extern int task_cgroup_cpuset_set_task_affinity(slurmd_job_t *job) { int fstatus = SLURM_ERROR; #ifndef HAVE_HWLOC error("task/cgroup: plugin not compiled with hwloc support, " "skipping affinity."); return fstatus; #else hwloc_obj_type_t socket_or_node; uint32_t nldoms; uint32_t nsockets; uint32_t ncores; uint32_t npus; uint32_t nobj; uint32_t taskid = job->envtp->localid; uint32_t jntasks = job->node_tasks; uint32_t jnpus = jntasks * job->cpus_per_task; pid_t pid = job->envtp->task_pid; cpu_bind_type_t bind_type; int bind_verbose = 0; hwloc_topology_t topology; hwloc_bitmap_t cpuset; hwloc_obj_type_t hwtype; hwloc_obj_type_t req_hwtype; size_t tssize; cpu_set_t ts; bind_type = job->cpu_bind_type ; if (conf->task_plugin_param & CPU_BIND_VERBOSE || bind_type & CPU_BIND_VERBOSE) bind_verbose = 1 ; /* Allocate and initialize hwloc objects */ hwloc_topology_init(&topology); cpuset = hwloc_bitmap_alloc(); hwloc_topology_load(topology); if ( hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) > hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET) ) { /* One socket contains multiple NUMA-nodes * like AMD Opteron 6000 series etc. * In such case, use NUMA-node instead of socket. */ socket_or_node = HWLOC_OBJ_NODE; } else { socket_or_node = HWLOC_OBJ_SOCKET; } if (bind_type & CPU_BIND_NONE) { if (bind_verbose) info("task/cgroup: task[%u] is requesting no affinity", taskid); return 0; } else if (bind_type & CPU_BIND_TO_THREADS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "thread level binding",taskid); req_hwtype = HWLOC_OBJ_PU; } else if (bind_type & CPU_BIND_TO_CORES) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "core level binding",taskid); req_hwtype = HWLOC_OBJ_CORE; } else if (bind_type & CPU_BIND_TO_SOCKETS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "socket level binding",taskid); req_hwtype = socket_or_node; } else if (bind_type & CPU_BIND_TO_LDOMS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "ldom level binding",taskid); req_hwtype = HWLOC_OBJ_NODE; } else { if (bind_verbose) info("task/cgroup: task[%u] using core level binding" " by default",taskid); req_hwtype = HWLOC_OBJ_CORE; } /* * Perform the topology detection. It will only get allowed PUs. * Detect in the same time the granularity to use for binding. * The granularity can be relaxed from threads to cores if enough * cores are available as with hyperthread support, ntasks-per-core * param can let us have access to more threads per core for each * task * Revert back to machine granularity if no finer-grained granularity * matching the request is found. This will result in no affinity * applied. * The detected granularity will be used to find where to best place * the task, then the cpu_bind option will be used to relax the * affinity constraint and use more PUs. (i.e. use a core granularity * to dispatch the tasks across the sockets and then provide access * to each task to the cores of its socket.) 
*/ npus = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology, socket_or_node); nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE); hwtype = HWLOC_OBJ_MACHINE; nobj = 1; if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) { hwtype = HWLOC_OBJ_PU; nobj = npus; } if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) { hwtype = HWLOC_OBJ_CORE; nobj = ncores; } if (nsockets >= jntasks && bind_type & CPU_BIND_TO_SOCKETS) { hwtype = socket_or_node; nobj = nsockets; } /* * HWLOC returns all the NUMA nodes available regardless of the * number of underlying sockets available (regardless of the allowed * resources). So there is no guarantee that each ldom will be populated * with usable sockets. So add a simple check that at least ensure that * we have as many sockets as ldoms before moving to ldoms granularity */ if (nldoms >= jntasks && nsockets >= nldoms && bind_type & CPU_BIND_TO_LDOMS) { hwtype = HWLOC_OBJ_NODE; nobj = nldoms; } /* * Bind the detected object to the taskid, respecting the * granularity, using the designated or default distribution * method (block or cyclic). * If not enough objects to do the job, revert to no affinity mode */ if (hwloc_compare_types(hwtype,HWLOC_OBJ_MACHINE) == 0) { info("task/cgroup: task[%u] disabling affinity because of %s " "granularity",taskid,hwloc_obj_type_string(hwtype)); } else if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0 && jnpus > nobj) { info("task/cgroup: task[%u] not enough %s objects, disabling " "affinity",taskid,hwloc_obj_type_string(hwtype)); } else { char *str; if (bind_verbose) { info("task/cgroup: task[%u] using %s granularity", taskid,hwloc_obj_type_string(hwtype)); } /* There are two "distributions," controlled by the * -m option of srun and friends. The first is the * distribution of tasks to nodes. The second is the * distribution of allocated cpus to tasks for * binding. This code is handling the second * distribution. Here's how the values get set, based * on the value of -m * * SLURM_DIST_CYCLIC = srun -m cyclic * SLURM_DIST_BLOCK = srun -m block * SLURM_DIST_CYCLIC_CYCLIC = srun -m cyclic:cyclic * SLURM_DIST_BLOCK_CYCLIC = srun -m block:cyclic * * In the first two cases, the user only specified the * first distribution. The second distribution * defaults to cyclic. In the second two cases, the * user explicitly requested a second distribution of * cyclic. So all these four cases correspond to a * second distribution of cyclic. So we want to call * _task_cgroup_cpuset_dist_cyclic. * * If the user explicitly specifies a second * distribution of block, or if * CR_CORE_DEFAULT_DIST_BLOCK is configured and the * user does not explicitly specify a second * distribution of cyclic, the second distribution is * block, and we need to call * _task_cgroup_cpuset_dist_block. In these cases, * task_dist would be set to SLURM_DIST_CYCLIC_BLOCK * or SLURM_DIST_BLOCK_BLOCK. * * You can see the equivalent code for the * task/affinity plugin in * src/plugins/task/affinity/dist_tasks.c, around line 384. 
*/ switch (job->task_dist) { case SLURM_DIST_CYCLIC: case SLURM_DIST_BLOCK: case SLURM_DIST_CYCLIC_CYCLIC: case SLURM_DIST_BLOCK_CYCLIC: _task_cgroup_cpuset_dist_cyclic( topology, hwtype, req_hwtype, job, bind_verbose, cpuset); break; default: _task_cgroup_cpuset_dist_block( topology, hwtype, req_hwtype, nobj, job, bind_verbose, cpuset); } hwloc_bitmap_asprintf(&str, cpuset); tssize = sizeof(cpu_set_t); if (hwloc_cpuset_to_glibc_sched_affinity(topology,cpuset, &ts,tssize) == 0) { fstatus = SLURM_SUCCESS; if (sched_setaffinity(pid,tssize,&ts)) { error("task/cgroup: task[%u] unable to set " "taskset '%s'",taskid,str); fstatus = SLURM_ERROR; } else if (bind_verbose) { info("task/cgroup: task[%u] taskset '%s' is set" ,taskid,str); } } else { error("task/cgroup: task[%u] unable to build " "taskset '%s'",taskid,str); fstatus = SLURM_ERROR; } free(str); } /* Destroy hwloc objects */ hwloc_bitmap_free(cpuset); hwloc_topology_destroy(topology); return fstatus; #endif }
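/* For reference: the conversion used at the end of this function (hwloc bitmap
 * -> glibc cpu_set_t -> sched_setaffinity) can be exercised outside of the
 * plugin.  A minimal, self-contained sketch (assumptions: standalone program,
 * hwloc built with the glibc-sched helper header) that binds the calling
 * process to the first core of the machine: */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <hwloc.h>
#include <hwloc/glibc-sched.h>

int main(void)
{
    hwloc_topology_t topology;
    hwloc_obj_t core;
    cpu_set_t ts;
    char *str;

    hwloc_topology_init(&topology);
    hwloc_topology_load(topology);

    core = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 0);
    if (core) {
        hwloc_bitmap_asprintf(&str, core->cpuset);
        /* translate the hwloc cpuset into a glibc cpu_set_t */
        if (hwloc_cpuset_to_glibc_sched_affinity(topology, core->cpuset,
                                                 &ts, sizeof(ts)) == 0) {
            /* pid 0 means "the calling process" */
            if (sched_setaffinity(0, sizeof(ts), &ts))
                perror("sched_setaffinity");
            else
                printf("bound to core 0, cpuset %s\n", str);
        }
        free(str);
    }
    hwloc_topology_destroy(topology);
    return 0;
}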
static int _task_cgroup_cpuset_dist_block( hwloc_topology_t topology, hwloc_obj_type_t hwtype, hwloc_obj_type_t req_hwtype, uint32_t nobj, stepd_step_rec_t *job, int bind_verbose, hwloc_bitmap_t cpuset) { hwloc_obj_t obj; uint32_t core_loop, ntskip, npdist; uint32_t i, j, pfirst, plast; uint32_t taskid = job->envtp->localid; int hwdepth; uint32_t npus, ncores, nsockets; int spec_thread_cnt = 0; bitstr_t *spec_threads = NULL; uint32_t *thread_idx; uint32_t core_idx; bool core_fcyclic, core_block; nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_SOCKET); ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); npus = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); core_block = (job->task_dist & SLURM_DIST_COREMASK) == SLURM_DIST_COREBLOCK ? true : false; core_fcyclic = (job->task_dist & SLURM_DIST_COREMASK) == SLURM_DIST_CORECFULL ? true : false; if (bind_verbose) { info("task/cgroup: task[%u] using block distribution, " "task_dist 0x%x", taskid, job->task_dist); } if ((hwloc_compare_types(hwtype, HWLOC_OBJ_PU) == 0) && !core_block) { thread_idx = xmalloc(ncores * sizeof(uint32_t)); ntskip = taskid; npdist = job->cpus_per_task; i = 0; j = 0; core_idx = 0; core_loop = 0; while (i < ntskip + 1 && core_loop < npdist + 1) { while ((core_idx < ncores) && (j < npdist)) { obj = hwloc_get_obj_below_by_type( topology, HWLOC_OBJ_CORE, core_idx, hwtype, thread_idx[core_idx]); if (obj != NULL) { thread_idx[core_idx]++; j++; if (i == ntskip) _add_hwloc_cpuset(hwtype, req_hwtype, obj, taskid, bind_verbose, cpuset); if ((j < npdist) && core_fcyclic) core_idx++; } else { core_idx++; } } if (j == npdist) { i++; j = 0; core_idx++; // no validity check, handled by the while core_loop = 0; } else { core_loop++; core_idx = 0; } } xfree(thread_idx); /* should never happen in normal scenario */ if (core_loop > npdist) { error("task/cgroup: task[%u] infinite loop broken while " "trying to provision compute elements using %s", taskid, format_task_dist_states(job->task_dist)); return XCGROUP_ERROR; } else return XCGROUP_SUCCESS; } if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) { /* cores or threads granularity */ pfirst = taskid * job->cpus_per_task; plast = pfirst + job->cpus_per_task - 1; } else { /* sockets or ldoms granularity */ pfirst = taskid; plast = pfirst; } hwdepth = hwloc_get_type_depth(topology, hwtype); if ((job->job_core_spec != (uint16_t) NO_VAL) && (job->job_core_spec & CORE_SPEC_THREAD) && (job->job_core_spec != CORE_SPEC_THREAD) && (nsockets != 0)) { /* Skip specialized threads as needed */ int i, t, c, s; int cores = MAX(1, (ncores / nsockets)); int threads = npus / cores; spec_thread_cnt = job->job_core_spec & (~CORE_SPEC_THREAD); spec_threads = bit_alloc(npus); for (t = threads - 1; ((t >= 0) && (spec_thread_cnt > 0)); t--) { for (c = cores - 1; ((c >= 0) && (spec_thread_cnt > 0)); c--) { for (s = nsockets - 1; ((s >= 0) && (spec_thread_cnt > 0)); s--) { i = s * cores + c; i = (i * threads) + t; bit_set(spec_threads, i); spec_thread_cnt--; } } } if (hwtype == HWLOC_OBJ_PU) { for (i = 0; i <= pfirst && i < npus; i++) { if (bit_test(spec_threads, i)) pfirst++; }; } } for (i = pfirst; i <= plast && i < nobj; i++) { obj = hwloc_get_obj_by_depth(topology, hwdepth, (int)i); _add_hwloc_cpuset(hwtype, req_hwtype, obj, taskid, bind_verbose, cpuset); } if (spec_threads) { for (i = 0; i < npus; i++) { if (bit_test(spec_threads, i)) { hwloc_bitmap_clr(cpuset, i); } }; FREE_NULL_BITMAP(spec_threads); }
return XCGROUP_SUCCESS; }
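/* Illustrative arithmetic only (hypothetical helper, not Slurm code): at
 * core/thread granularity the block distribution above gives task i the
 * object range [i * cpus_per_task, i * cpus_per_task + cpus_per_task - 1],
 * clipped to the number of available objects.  A sketch that prints the
 * range each task would receive: */
#include <stdio.h>

static void show_block_dist(unsigned ntasks, unsigned cpus_per_task,
                            unsigned nobj)
{
    unsigned taskid;

    if (nobj == 0)
        return;
    for (taskid = 0; taskid < ntasks; taskid++) {
        unsigned pfirst = taskid * cpus_per_task;
        unsigned plast = pfirst + cpus_per_task - 1;
        if (pfirst >= nobj) {           /* original loop would not execute */
            printf("task[%u]: no objects\n", taskid);
            continue;
        }
        if (plast >= nobj)              /* mirrors the "i < nobj" loop guard */
            plast = nobj - 1;
        printf("task[%u]: objects %u-%u\n", taskid, pfirst, plast);
    }
}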
int main(void) { int depth; unsigned i, n; unsigned long size; int levels; char string[128]; int topodepth; void *m; hwloc_topology_t topology; hwloc_cpuset_t cpuset; hwloc_obj_t obj; /* Allocate and initialize topology object. */ hwloc_topology_init(&topology); /* ... Optionally, put detection configuration here to ignore some objects types, define a synthetic topology, etc.... The default is to detect all the objects of the machine that the caller is allowed to access. See Configure Topology Detection. */ /* Perform the topology detection. */ hwloc_topology_load(topology); /* Optionally, get some additional topology information in case we need the topology depth later. */ topodepth = hwloc_topology_get_depth(topology); /***************************************************************** * First example: * Walk the topology with an array style, from level 0 (always * the system level) to the lowest level (always the proc level). *****************************************************************/ for (depth = 0; depth < topodepth; depth++) { printf("*** Objects at level %d\n", depth); for (i = 0; i < hwloc_get_nbobjs_by_depth(topology, depth); i++) { hwloc_obj_type_snprintf(string, sizeof(string), hwloc_get_obj_by_depth(topology, depth, i), 0); printf("Index %u: %s\n", i, string); } } /***************************************************************** * Second example: * Walk the topology with a tree style. *****************************************************************/ printf("*** Printing overall tree\n"); print_children(topology, hwloc_get_root_obj(topology), 0); /***************************************************************** * Third example: * Print the number of packages. *****************************************************************/ depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PACKAGE); if (depth == HWLOC_TYPE_DEPTH_UNKNOWN) { printf("*** The number of packages is unknown\n"); } else { printf("*** %u package(s)\n", hwloc_get_nbobjs_by_depth(topology, depth)); } /***************************************************************** * Fourth example: * Compute the amount of cache that the first logical processor * has above it. *****************************************************************/ levels = 0; size = 0; for (obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0); obj; obj = obj->parent) if (obj->type == HWLOC_OBJ_CACHE) { levels++; size += obj->attr->cache.size; } printf("*** Logical processor 0 has %d caches totaling %luKB\n", levels, size / 1024); /***************************************************************** * Fifth example: * Bind to only one thread of the last core of the machine. * * First find out where cores are, or else smaller sets of CPUs if * the OS doesn't have the notion of a "core". *****************************************************************/ depth = hwloc_get_type_or_below_depth(topology, HWLOC_OBJ_CORE); /* Get last core. */ obj = hwloc_get_obj_by_depth(topology, depth, hwloc_get_nbobjs_by_depth(topology, depth) - 1); if (obj) { /* Get a copy of its cpuset that we may modify. */ cpuset = hwloc_bitmap_dup(obj->cpuset); /* Get only one logical processor (in case the core is SMT/hyper-threaded). */ hwloc_bitmap_singlify(cpuset); /* And try to bind ourself there. 
*/ if (hwloc_set_cpubind(topology, cpuset, 0)) { char *str; int error = errno; hwloc_bitmap_asprintf(&str, obj->cpuset); printf("Couldn't bind to cpuset %s: %s\n", str, strerror(error)); free(str); } /* Free our cpuset copy */ hwloc_bitmap_free(cpuset); } /***************************************************************** * Sixth example: * Allocate some memory on the last NUMA node, bind some existing * memory to the last NUMA node. *****************************************************************/ /* Get last node. There's always at least one. */ n = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NUMANODE); obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, n - 1); size = 1024*1024; m = hwloc_alloc_membind_nodeset(topology, size, obj->nodeset, HWLOC_MEMBIND_BIND, 0); hwloc_free(topology, m, size); m = malloc(size); hwloc_set_area_membind_nodeset(topology, m, size, obj->nodeset, HWLOC_MEMBIND_BIND, 0); free(m); /* Destroy topology object. */ hwloc_topology_destroy(topology); return 0; }
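/* A small follow-up sketch (standalone program, same hwloc API as the example
 * above): read the current CPU binding back with hwloc_get_cpubind() and print
 * it, which is a convenient way to verify that hwloc_set_cpubind() took
 * effect. */
#include <stdio.h>
#include <stdlib.h>
#include <hwloc.h>

int main(void)
{
    hwloc_topology_t topology;
    hwloc_bitmap_t set;
    char *str;

    hwloc_topology_init(&topology);
    hwloc_topology_load(topology);

    set = hwloc_bitmap_alloc();
    if (hwloc_get_cpubind(topology, set, 0) == 0) {
        hwloc_bitmap_asprintf(&str, set);
        printf("currently bound to cpuset %s\n", str);
        free(str);
    }
    hwloc_bitmap_free(set);
    hwloc_topology_destroy(topology);
    return 0;
}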
/* user to have to play with the cgroup hierarchy to modify it */ extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job) { int fstatus = SLURM_ERROR; #ifndef HAVE_HWLOC error("task/cgroup: plugin not compiled with hwloc support, " "skipping affinity."); return fstatus; #else char mstr[1 + CPU_SETSIZE / 4]; cpu_bind_type_t bind_type; cpu_set_t ts; hwloc_obj_t obj; hwloc_obj_type_t socket_or_node; hwloc_topology_t topology; hwloc_bitmap_t cpuset; hwloc_obj_type_t hwtype; hwloc_obj_type_t req_hwtype; int bind_verbose = 0; int rc = SLURM_SUCCESS, match; pid_t pid = job->envtp->task_pid; size_t tssize; uint32_t nldoms; uint32_t nsockets; uint32_t ncores; uint32_t npus; uint32_t nobj; uint32_t taskid = job->envtp->localid; uint32_t jntasks = job->node_tasks; uint32_t jnpus; /* Allocate and initialize hwloc objects */ hwloc_topology_init(&topology); hwloc_topology_load(topology); cpuset = hwloc_bitmap_alloc(); int spec_threads = 0; if (job->batch) { jnpus = job->cpus; job->cpus_per_task = job->cpus; } else jnpus = jntasks * job->cpus_per_task; bind_type = job->cpu_bind_type; if ((conf->task_plugin_param & CPU_BIND_VERBOSE) || (bind_type & CPU_BIND_VERBOSE)) bind_verbose = 1 ; if ( hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) > hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET) ) { /* One socket contains multiple NUMA-nodes * like AMD Opteron 6000 series etc. * In such case, use NUMA-node instead of socket. */ socket_or_node = HWLOC_OBJ_NODE; } else { socket_or_node = HWLOC_OBJ_SOCKET; } if (bind_type & CPU_BIND_NONE) { if (bind_verbose) info("task/cgroup: task[%u] is requesting no affinity", taskid); return 0; } else if (bind_type & CPU_BIND_TO_THREADS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "thread level binding",taskid); req_hwtype = HWLOC_OBJ_PU; } else if (bind_type & CPU_BIND_TO_CORES) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "core level binding",taskid); req_hwtype = HWLOC_OBJ_CORE; } else if (bind_type & CPU_BIND_TO_SOCKETS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "socket level binding",taskid); req_hwtype = socket_or_node; } else if (bind_type & CPU_BIND_TO_LDOMS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "ldom level binding",taskid); req_hwtype = HWLOC_OBJ_NODE; } else if (bind_type & CPU_BIND_TO_BOARDS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "board level binding",taskid); req_hwtype = HWLOC_OBJ_GROUP; } else if (bind_type & bind_mode_ldom) { req_hwtype = HWLOC_OBJ_NODE; } else { if (bind_verbose) info("task/cgroup: task[%u] using core level binding" " by default",taskid); req_hwtype = HWLOC_OBJ_CORE; } /* * Perform the topology detection. It will only get allowed PUs. * Detect in the same time the granularity to use for binding. * The granularity can be relaxed from threads to cores if enough * cores are available as with hyperthread support, ntasks-per-core * param can let us have access to more threads per core for each * task * Revert back to machine granularity if no finer-grained granularity * matching the request is found. This will result in no affinity * applied. * The detected granularity will be used to find where to best place * the task, then the cpu_bind option will be used to relax the * affinity constraint and use more PUs. (i.e. use a core granularity * to dispatch the tasks across the sockets and then provide access * to each task to the cores of its socket.) 
*/ npus = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology, socket_or_node); nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE); //info("PU:%d CORE:%d SOCK:%d LDOM:%d", npus, ncores, nsockets, nldoms); hwtype = HWLOC_OBJ_MACHINE; nobj = 1; if ((job->job_core_spec != (uint16_t) NO_VAL) && (job->job_core_spec & CORE_SPEC_THREAD) && (job->job_core_spec != CORE_SPEC_THREAD)) { spec_threads = job->job_core_spec & (~CORE_SPEC_THREAD); } if (npus >= (jnpus + spec_threads) || bind_type & CPU_BIND_TO_THREADS) { hwtype = HWLOC_OBJ_PU; nobj = npus; } if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) { hwtype = HWLOC_OBJ_CORE; nobj = ncores; } if (nsockets >= jntasks && bind_type & CPU_BIND_TO_SOCKETS) { hwtype = socket_or_node; nobj = nsockets; } /* * HWLOC returns all the NUMA nodes available regardless of the * number of underlying sockets available (regardless of the allowed * resources). So there is no guarantee that each ldom will be populated * with usable sockets. So add a simple check that at least ensure that * we have as many sockets as ldoms before moving to ldoms granularity */ if (nldoms >= jntasks && nsockets >= nldoms && bind_type & (CPU_BIND_TO_LDOMS | bind_mode_ldom)) { hwtype = HWLOC_OBJ_NODE; nobj = nldoms; } /* * If not enough objects to do the job, revert to no affinity mode */ if (hwloc_compare_types(hwtype, HWLOC_OBJ_MACHINE) == 0) { info("task/cgroup: task[%u] disabling affinity because of %s " "granularity",taskid, hwloc_obj_type_string(hwtype)); } else if ((hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) && (nobj < jnpus)) { info("task/cgroup: task[%u] not enough %s objects (%d < %d), " "disabling affinity", taskid, hwloc_obj_type_string(hwtype), nobj, jnpus); } else if (bind_type & bind_mode) { /* Explicit binding mode specified by the user * Bind the taskid in accordance with the specified mode */ obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_MACHINE, 0); match = hwloc_bitmap_isequal(obj->complete_cpuset, obj->allowed_cpuset); if ((job->job_core_spec == (uint16_t) NO_VAL) && !match) { info("task/cgroup: entire node must be allocated, " "disabling affinity, task[%u]", taskid); fprintf(stderr, "Requested cpu_bind option requires " "entire node to be allocated; disabling " "affinity\n"); } else { if (bind_verbose) { info("task/cgroup: task[%u] is requesting " "explicit binding mode", taskid); } _get_sched_cpuset(topology, hwtype, req_hwtype, &ts, job); tssize = sizeof(cpu_set_t); fstatus = SLURM_SUCCESS; if (job->job_core_spec != (uint16_t) NO_VAL) _validate_mask(taskid, obj, &ts); if ((rc = sched_setaffinity(pid, tssize, &ts))) { error("task/cgroup: task[%u] unable to set " "mask 0x%s", taskid, cpuset_to_str(&ts, mstr)); error("sched_setaffinity rc = %d", rc); fstatus = SLURM_ERROR; } else if (bind_verbose) { info("task/cgroup: task[%u] mask 0x%s", taskid, cpuset_to_str(&ts, mstr)); } _slurm_chkaffinity(&ts, job, rc); } } else { /* Bind the detected object to the taskid, respecting the * granularity, using the designated or default distribution * method (block or cyclic). */ char *str; if (bind_verbose) { info("task/cgroup: task[%u] using %s granularity dist %u", taskid, hwloc_obj_type_string(hwtype), job->task_dist); } /* See srun man page for detailed information on --distribution * option. 
* * You can see the equivalent code for the * task/affinity plugin in * src/plugins/task/affinity/dist_tasks.c, around line 368 */ switch (job->task_dist & SLURM_DIST_NODESOCKMASK) { case SLURM_DIST_BLOCK_BLOCK: case SLURM_DIST_CYCLIC_BLOCK: case SLURM_DIST_PLANE: /* tasks are distributed in blocks within a plane */ _task_cgroup_cpuset_dist_block(topology, hwtype, req_hwtype, nobj, job, bind_verbose, cpuset); break; case SLURM_DIST_ARBITRARY: case SLURM_DIST_BLOCK: case SLURM_DIST_CYCLIC: case SLURM_DIST_UNKNOWN: if (slurm_get_select_type_param() & CR_CORE_DEFAULT_DIST_BLOCK) { _task_cgroup_cpuset_dist_block(topology, hwtype, req_hwtype, nobj, job, bind_verbose, cpuset); break; } /* We want to fall through here if we aren't doing a default dist block. */ default: _task_cgroup_cpuset_dist_cyclic(topology, hwtype, req_hwtype, job, bind_verbose, cpuset); break; } hwloc_bitmap_asprintf(&str, cpuset); tssize = sizeof(cpu_set_t); if (hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset, &ts, tssize) == 0) { fstatus = SLURM_SUCCESS; if ((rc = sched_setaffinity(pid, tssize, &ts))) { error("task/cgroup: task[%u] unable to set " "taskset '%s'", taskid, str); fstatus = SLURM_ERROR; } else if (bind_verbose) { info("task/cgroup: task[%u] set taskset '%s'", taskid, str); } _slurm_chkaffinity(&ts, job, rc); } else { error("task/cgroup: task[%u] unable to build " "taskset '%s'",taskid,str); fstatus = SLURM_ERROR; } free(str); } /* Destroy hwloc objects */ hwloc_bitmap_free(cpuset); hwloc_topology_destroy(topology); return fstatus; #endif }
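/* The explicit-binding branch above decides whether the whole node was
 * allocated by comparing the machine's complete cpuset with its allowed
 * cpuset.  A standalone sketch of that test (assumes hwloc 1.x, which provides
 * the per-object allowed_cpuset field used above): */
#include <stdio.h>
#include <hwloc.h>

int main(void)
{
    hwloc_topology_t topology;
    hwloc_obj_t machine;

    hwloc_topology_init(&topology);
    hwloc_topology_load(topology);

    machine = hwloc_get_obj_by_type(topology, HWLOC_OBJ_MACHINE, 0);
    /* complete_cpuset: every PU physically present;
     * allowed_cpuset: PUs this process may actually use
     * (e.g. after cgroup/cpuset restrictions). */
    if (hwloc_bitmap_isequal(machine->complete_cpuset,
                             machine->allowed_cpuset))
        printf("entire machine is available to this process\n");
    else
        printf("process is restricted to a subset of the machine\n");

    hwloc_topology_destroy(topology);
    return 0;
}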
//Initializes HWLOC and loads the machine architecture int hw_topology_init (struct arch_topology *topo) { hwloc_obj_t obj, core1, core2; int count, i, j, error; //Create the machine representation error = hwloc_topology_init(&topology); //Go through the topology only if HWLOC is //successfully initialized if(!error) { hwloc_topology_load(topology); local_topo = malloc(sizeof(struct arch_topology)); #if defined (__DBCSR_ACC) || defined (__PW_CUDA) int nDev; ma_get_ndevices_cu(&nDev); #endif //Extract number of NUMA nodes if (hwloc_get_type_depth (topology, HWLOC_OBJ_NODE)) topo->nnodes = hwloc_get_nbobjs_by_depth (topology, hwloc_get_type_depth (topology, HWLOC_OBJ_NODE)); else topo->nnodes = 0; //Get number of cores, sockets and processing units topo->ncores = hwloc_get_nbobjs_by_depth (topology, hwloc_get_type_depth (topology, HWLOC_OBJ_CORE)); topo->nsockets = hwloc_get_nbobjs_by_depth (topology, hwloc_get_type_depth (topology, HWLOC_OBJ_SOCKET)); topo->npus = hwloc_get_nbobjs_by_depth (topology, hwloc_get_type_depth (topology, HWLOC_OBJ_PU)); //Compute number of memory controllers per socket //basically the number of NUMA nodes per socket if (topo->nnodes > topo->nsockets) topo->nmemcontroller = topo->nnodes/topo->nsockets; else topo->nmemcontroller = 1; count = 0; topo->nshared_caches = 0; //Get derived information - get number of caches per PU for(obj = hwloc_get_obj_by_type(topology,HWLOC_OBJ_PU,0); obj; obj = obj->parent) { if (obj->type == HWLOC_OBJ_CACHE) { if (obj->arity>1) topo->nshared_caches++; else { count++; topo->ncaches = count; } } } //Number of direct siblings //Sibling cores are the ones that share at least one component //level of the architecture count = 0; core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 0); core2 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 1); obj = hwloc_get_common_ancestor_obj(topology, core1, core2); if (obj) topo->nsiblings = obj->arity; //Machine node and core representation machine_nodes = (struct node*) malloc (topo->nnodes*sizeof(struct node)); machine_cores = (struct core*) malloc (topo->ncores*sizeof(struct core)); phys_cpus = malloc (topo->ncores*sizeof(int)); get_phys_id(topology, topo->ncores, 0); //Get the cache sizes and other information for each core for (i = 0; i < topo->ncores ; i++) { machine_cores[i].caches = malloc (topo->ncaches*sizeof(size_t)); machine_cores[i].shared_caches = malloc (topo->ncaches*sizeof(int)); for (j = 0; j < topo->ncaches; j++) machine_cores[i].shared_caches[j] = 0; for (j = topo->ncaches ; j > topo->ncaches - topo->nshared_caches; j--) machine_cores[i].shared_caches[j-1] = 1; machine_cores[i].nsiblings = topo->nsiblings; machine_cores[i].siblings_id = malloc (topo->nsiblings*sizeof(unsigned)); if(topo->ncores == topo->npus){ core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, i); machine_cores[i].id = core1->os_index; count = 0; for(obj = hwloc_get_obj_by_type(topology,HWLOC_OBJ_PU,i); obj; obj = obj->parent) { if (obj->type == HWLOC_OBJ_CACHE){ machine_cores[i].caches[count] = obj->attr->cache.size / 1024; count++; } if (obj->type == HWLOC_OBJ_NODE) machine_cores[i].numaNode = obj->logical_index; } } else{ core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, i); machine_cores[i].id = core1->os_index; count = 0; for(obj = hwloc_get_obj_by_type(topology,HWLOC_OBJ_CORE,i); obj; obj = obj->parent) { if (obj->type == HWLOC_OBJ_CACHE) { machine_cores[i].caches[count] = obj->attr->cache.size / 1024; count++; } if (obj->type == HWLOC_OBJ_NODE) machine_cores[i].numaNode = obj->logical_index;
} } } //Get siblings id - so each core knows its siblings for (i = 0; i < topo->ncores ; i++) { if(topo->ncores == topo->npus){ core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, i); set_phys_siblings(i,machine_cores[i].id,core1,topo->ncores,topo->nsiblings,HWLOC_OBJ_PU); } else{ core1 = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, i); set_phys_siblings(i,machine_cores[i].id,core1,topo->ncores,topo->nsiblings,HWLOC_OBJ_CORE); } } int ncore_node = topo->ncores/topo->nnodes; int count_cores; //Get the information for each NUMAnode for (i = 0; i < topo->nnodes ; i++) { obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NODE, i); machine_nodes[i].id = obj->os_index; machine_nodes[i].memory = obj->memory.total_memory; machine_nodes[i].ncores = ncore_node; machine_nodes[i].mycores = malloc (ncore_node*sizeof(unsigned)); //Get the cores id of each NUMAnode count_cores = 0; set_node_cores(topology, obj, i, &count_cores); //GPU support #if defined (__DBCSR_ACC) || defined (__PW_CUDA) int *devIds; devIds = malloc (nDev*sizeof(int)); topo->ngpus = nDev; ma_get_cu(i,devIds); machine_nodes[i].mygpus = devIds; #endif } //counting network cards count = 0; hwloc_topology_t topo_net; error = hwloc_topology_init(&topo_net); hwloc_topology_set_flags(topo_net, HWLOC_TOPOLOGY_FLAG_IO_DEVICES); if (!error){ hwloc_topology_load(topo_net); for (obj = hwloc_get_obj_by_type(topo_net, HWLOC_OBJ_OS_DEVICE, 0); obj; obj = hwloc_get_next_osdev(topo_net,obj)) if (obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK || obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS) count++; topo->nnetcards = count; } else //if can not load I/O devices topo->nnetcards = 0; hwloc_topology_destroy(topo_net); /*Local copy of the machine topology components*/ local_topo->nnodes = topo->nnodes; local_topo->nsockets = topo->nsockets; local_topo->ncores = topo->ncores; local_topo->npus = topo->npus; local_topo->ngpus = topo->ngpus; local_topo->ncaches = topo->ncaches; local_topo->nshared_caches = topo->nshared_caches; local_topo->nsiblings = topo->nsiblings; local_topo->nmemcontroller = topo->nmemcontroller; local_topo->nnetcards = topo->nnetcards; } return error; }
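/* The network-card count above is taken from a second, I/O-enabled topology.
 * A minimal sketch of the same enumeration (assumes hwloc 1.x, where I/O
 * discovery is enabled with HWLOC_TOPOLOGY_FLAG_IO_DEVICES), written with the
 * more common hwloc_get_next_osdev() iteration idiom: */
#include <stdio.h>
#include <hwloc.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_obj_t obj = NULL;

    hwloc_topology_init(&topo);
    hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IO_DEVICES);
    hwloc_topology_load(topo);

    /* walk every OS device and keep the network/OpenFabrics ones */
    while ((obj = hwloc_get_next_osdev(topo, obj)) != NULL) {
        if (obj->attr->osdev.type == HWLOC_OBJ_OSDEV_NETWORK ||
            obj->attr->osdev.type == HWLOC_OBJ_OSDEV_OPENFABRICS)
            printf("network device: %s\n", obj->name);
    }

    hwloc_topology_destroy(topo);
    return 0;
}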
extern int get_cpuinfo(uint16_t *p_cpus, uint16_t *p_boards, uint16_t *p_sockets, uint16_t *p_cores, uint16_t *p_threads, uint16_t *p_block_map_size, uint16_t **p_block_map, uint16_t **p_block_map_inv) { enum { SOCKET=0, CORE=1, PU=2, LAST_OBJ=3 }; hwloc_topology_t topology; hwloc_obj_t obj; hwloc_obj_type_t objtype[LAST_OBJ]; unsigned idx[LAST_OBJ]; int nobj[LAST_OBJ]; int actual_cpus; int macid; int absid; int actual_boards = 1, depth; int i; debug2("hwloc_topology_init"); if (hwloc_topology_init(&topology)) { /* error in initialize hwloc library */ debug("hwloc_topology_init() failed."); return 1; } /* parse all system */ hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM); /* ignores cache, misc */ hwloc_topology_ignore_type (topology, HWLOC_OBJ_CACHE); hwloc_topology_ignore_type (topology, HWLOC_OBJ_MISC); /* load topology */ debug2("hwloc_topology_load"); if (hwloc_topology_load(topology)) { /* error in load hardware topology */ debug("hwloc_topology_load() failed."); hwloc_topology_destroy(topology); return 2; } /* Some processors (e.g. AMD Opteron 6000 series) contain multiple * NUMA nodes per socket. This is a configuration which does not map * into the hardware entities that Slurm optimizes resource allocation * for (PU/thread, core, socket, baseboard, node and network switch). * In order to optimize resource allocations on such hardware, Slurm * will consider each NUMA node within the socket as a separate socket. * You can disable this configuring "SchedulerParameters=Ignore_NUMA", * in which case Slurm will report the correct socket count on the node, * but not be able to optimize resource allocations on the NUMA nodes. */ objtype[SOCKET] = HWLOC_OBJ_SOCKET; objtype[CORE] = HWLOC_OBJ_CORE; objtype[PU] = HWLOC_OBJ_PU; if (hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) > hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET)) { char *sched_params = slurm_get_sched_params(); if (sched_params && strcasestr(sched_params, "Ignore_NUMA")) { info("Ignoring NUMA nodes within a socket"); } else { info("Considering each NUMA node as a socket"); objtype[SOCKET] = HWLOC_OBJ_NODE; } xfree(sched_params); } /* number of objects */ depth = hwloc_get_type_depth(topology, HWLOC_OBJ_GROUP); if (depth != HWLOC_TYPE_DEPTH_UNKNOWN) { actual_boards = MAX(hwloc_get_nbobjs_by_depth(topology, depth), 1); } nobj[SOCKET] = hwloc_get_nbobjs_by_type(topology, objtype[SOCKET]); nobj[CORE] = hwloc_get_nbobjs_by_type(topology, objtype[CORE]); /* * Workaround for hwloc * hwloc_get_nbobjs_by_type() returns 0 on some architectures. 
*/ if ( nobj[SOCKET] == 0 ) { debug("get_cpuinfo() fudging nobj[SOCKET] from 0 to 1"); nobj[SOCKET] = 1; } if ( nobj[CORE] == 0 ) { debug("get_cpuinfo() fudging nobj[CORE] from 0 to 1"); nobj[CORE] = 1; } if ( nobj[SOCKET] == -1 ) fatal("get_cpuinfo() can not handle nobj[SOCKET] = -1"); if ( nobj[CORE] == -1 ) fatal("get_cpuinfo() can not handle nobj[CORE] = -1"); actual_cpus = hwloc_get_nbobjs_by_type(topology, objtype[PU]); #if 0 /* Used to find workaround above */ info("CORE = %d SOCKET = %d actual_cpus = %d nobj[CORE] = %d", CORE, SOCKET, actual_cpus, nobj[CORE]); #endif nobj[PU] = actual_cpus/nobj[CORE]; /* threads per core */ nobj[CORE] /= nobj[SOCKET]; /* cores per socket */ debug("CPUs:%d Boards:%u Sockets:%d CoresPerSocket:%d ThreadsPerCore:%d", actual_cpus, actual_boards, nobj[SOCKET], nobj[CORE], nobj[PU]); /* allocate block_map */ *p_block_map_size = (uint16_t)actual_cpus; if (p_block_map && p_block_map_inv) { *p_block_map = xmalloc(actual_cpus * sizeof(uint16_t)); *p_block_map_inv = xmalloc(actual_cpus * sizeof(uint16_t)); /* initialize default as linear mapping */ for (i = 0; i < actual_cpus; i++) { (*p_block_map)[i] = i; (*p_block_map_inv)[i] = i; } /* create map with hwloc */ for (idx[SOCKET]=0; idx[SOCKET]<nobj[SOCKET]; ++idx[SOCKET]) { for (idx[CORE]=0; idx[CORE]<nobj[CORE]; ++idx[CORE]) { for (idx[PU]=0; idx[PU]<nobj[PU]; ++idx[PU]) { /* get hwloc_obj by indexes */ obj=hwloc_get_obj_below_array_by_type( topology, 3, objtype, idx); if (!obj) continue; macid = obj->os_index; absid = idx[SOCKET]*nobj[CORE]*nobj[PU] + idx[CORE]*nobj[PU] + idx[PU]; if ((macid >= actual_cpus) || (absid >= actual_cpus)) { /* physical or logical ID are * out of range */ continue; } debug4("CPU map[%d]=>%d", absid, macid); (*p_block_map)[absid] = macid; (*p_block_map_inv)[macid] = absid; } } } } hwloc_topology_destroy(topology); /* update output parameters */ *p_cpus = actual_cpus; *p_boards = actual_boards; *p_sockets = nobj[SOCKET]; *p_cores = nobj[CORE]; *p_threads = nobj[PU]; #if DEBUG_DETAIL /*** Display raw data ***/ debug("CPUs:%u Boards:%u Sockets:%u CoresPerSocket:%u ThreadsPerCore:%u", *p_cpus, *p_boards, *p_sockets, *p_cores, *p_threads); /* Display the mapping tables */ if (p_block_map && p_block_map_inv) { debug("------"); debug("Abstract -> Machine logical CPU ID block mapping:"); debug("AbstractId PhysicalId Inverse"); for (i = 0; i < *p_cpus; i++) { debug3(" %4d %4u %4u", i, (*p_block_map)[i], (*p_block_map_inv)[i]); } debug("------"); } #endif return 0; }
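/* The block-map loop above translates (socket, core, PU) logical indexes into
 * OS processor ids with hwloc_get_obj_below_array_by_type().  A standalone
 * sketch of that call for a single index tuple (not Slurm code): */
#include <stdio.h>
#include <hwloc.h>

int main(void)
{
    hwloc_topology_t topology;
    hwloc_obj_type_t types[3] = { HWLOC_OBJ_SOCKET, HWLOC_OBJ_CORE,
                                  HWLOC_OBJ_PU };
    unsigned idx[3] = { 0, 0, 0 };  /* first socket, first core, first PU */
    hwloc_obj_t obj;

    hwloc_topology_init(&topology);
    hwloc_topology_load(topology);

    obj = hwloc_get_obj_below_array_by_type(topology, 3, types, idx);
    if (obj)
        printf("socket 0 / core 0 / PU 0 has OS index %u\n", obj->os_index);

    hwloc_topology_destroy(topology);
    return 0;
}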