// Discover the machine topology, grouping processing units (PUs) into
// clusters by last-level cache (LLC) rather than by socket.
// Side effects: sets _nbClusters and allocates/fills _clusterMap.
void darts :: hwloc :: AbstractMachine :: discoverTopologyWithLLC(void) {
    unsigned nbSockets = hwloc_get_nbobjs_by_type(_topology,HWLOC_OBJ_SOCKET);
    hwloc_obj_t o = hwloc_get_obj_by_type(_topology,HWLOC_OBJ_SOCKET,0);
    hwloc_obj_t obj;
    // Walk straight down the first-child chain from the first socket until a
    // cache object is reached (the highest cache level below the socket,
    // i.e. the LLC). If no cache level exists, obj ends up NULL.
    for (obj = o->first_child; obj && obj->type != HWLOC_OBJ_CACHE; obj = obj->first_child)
        ;
    // Fallback when no cache level is found: one cluster per socket.
    _nbClusters = nbSockets;
    if (obj) {
        // Number of PUs sharing one LLC; clusters = total PUs / PUs-per-LLC.
        int n = hwloc_get_nbobjs_inside_cpuset_by_type(_topology,obj->cpuset,HWLOC_OBJ_PU);
        _nbClusters = _nbTotalUnits / n; // XXX assumes homogeneous distribution of PUs
    }
    _clusterMap = new Cluster[_nbClusters];
    // TODO Refactor this code and the next function's code into a single one
    // NOTE(review): if no cache object was found above, obj is NULL here and
    // this loop never executes, leaving _clusterMap entries default-constructed
    // even though _nbClusters was set to nbSockets — confirm callers tolerate that.
    for (o = obj; o; o = o->next_cousin) {
        int nUnits = hwloc_get_nbobjs_inside_cpuset_by_type(_topology,o->cpuset,HWLOC_OBJ_PU);
        Unit *units = new Unit[nUnits];
        for (int i = 0; i < nUnits; ++i) {
            // i-th PU inside this LLC's cpuset.
            hwloc_obj_t t = hwloc_get_obj_inside_cpuset_by_type(_topology,o->cpuset,HWLOC_OBJ_PU,i);
            Unit hwu(o->logical_index,t->logical_index,t->os_index);
            units[i] = hwu; // simple shallow copy
        }
        // Cluster id doubles as its slot index in _clusterMap.
        Cluster cluster(o->logical_index,o->logical_index,nUnits,units);
        _clusterMap[o->logical_index] = cluster; // simple shallow copy
    }
}
void qrm_hwloc_topo(int *nodes, int *topo) { int depth, ret; unsigned i, n, j, ncores, nnodes, cnode; int topodepth, numa; hwloc_topology_t topology; hwloc_cpuset_t cpuset; hwloc_obj_t obj, cobj; hwloc_obj_type_t otype; hwloc_topology_init(&topology); hwloc_topology_load(topology); /* get the number os cores and NUMA nodes */ ncores = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); printf("ncores: %d\n",ncores); nnodes = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE); if(nnodes == 0){ otype = HWLOC_OBJ_SOCKET; printf("grouping with sockets\n"); nnodes = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_SOCKET); } else { otype = HWLOC_OBJ_NODE; printf("grouping with NUMA nodes\n"); } /* get the handle for the first NUMA node */ obj = hwloc_get_obj_by_type(topology, otype, 0); /* get the number of cores in one NUMA node (supposedly the same for all nodes) */ cnode = hwloc_get_nbobjs_inside_cpuset_by_type(topology, obj->cpuset, HWLOC_OBJ_CORE); for(i=0; i<nnodes; i++){ /* get the handle for the first i-th node */ obj = hwloc_get_obj_by_type(topology, otype, i); /* get the number of cores in i-th NUMA node (supposedly the same for all nodes) */ cnode = hwloc_get_nbobjs_inside_cpuset_by_type(topology, obj->cpuset, HWLOC_OBJ_CORE); /* get the first core in this node */ cobj = hwloc_get_next_obj_inside_cpuset_by_type(topology, obj->cpuset, HWLOC_OBJ_CORE, NULL); topo[(i*cnode)] = cobj->logical_index; /* printf("%2d -- group: %2d",i,cobj->logical_index); */ for(j=1; j<cnode; j++){ cobj = hwloc_get_next_obj_inside_cpuset_by_type(topology, obj->cpuset, HWLOC_OBJ_CORE, cobj); topo[(i*cnode)+j] = cobj->logical_index; /* printf(" %2d",cobj->logical_index); */ } /* printf("\n"); */ } hwloc_topology_destroy(topology); return; }
void darts :: hwloc :: AbstractMachine :: discoverTopology(void) { _nbClusters = hwloc_get_nbobjs_by_type(_topology,HWLOC_OBJ_SOCKET); _clusterMap = new Cluster[_nbClusters]; hwloc_obj_t o = hwloc_get_obj_by_type(_topology,HWLOC_OBJ_SOCKET,0); // TODO Refactor this code and the previous function's code into a single one for (; o; o = o->next_cousin) { int nUnits = hwloc_get_nbobjs_inside_cpuset_by_type(_topology,o->cpuset,HWLOC_OBJ_PU); Unit *units = new Unit[nUnits]; for (int i = 0; i < nUnits; ++i) { hwloc_obj_t t = hwloc_get_obj_inside_cpuset_by_type(_topology,o->cpuset,HWLOC_OBJ_PU,i); Unit hwu(o->logical_index,t->logical_index,t->os_index); units[i] = hwu; // simple shallow copy } Cluster cluster(o->logical_index,o->logical_index,nUnits,units); _clusterMap[o->logical_index] = cluster; // simple shallow copy } }
/* This function initializes the bind_map data structure with the binding information from all
 * the ranks that have copied their cpu-affinity information into the shared memory region.
 *
 * num_ranks     - number of ranks whose cpu_set_t entries sit in shared_region
 * topo_depth    - not referenced in this body; presumably the number of levels
 *                 each bind_map[i] row holds - TODO confirm against the caller
 * shared_region - array of num_ranks glibc cpu_set_t structures
 * bind_map      - out: bind_map[i][level] = logical index of rank i's binding
 *                 at that topology level, or -1 when bound to several PUs there
 * */
void MPIDI_SHM_hwloc_init_bindmap(int num_ranks, int topo_depth, int *shared_region, int **bind_map)
{
    int i, level;
    unsigned int num_obj, curr_obj;
    hwloc_cpuset_t hwloc_cpuset = hwloc_bitmap_alloc();

    /* STEP 3.1. Collect the binding information from hwloc for all ranks */
    for (i = 0; i < num_ranks; ++i) {
        /* rank i's affinity mask, copied out of the shared memory region */
        cpu_set_t t = ((cpu_set_t *) (shared_region))[i];
        /* convert the glibc mask into an hwloc cpuset */
        hwloc_cpuset_from_glibc_sched_affinity(MPIR_Process.hwloc_topology, hwloc_cpuset,
                                               &t, sizeof(t));
        /* HWLOC_OBJ_PU is the smallest unit of computation. We would like to get all the
         * affinity information for each rank */
        num_obj = hwloc_get_nbobjs_inside_cpuset_by_type(MPIR_Process.hwloc_topology,
                                                         hwloc_cpuset, HWLOC_OBJ_PU);
        /* Go over all objects, and if it is bound to more than one PU at that level, set it
         * to -1, otherwise update to the binding*/
        for (curr_obj = 0; curr_obj < num_obj; ++curr_obj) {
            hwloc_obj_t obj = hwloc_get_obj_inside_cpuset_by_type(MPIR_Process.hwloc_topology,
                                                                  hwloc_cpuset, HWLOC_OBJ_PU,
                                                                  curr_obj);
            /* walk from this PU up to the topology root, one level per parent */
            level = 0;
            do {
                /* If binding was not set, or is same as previous binding, update.
                 * Note that we use logical indices from hwloc instead of physical indices
                 * because logical indices are more portable - see hwloc documentation*/
                if (bind_map[i][level] == 0 || bind_map[i][level] == obj->logical_index) {
                    bind_map[i][level] = obj->logical_index;
                } else {
                    /* If rank is bound to different PUs at that level, we set to -1 */
                    bind_map[i][level] = -1;
                }
                level++;
            } while ((obj = obj->parent));
            /* NOTE(review): assumes bind_map[i] has at least as many entries as
             * there are ancestors on the PU-to-root path - TODO confirm */
        }
    }
    hwloc_bitmap_free(hwloc_cpuset);
}
/*! Query the NUMA layout of the machine.
 *  \param[out] allowedNodes number of NUMA nodes with at least one HW thread below
 *  \param[out] numSockets   number of NUMA nodes reported by hwloc
 *  \param[out] hwThreads    total number of hardware threads (PUs)
 *  Without hwloc support, numSockets and allowedNodes are zeroed and
 *  hwThreads is left untouched (matching the original behaviour).
 */
void Hwloc::getNumSockets(unsigned int &allowedNodes, int &numSockets, unsigned int &hwThreads)
{
#ifdef HWLOC
   numSockets = 0;
   allowedNodes = 0;
   hwThreads = 0;

   const int depth = hwloc_get_type_depth( _hwlocTopology, HWLOC_OBJ_NODE );

   // No NUMA node level in this machine: report a single node/socket.
   if ( depth == HWLOC_TYPE_DEPTH_UNKNOWN ) {
      allowedNodes = 1;
      numSockets = 1;
      return;
   }

   const unsigned numaCount = hwloc_get_nbobjs_by_depth( _hwlocTopology, depth );

   // For each NUMA node, count the hardware threads beneath it.
   for ( unsigned idx = 0; idx < numaCount; ++idx ) {
      hwloc_obj_t numaObj = hwloc_get_obj_by_depth( _hwlocTopology, depth, idx );
      const int threadsBelow = hwloc_get_nbobjs_inside_cpuset_by_type(
            _hwlocTopology, numaObj->cpuset, HWLOC_OBJ_PU );

      hwThreads += threadsBelow;

      // A node is viewable only if it actually has HW threads beneath it.
      if ( threadsBelow > 0 )
         ++allowedNodes;
   }

   numSockets = numaCount;
#else
   numSockets = 0;
   allowedNodes = 0;
#endif
}
void qrm_hwloc_info(int *ncores, int *nnodes, int *cnode) { int depth, ret; unsigned i, n, j; int topodepth, numa; hwloc_topology_t topology; hwloc_cpuset_t cpuset; hwloc_obj_t obj, cobj; hwloc_obj_type_t otype; hwloc_topology_init(&topology); hwloc_topology_load(topology); /* get the number os cores and NUMA nodes */ *ncores = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); /* printf("ncores: %d\n",*ncores); */ *nnodes = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE); if(*nnodes == 0){ otype = HWLOC_OBJ_SOCKET; /* printf("grouping with sockets\n"); */ *nnodes = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_SOCKET); } else { otype = HWLOC_OBJ_NODE; /* printf("grouping with NUMA nodes\n"); */ } /* get the handle for the first NUMA node */ obj = hwloc_get_obj_by_type(topology, otype, 0); /* get the number of cores in one NUMA node (supposedly the same for all nodes) */ *cnode = hwloc_get_nbobjs_inside_cpuset_by_type(topology, obj->cpuset, HWLOC_OBJ_CORE); hwloc_topology_destroy(topology); return; }
/* * Prettyprint a list of all available sockets and cores. Note that * this is *everything* -- not just the ones that are available to * this process. */ static int get_rsrc_exists(char str[OMPI_AFFINITY_STRING_MAX]) { bool first = true; int i, num_cores, num_pus; char tmp[BUFSIZ]; const int stmp = sizeof(tmp) - 1; hwloc_obj_t socket, core, c2; str[0] = '\0'; for (socket = hwloc_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_SOCKET, 0); NULL != socket; socket = socket->next_cousin) { /* If this isn't the first socket, add a delimiter */ if (!first) { strncat(str, "; ", OMPI_AFFINITY_STRING_MAX - strlen(str)); } first = false; snprintf(tmp, stmp, "socket %d has ", socket->os_index); strncat(str, tmp, OMPI_AFFINITY_STRING_MAX - strlen(str)); /* Find out how many cores are inside this socket, and get an object pointing to the first core. Also count how many PUs are in the first core. */ num_cores = hwloc_get_nbobjs_inside_cpuset_by_type(opal_hwloc_topology, socket->cpuset, HWLOC_OBJ_CORE); core = hwloc_get_obj_inside_cpuset_by_type(opal_hwloc_topology, socket->cpuset, HWLOC_OBJ_CORE, 0); if (NULL != core) { num_pus = hwloc_get_nbobjs_inside_cpuset_by_type(opal_hwloc_topology, core->cpuset, HWLOC_OBJ_PU); /* Only 1 core */ if (1 == num_cores) { strncat(str, "1 core with ", OMPI_AFFINITY_STRING_MAX - strlen(str)); if (1 == num_pus) { strncat(str, "1 hwt", OMPI_AFFINITY_STRING_MAX - strlen(str)); } else { snprintf(tmp, stmp, "%d hwts", num_pus); strncat(str, tmp, OMPI_AFFINITY_STRING_MAX - strlen(str)); } } /* Multiple cores */ else { bool same = true; snprintf(tmp, stmp, "%d cores", num_cores); strncat(str, tmp, OMPI_AFFINITY_STRING_MAX - strlen(str)); /* Do all the cores have the same number of PUs? 
*/ for (c2 = core; NULL != c2; c2 = c2->next_cousin) { if (hwloc_get_nbobjs_inside_cpuset_by_type(opal_hwloc_topology, core->cpuset, HWLOC_OBJ_PU) != num_pus) { same = false; break; } } /* Yes, they all have the same number of PUs */ if (same) { snprintf(tmp, stmp, ", each with %d hwt", num_pus); strncat(str, tmp, OMPI_AFFINITY_STRING_MAX - strlen(str)); if (num_pus != 1) { strncat(str, "s", OMPI_AFFINITY_STRING_MAX - strlen(str)); } } /* No, they have differing numbers of PUs */ else { bool first = true; strncat(str, "with (", OMPI_AFFINITY_STRING_MAX - strlen(str)); for (c2 = core; NULL != c2; c2 = c2->next_cousin) { if (!first) { strncat(str, ", ", OMPI_AFFINITY_STRING_MAX - strlen(str)); } first = false; i = hwloc_get_nbobjs_inside_cpuset_by_type(opal_hwloc_topology, core->cpuset, HWLOC_OBJ_PU); snprintf(tmp, stmp, "%d", i); strncat(str, tmp, OMPI_AFFINITY_STRING_MAX - strlen(str)); } strncat(str, ") hwts", OMPI_AFFINITY_STRING_MAX - strlen(str)); } } } } return OMPI_SUCCESS; }
// Constructor: interrogate hwloc for the (NUMA root x core x PU) topology
// available to this process and cache it in the s_* file-scope state:
//   s_core_topology = (accessible roots, cores per root)
//   s_core_capacity = PUs (hyperthreads) per core
//   s_core[]        = cpuset of each accessible core, roots rotated so the
//                     root hosting the calling thread is logical rank 0.
hwloc::hwloc()
{
  // Reset cached state before querying.
  s_core_topology = std::pair<unsigned,unsigned>(0,0);
  s_core_capacity = 0 ;
  s_hwloc_topology = 0 ;
  s_hwloc_location = 0 ;
  s_process_binding = 0 ;
  for ( unsigned i = 0 ; i < MAX_CORE ; ++i ) s_core[i] = 0 ;

  hwloc_topology_init( & s_hwloc_topology );
  hwloc_topology_load( s_hwloc_topology );

  s_hwloc_location = hwloc_bitmap_alloc();
  s_process_binding = hwloc_bitmap_alloc();

  // CPUs this process is allowed to run on (e.g. as bound by MPI).
  hwloc_get_cpubind( s_hwloc_topology , s_process_binding , HWLOC_CPUBIND_PROCESS );

  // Choose a hwloc object type for the NUMA level, which may not exist.
  hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX ;

  {
    // Object types to search, in order.
    static const hwloc_obj_type_t candidate_root_type[] =
      { HWLOC_OBJ_NODE     /* NUMA region */
      , HWLOC_OBJ_SOCKET   /* hardware socket */
      , HWLOC_OBJ_MACHINE  /* local machine */
      };

    enum { CANDIDATE_ROOT_TYPE_COUNT =
             sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t) };

    // The first candidate type with at least one object becomes the root type.
    for ( int k = 0 ; k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type ; ++k ) {
      if ( 0 < hwloc_get_nbobjs_by_type( s_hwloc_topology , candidate_root_type[k] ) ) {
        root_type = candidate_root_type[k] ;
      }
    }
  }

  // Determine which of these 'root' types are available to this process.
  // The process may have been bound (e.g., by MPI) to a subset of these root types.
  // Determine current location of the master (calling) process.
  hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc();

  hwloc_get_last_cpu_location( s_hwloc_topology , proc_cpuset_location , HWLOC_CPUBIND_THREAD );

  const unsigned max_root = hwloc_get_nbobjs_by_type( s_hwloc_topology , root_type );

  unsigned root_base     = max_root ;  // sentinel: replaced by the master's root index
  unsigned root_count    = 0 ;
  unsigned core_per_root = 0 ;
  unsigned pu_per_core   = 0 ;
  bool     symmetric     = true ;

  // Pass 1: count accessible roots/cores/PUs and detect asymmetry.
  for ( unsigned i = 0 ; i < max_root ; ++i ) {

    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , i );

    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {

      ++root_count ;

      // Remember which root (NUMA) object the master thread is running on.
      // This will be logical NUMA rank #0 for this process.
      if ( hwloc_bitmap_intersects( proc_cpuset_location, root->allowed_cpuset ) ) {
        root_base = i ;
      }

      // Count available cores:
      const unsigned max_core =
        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
                                                root->allowed_cpuset ,
                                                HWLOC_OBJ_CORE );

      unsigned core_count = 0 ;

      for ( unsigned j = 0 ; j < max_core ; ++j ) {

        const hwloc_obj_t core =
          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
                                               root->allowed_cpuset ,
                                               HWLOC_OBJ_CORE , j );

        // If process' cpuset intersects core's cpuset then process can access this core.
        // Must use intersection instead of inclusion because the Intel-Phi
        // MPI may bind the process to only one of the core's hyperthreads.
        //
        // Assumption: if the process can access any hyperthread of the core
        // then it has ownership of the entire core.
        // This assumes that it would be performance-detrimental
        // to spawn more than one MPI process per core and use nested threading.
        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {

          ++core_count ;

          const unsigned pu_count =
            hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
                                                    core->allowed_cpuset ,
                                                    HWLOC_OBJ_PU );

          if ( pu_per_core == 0 ) pu_per_core = pu_count ;

          // Enforce symmetry by taking the minimum:
          pu_per_core = std::min( pu_per_core , pu_count );

          if ( pu_count != pu_per_core ) symmetric = false ;
        }
      }

      if ( 0 == core_per_root ) core_per_root = core_count ;

      // Enforce symmetry by taking the minimum:
      core_per_root = std::min( core_per_root , core_count );

      if ( core_count != core_per_root ) symmetric = false ;
    }
  }

  // Publish the topology observed in pass 1.
  s_core_topology.first  = root_count ;
  s_core_topology.second = core_per_root ;
  s_core_capacity        = pu_per_core ;

  // Pass 2: fill the 's_core' array for fast mapping from a core coordinate to the
  // hwloc cpuset object required for thread location querying and binding.
  // Roots are rotated by root_base so the master thread's root is rank 0.
  for ( unsigned i = 0 ; i < max_root ; ++i ) {

    const unsigned root_rank = ( i + root_base ) % max_root ;

    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , root_rank );

    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {

      const unsigned max_core =
        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
                                                root->allowed_cpuset ,
                                                HWLOC_OBJ_CORE );

      unsigned core_count = 0 ;

      // Stop once core_per_root cores are recorded (symmetric subset).
      for ( unsigned j = 0 ; j < max_core && core_count < core_per_root ; ++j ) {

        const hwloc_obj_t core =
          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
                                               root->allowed_cpuset ,
                                               HWLOC_OBJ_CORE , j );

        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
          // NOTE(review): the slot index uses the loop counter 'i', not a
          // running count of accessible roots - assumes accessible roots are
          // contiguous in the rotated order; confirm for sparse bindings.
          s_core[ core_count + core_per_root * i ] = core->allowed_cpuset ;
          ++core_count ;
        }
      }
    }
  }

  hwloc_bitmap_free( proc_cpuset_location );

  if ( ! symmetric ) {
    std::cout << "KokkosArray::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
              << std::endl ;
  }
}
HYD_status HYDT_topo_hwloc_init(HYDT_topo_support_level_t * support_level) { int node, sock, core, thread, idx; hwloc_obj_t obj_sys; hwloc_obj_t obj_node; hwloc_obj_t obj_sock; hwloc_obj_t obj_core; hwloc_obj_t obj_thread; struct HYDT_topo_obj *node_ptr, *sock_ptr, *core_ptr, *thread_ptr; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); hwloc_topology_init(&topology); hwloc_topology_load(topology); hwloc_initialized = 1; /* Get the max number of processing elements */ HYDT_topo_info.total_proc_units = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); /* We have qualified for basic topology support level */ *support_level = HYDT_TOPO_SUPPORT_BASIC; /* Setup the machine level */ obj_sys = hwloc_get_root_obj(topology); /* Retained for debugging purposes */ /* print_obj_info(obj_sys); */ /* init Hydra structure */ HYDT_topo_info.machine.type = HYDT_TOPO_OBJ_MACHINE; HYDT_topo_cpuset_zero(&HYDT_topo_info.machine.cpuset); HYDT_topo_info.machine.parent = NULL; HYDT_topo_info.machine.num_children = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE); /* If there is no real node, consider there is one */ if (!HYDT_topo_info.machine.num_children) HYDT_topo_info.machine.num_children = 1; status = HYDT_topo_alloc_objs(HYDT_topo_info.machine.num_children, &HYDT_topo_info.machine.children); HYDU_ERR_POP(status, "error allocating topo objects\n"); /* Setup the nodes levels */ for (node = 0; node < HYDT_topo_info.machine.num_children; node++) { node_ptr = &HYDT_topo_info.machine.children[node]; node_ptr->type = HYDT_TOPO_OBJ_NODE; node_ptr->parent = &HYDT_topo_info.machine; HYDT_topo_cpuset_zero(&node_ptr->cpuset); if (!(obj_node = hwloc_get_obj_inside_cpuset_by_type(topology, obj_sys->cpuset, HWLOC_OBJ_NODE, node))) obj_node = obj_sys; /* copy the hwloc cpuset to hydra format */ hwloc_to_hydra_cpuset_dup(obj_node->cpuset, &node_ptr->cpuset); /* memory information */ node_ptr->mem.local_mem_size = obj_node->memory.local_memory; /* find the number of cache objects which match my 
cpuset */ node_ptr->mem.num_caches = get_cache_nbobjs(obj_sys, obj_node->cpuset); /* add the actual cache objects that match my cpuset */ if (node_ptr->mem.num_caches) { HYDU_MALLOC(node_ptr->mem.cache_size, size_t *, node_ptr->mem.num_caches * sizeof(size_t), status); HYDU_MALLOC(node_ptr->mem.cache_depth, int *, node_ptr->mem.num_caches * sizeof(int), status); idx = 0; load_cache_objs(obj_sys, obj_node->cpuset, node_ptr, &idx); } node_ptr->num_children = hwloc_get_nbobjs_inside_cpuset_by_type(topology, obj_node->cpuset, HWLOC_OBJ_SOCKET); /* In case there is no socket */ if (!node_ptr->num_children) node_ptr->num_children = 1; status = HYDT_topo_alloc_objs(node_ptr->num_children, &node_ptr->children); HYDU_ERR_POP(status, "error allocating topo objects\n"); /* Setup the socket level */ for (sock = 0; sock < node_ptr->num_children; sock++) { sock_ptr = &node_ptr->children[sock]; sock_ptr->type = HYDT_TOPO_OBJ_SOCKET; sock_ptr->parent = node_ptr; HYDT_topo_cpuset_zero(&sock_ptr->cpuset); if (!(obj_sock = hwloc_get_obj_inside_cpuset_by_type(topology, obj_node->cpuset, HWLOC_OBJ_SOCKET, sock))) obj_sock = obj_node; /* copy the hwloc cpuset to hydra format */ hwloc_to_hydra_cpuset_dup(obj_sock->cpuset, &sock_ptr->cpuset); /* memory information */ sock_ptr->mem.local_mem_size = obj_sock->memory.local_memory; /* find the number of cache objects which match my cpuset */ sock_ptr->mem.num_caches = get_cache_nbobjs(obj_sys, obj_sock->cpuset); /* add the actual cache objects that match my cpuset */ if (sock_ptr->mem.num_caches) { HYDU_MALLOC(sock_ptr->mem.cache_size, size_t *, sock_ptr->mem.num_caches * sizeof(size_t), status); HYDU_MALLOC(sock_ptr->mem.cache_depth, int *, sock_ptr->mem.num_caches * sizeof(int), status); idx = 0; load_cache_objs(obj_sys, obj_sock->cpuset, sock_ptr, &idx); } sock_ptr->num_children = hwloc_get_nbobjs_inside_cpuset_by_type(topology, obj_sock->cpuset, HWLOC_OBJ_CORE); /* In case there is no core */ if (!sock_ptr->num_children) 
sock_ptr->num_children = 1; status = HYDT_topo_alloc_objs(sock_ptr->num_children, &sock_ptr->children); HYDU_ERR_POP(status, "error allocating topo objects\n"); /* setup the core level */ for (core = 0; core < sock_ptr->num_children; core++) { core_ptr = &sock_ptr->children[core]; core_ptr->type = HYDT_TOPO_OBJ_CORE; core_ptr->parent = sock_ptr; HYDT_topo_cpuset_zero(&core_ptr->cpuset); if (!(obj_core = hwloc_get_obj_inside_cpuset_by_type(topology, obj_sock->cpuset, HWLOC_OBJ_CORE, core))) obj_core = obj_sock; /* copy the hwloc cpuset to hydra format */ hwloc_to_hydra_cpuset_dup(obj_core->cpuset, &core_ptr->cpuset); /* memory information */ core_ptr->mem.local_mem_size = obj_core->memory.local_memory; /* find the number of cache objects which match my cpuset */ core_ptr->mem.num_caches = get_cache_nbobjs(obj_sys, obj_core->cpuset); /* add the actual cache objects that match my cpuset */ if (core_ptr->mem.num_caches) { HYDU_MALLOC(core_ptr->mem.cache_size, size_t *, core_ptr->mem.num_caches * sizeof(size_t), status); HYDU_MALLOC(core_ptr->mem.cache_depth, int *, core_ptr->mem.num_caches * sizeof(int), status); idx = 0; load_cache_objs(obj_sys, obj_core->cpuset, core_ptr, &idx); } core_ptr->num_children = hwloc_get_nbobjs_inside_cpuset_by_type(topology, obj_core->cpuset, HWLOC_OBJ_PU); /* In case there is no thread */ if (!core_ptr->num_children) core_ptr->num_children = 1; status = HYDT_topo_alloc_objs(core_ptr->num_children, &core_ptr->children); HYDU_ERR_POP(status, "error allocating topo objects\n"); /* setup the thread level */ for (thread = 0; thread < core_ptr->num_children; thread++) { thread_ptr = &core_ptr->children[thread]; thread_ptr->type = HYDT_TOPO_OBJ_THREAD; thread_ptr->parent = core_ptr; thread_ptr->num_children = 0; thread_ptr->children = NULL; HYDT_topo_cpuset_zero(&thread_ptr->cpuset); if (!(obj_thread = hwloc_get_obj_inside_cpuset_by_type(topology, obj_core->cpuset, HWLOC_OBJ_PU, thread))) HYDU_ERR_POP(status, "unable to detect processing 
units\n"); /* copy the hwloc cpuset to hydra format */ hwloc_to_hydra_cpuset_dup(obj_thread->cpuset, &thread_ptr->cpuset); /* memory information */ thread_ptr->mem.local_mem_size = obj_thread->memory.local_memory; /* find the number of cache objects which match my cpuset */ thread_ptr->mem.num_caches = get_cache_nbobjs(obj_sys, obj_thread->cpuset); /* add the actual cache objects that match my cpuset */ if (thread_ptr->mem.num_caches) { HYDU_MALLOC(thread_ptr->mem.cache_size, size_t *, thread_ptr->mem.num_caches * sizeof(size_t), status); HYDU_MALLOC(thread_ptr->mem.cache_depth, int *, thread_ptr->mem.num_caches * sizeof(int), status); idx = 0; load_cache_objs(obj_sys, obj_thread->cpuset, thread_ptr, &idx); } }
// Compute an automatic CPU affinity hint for this pipe's thread: locate the
// socket the pipe's GPU (identified by port/device) is attached to and return
// its logical index offset by lunchbox::Thread::SOCKET. Returns
// lunchbox::Thread::NONE whenever placement cannot be determined (no hwloc GL
// support compiled in, topology setup/load failure, GPU not found, or the GPU
// not attached to exactly one socket).
int32_t Pipe::_getAutoAffinity() const
{
#ifdef EQ_USE_HWLOC_GL
    uint32_t port = getPort();
    uint32_t device = getDevice();

    // With neither port nor device specified there is nothing to look up.
    if( port == LB_UNDEFINED_UINT32 && device == LB_UNDEFINED_UINT32 )
        return lunchbox::Thread::NONE;

    // Default the unspecified half of the (port, device) pair to 0.
    if( port == LB_UNDEFINED_UINT32 )
        port = 0;

    if( device == LB_UNDEFINED_UINT32 )
        device = 0;

    hwloc_topology_t topology;
    hwloc_topology_init( &topology );

    // Flags used for loading the I/O devices, bridges and their relevant info
    const unsigned long loading_flags = HWLOC_TOPOLOGY_FLAG_IO_BRIDGES |
                                        HWLOC_TOPOLOGY_FLAG_IO_DEVICES;
    // Set discovery flags
    if( hwloc_topology_set_flags( topology, loading_flags ) < 0 )
    {
        LBINFO << "Automatic pipe thread placement failed: "
               << "hwloc_topology_set_flags() failed" << std::endl;
        hwloc_topology_destroy( topology );
        return lunchbox::Thread::NONE;
    }

    if( hwloc_topology_load( topology ) < 0 )
    {
        LBINFO << "Automatic pipe thread placement failed: "
               << "hwloc_topology_load() failed" << std::endl;
        hwloc_topology_destroy( topology );
        return lunchbox::Thread::NONE;
    }

    // OS device object of the GPU display at (port, device).
    const hwloc_obj_t osdev =
        hwloc_gl_get_display_osdev_by_port_device( topology,
                                                   int( port ), int( device ));
    if( !osdev )
    {
        LBINFO << "Automatic pipe thread placement failed: GPU not found"
               << std::endl;
        hwloc_topology_destroy( topology );
        return lunchbox::Thread::NONE;
    }

    // Walk up from the GPU's PCI device to its first non-I/O ancestor and
    // require that ancestor's cpuset to cover exactly one socket.
    const hwloc_obj_t pcidev = osdev->parent;
    const hwloc_obj_t parent = hwloc_get_non_io_ancestor_obj( topology, pcidev );
    const int numCpus =
        hwloc_get_nbobjs_inside_cpuset_by_type( topology, parent->cpuset,
                                                HWLOC_OBJ_SOCKET );
    if( numCpus != 1 )
    {
        LBINFO << "Automatic pipe thread placement failed: GPU attached to "
               << numCpus << " processors?" << std::endl;
        hwloc_topology_destroy( topology );
        return lunchbox::Thread::NONE;
    }

    // The single socket covering the GPU's location.
    const hwloc_obj_t cpuObj =
        hwloc_get_obj_inside_cpuset_by_type( topology, parent->cpuset,
                                             HWLOC_OBJ_SOCKET, 0 );
    if( cpuObj == 0 )
    {
        LBINFO << "Automatic pipe thread placement failed: "
               << "hwloc_get_obj_inside_cpuset_by_type() failed" << std::endl;
        hwloc_topology_destroy( topology );
        return lunchbox::Thread::NONE;
    }

    const int cpuIndex = cpuObj->logical_index;
    hwloc_topology_destroy( topology );
    return cpuIndex + lunchbox::Thread::SOCKET;
#else
    LBINFO << "Automatic thread placement not supported, no hwloc GL support"
           << std::endl;
#endif
    return lunchbox::Thread::NONE;
}
/* Sanity-check the hwloc "inside cpuset" helpers against a synthetic
 * topology of 2 nodes x 3 sockets x 4 caches x 5 cores x 6 PUs
 * (depths: 0=machine, 1=node, 2=socket, 3=cache, 4=core, 5=PU). */
int main (void)
{
  hwloc_topology_t topology;
  hwloc_obj_t obj, root;
  int err;

  err = hwloc_topology_init (&topology);
  if (err)
    return EXIT_FAILURE;
  hwloc_topology_set_synthetic (topology, "nodes:2 sockets:3 caches:4 cores:5 6");
  err = hwloc_topology_load (topology);
  if (err)
    return EXIT_FAILURE;

  /* there is no second system object */
  root = hwloc_get_root_obj (topology);
  obj = hwloc_get_obj_inside_cpuset_by_type(topology, root->cpuset, HWLOC_OBJ_SYSTEM, 1);
  assert(!obj);

  /* first system object is the top-level object of the topology */
  obj = hwloc_get_obj_inside_cpuset_by_type(topology, root->cpuset, HWLOC_OBJ_MACHINE, 0);
  assert(obj == hwloc_get_root_obj(topology));

  /* first next-object object is the top-level object of the topology */
  obj = hwloc_get_next_obj_inside_cpuset_by_type(topology, root->cpuset, HWLOC_OBJ_MACHINE, NULL);
  assert(obj == hwloc_get_root_obj(topology));

  /* there is no next object after the system object */
  obj = hwloc_get_next_obj_inside_cpuset_by_type(topology, root->cpuset, HWLOC_OBJ_SYSTEM, obj);
  assert(!obj);

  /* check last PU (2*3*4*5*6 PUs in total) */
  obj = hwloc_get_obj_inside_cpuset_by_type(topology, root->cpuset, HWLOC_OBJ_PU, 2*3*4*5*6-1);
  assert(obj == hwloc_get_obj_by_depth(topology, 5, 2*3*4*5*6-1));
  /* there is no next PU after the last one */
  obj = hwloc_get_next_obj_inside_cpuset_by_type(topology, root->cpuset, HWLOC_OBJ_PU, obj);
  assert(!obj);

  /* check there are 20 cores inside first socket (4 caches * 5 cores) */
  root = hwloc_get_obj_by_depth(topology, 2, 0);
  assert(hwloc_get_nbobjs_inside_cpuset_by_type(topology, root->cpuset, HWLOC_OBJ_CORE) == 20);

  /* check there are 12 caches inside last node (3 sockets * 4 caches) */
  root = hwloc_get_obj_by_depth(topology, 1, 1);
  assert(hwloc_get_nbobjs_inside_cpuset_by_type(topology, root->cpuset, HWLOC_OBJ_CACHE) == 12);

  /* check first PU of second socket (each socket spans 4*5*6 PUs) */
  root = hwloc_get_obj_by_depth(topology, 2, 1);
  obj = hwloc_get_obj_inside_cpuset_by_type(topology, root->cpuset, HWLOC_OBJ_PU, 0);
  assert(obj == hwloc_get_obj_by_depth(topology, 5, 4*5*6));

  /* check third core of third socket (each socket spans 4*5 cores) */
  root = hwloc_get_obj_by_depth(topology, 2, 2);
  obj = hwloc_get_obj_inside_cpuset_by_type(topology, root->cpuset, HWLOC_OBJ_CORE, 2);
  assert(obj == hwloc_get_obj_by_depth(topology, 4, 2*4*5+2));

  /* check first socket of second node (3 sockets per node) */
  root = hwloc_get_obj_by_depth(topology, 1, 1);
  obj = hwloc_get_obj_inside_cpuset_by_type(topology, root->cpuset, HWLOC_OBJ_SOCKET, 0);
  assert(obj == hwloc_get_obj_by_depth(topology, 2, 3));

  /* there is no node inside sockets */
  root = hwloc_get_obj_by_depth(topology, 2, 0);
  obj = hwloc_get_obj_inside_cpuset_by_type(topology, root->cpuset, HWLOC_OBJ_NODE, 0);
  assert(!obj);

  hwloc_topology_destroy (topology);
  return EXIT_SUCCESS;
}