/* recursively climb the topology, pruning procs beyond that allowed
 * by the given ppr
 */
static void prune(orte_jobid_t jobid,
                  orte_app_idx_t app_idx,
                  orte_node_t *node,
                  opal_hwloc_level_t *level,
                  orte_vpid_t *nmapped)
{
    hwloc_obj_t obj, top;
    unsigned int i, nobjs;
    hwloc_obj_type_t lvl;
    unsigned cache_level = 0, k;
    int nprocs;
    hwloc_cpuset_t avail, cpus, childcpus;
    int n, limit, nmax, nunder, idx, idxmax = 0;
    orte_proc_t *proc, *pptr, *procmax;
    opal_hwloc_level_t ll;
    char dang[64];
    hwloc_obj_t locale;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: pruning level %d", *level);

    /* convenience */
    ll = *level;

    /* convenience */
    lvl = opal_hwloc_levels[ll];
    limit = ppr[ll];

    if (0 == limit) {
        /* no limit at this level, so move up if necessary */
        if (0 == ll) {
            /* done */
            return;
        }
        --(*level);
        prune(jobid, app_idx, node, level, nmapped);
        return;
    }

    /* handle the darn cache thing again */
    if (OPAL_HWLOC_L3CACHE_LEVEL == ll) {
        cache_level = 3;
    } else if (OPAL_HWLOC_L2CACHE_LEVEL == ll) {
        cache_level = 2;
    } else if (OPAL_HWLOC_L1CACHE_LEVEL == ll) {
        cache_level = 1;
    }

    /* get the number of resources at this level on this node */
    nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                               lvl, cache_level,
                                               OPAL_HWLOC_AVAILABLE);

    /* for each resource, compute the number of procs sitting
     * underneath it and check against the limit
     */
    for (i=0; i < nobjs; i++) {
        obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                              lvl, cache_level,
                                              i, OPAL_HWLOC_AVAILABLE);
        /* get the available cpuset */
        avail = opal_hwloc_base_get_available_cpus(node->topology, obj);

        /* look at the intersection of this object's cpuset and that
         * of each proc in the job/app - if they intersect, then count this proc
         * against the limit
         */
        nprocs = 0;
        for (n=0; n < node->procs->size; n++) {
            if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                continue;
            }
            if (proc->name.jobid != jobid ||
                proc->app_idx != app_idx) {
                continue;
            }
            locale = NULL;
            if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                return;
            }
            cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
            if (hwloc_bitmap_intersects(avail, cpus)) {
                nprocs++;
            }
        }
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: found %d procs limit %d",
                            nprocs, limit);

        /* check against the limit */
        while (limit < nprocs) {
            /* need to remove procs - do this in a semi-intelligent
             * manner to provide a little load balancing by cycling
             * across the objects beneath this one, removing procs
             * in a round-robin fashion until the limit is satisfied
             *
             * NOTE: I'm sure someone more knowledgeable with hwloc
             * will come up with a more efficient way to do this, so
             * consider this is a starting point
             */

            /* find the first level that has more than
             * one child beneath it - if all levels
             * have only one child, then return this
             * object
             */
            top = find_split(node->topology, obj);
            hwloc_obj_type_snprintf(dang, 64, top, 1);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: SPLIT AT LEVEL %s", dang);

            /* cycle across the children of this object */
            nmax = 0;
            procmax = NULL;
            idx = 0;
            /* find the child with the most procs underneath it */
            for (k=0; k < top->arity && limit < nprocs; k++) {
                /* get this object's available cpuset */
                childcpus = opal_hwloc_base_get_available_cpus(node->topology, top->children[k]);
                nunder = 0;
                pptr = NULL;
                for (n=0; n < node->procs->size; n++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, n))) {
                        continue;
                    }
                    if (proc->name.jobid != jobid ||
                        proc->app_idx != app_idx) {
                        continue;
                    }
                    locale = NULL;
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                        return;
                    }
                    cpus = opal_hwloc_base_get_available_cpus(node->topology, locale);
                    if (hwloc_bitmap_intersects(childcpus, cpus)) {
                        nunder++;
                        if (NULL == pptr) {
                            /* save the location of the first proc under this object */
                            pptr = proc;
                            idx = n;
                        }
                    }
                }
                if (nmax < nunder) {
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:ppr: PROCS UNDER CHILD %d %d MAX %d",
                                        k, nunder, nmax);
                    nmax = nunder;
                    procmax = pptr;
                    idxmax = idx;
                }
            }
            if (NULL == procmax) {
                /* can't find anything to remove - error out */
                goto error;
            }
            /* remove it */
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:ppr: removing proc at posn %d",
                                idxmax);
            opal_pointer_array_set_item(node->procs, idxmax, NULL);
            node->num_procs--;
            node->slots_inuse--;
            if (node->slots_inuse < 0) {
                node->slots_inuse = 0;
            }
            nprocs--;
            *nmapped -= 1;
            OBJ_RELEASE(procmax);
        }
    }

    /* finished with this level - move up if necessary */
    if (0 == ll) {
        return;
    }
    --(*level);
    prune(jobid, app_idx, node, level, nmapped);
    return;

 error:
    opal_output(0, "INFINITE LOOP");
}
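/* find_split() is called by prune() above but is not part of this excerpt.
 * Below is a minimal sketch consistent with the comment in prune(): descend
 * from the given object and return the first object that fans out to more
 * than one child; on a pure single-child chain nothing qualifies and NULL
 * falls out, which trips the procmax error path in the caller. Treat this
 * as an assumed reconstruction, not necessarily the shipped implementation. */
static hwloc_obj_t find_split(hwloc_topology_t topo, hwloc_obj_t obj)
{
    unsigned k;
    hwloc_obj_t nxt;

    if (1 < obj->arity) {
        /* this object has multiple children - split here */
        return obj;
    }
    /* single-child chain: keep descending */
    for (k=0; k < obj->arity; k++) {
        if (NULL != (nxt = find_split(topo, obj->children[k]))) {
            return nxt;
        }
    }
    return NULL;
}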
hwloc::hwloc()
{
  s_core_topology   = std::pair<unsigned,unsigned>(0,0);
  s_core_capacity   = 0 ;
  s_hwloc_topology  = 0 ;
  s_hwloc_location  = 0 ;
  s_process_binding = 0 ;

  for ( unsigned i = 0 ; i < MAX_CORE ; ++i ) s_core[i] = 0 ;

  hwloc_topology_init( & s_hwloc_topology );
  hwloc_topology_load( s_hwloc_topology );

  s_hwloc_location  = hwloc_bitmap_alloc();
  s_process_binding = hwloc_bitmap_alloc();

  hwloc_get_cpubind( s_hwloc_topology , s_process_binding , HWLOC_CPUBIND_PROCESS );

  // Choose a hwloc object type for the NUMA level, which may not exist.

  hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX ;

  {
    // Object types to search, in order.
    static const hwloc_obj_type_t candidate_root_type[] =
      { HWLOC_OBJ_NODE     /* NUMA region     */
      , HWLOC_OBJ_SOCKET   /* hardware socket */
      , HWLOC_OBJ_MACHINE  /* local machine   */
      };

    enum { CANDIDATE_ROOT_TYPE_COUNT =
             sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t) };

    for ( int k = 0 ; k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type ; ++k ) {
      if ( 0 < hwloc_get_nbobjs_by_type( s_hwloc_topology , candidate_root_type[k] ) ) {
        root_type = candidate_root_type[k] ;
      }
    }
  }

  // Determine which of these 'root' types are available to this process.
  // The process may have been bound (e.g., by MPI) to a subset of these root types.
  // Determine current location of the master (calling) process:

  hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc();

  hwloc_get_last_cpu_location( s_hwloc_topology , proc_cpuset_location , HWLOC_CPUBIND_THREAD );

  const unsigned max_root = hwloc_get_nbobjs_by_type( s_hwloc_topology , root_type );

  unsigned root_base     = max_root ;
  unsigned root_count    = 0 ;
  unsigned core_per_root = 0 ;
  unsigned pu_per_core   = 0 ;
  bool     symmetric     = true ;

  for ( unsigned i = 0 ; i < max_root ; ++i ) {

    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , i );

    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {

      ++root_count ;

      // Remember which root (NUMA) object the master thread is running on.
      // This will be logical NUMA rank #0 for this process.

      if ( hwloc_bitmap_intersects( proc_cpuset_location , root->allowed_cpuset ) ) {
        root_base = i ;
      }

      // Count available cores:

      const unsigned max_core =
        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
                                                root->allowed_cpuset ,
                                                HWLOC_OBJ_CORE );

      unsigned core_count = 0 ;

      for ( unsigned j = 0 ; j < max_core ; ++j ) {

        const hwloc_obj_t core =
          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
                                               root->allowed_cpuset ,
                                               HWLOC_OBJ_CORE , j );

        // If process' cpuset intersects core's cpuset then process can access this core.
        // Must use intersection instead of inclusion because the Intel-Phi
        // MPI may bind the process to only one of the core's hyperthreads.
        //
        // Assumption: if the process can access any hyperthread of the core
        // then it has ownership of the entire core.
        // This assumes that it would be performance-detrimental
        // to spawn more than one MPI process per core and use nested threading.

        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {

          ++core_count ;

          const unsigned pu_count =
            hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
                                                    core->allowed_cpuset ,
                                                    HWLOC_OBJ_PU );

          if ( pu_per_core == 0 ) pu_per_core = pu_count ;

          // Enforce symmetry by taking the minimum:

          pu_per_core = std::min( pu_per_core , pu_count );

          if ( pu_count != pu_per_core ) { symmetric = false ; }
        }
      }

      if ( 0 == core_per_root ) core_per_root = core_count ;

      // Enforce symmetry by taking the minimum:

      core_per_root = std::min( core_per_root , core_count );

      if ( core_count != core_per_root ) { symmetric = false ; }
    }
  }

  s_core_topology.first  = root_count ;
  s_core_topology.second = core_per_root ;
  s_core_capacity        = pu_per_core ;

  // Fill the 's_core' array for fast mapping from a core coordinate to the
  // hwloc cpuset object required for thread location querying and binding.

  for ( unsigned i = 0 ; i < max_root ; ++i ) {

    const unsigned root_rank = ( i + root_base ) % max_root ;

    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , root_rank );

    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {

      const unsigned max_core =
        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
                                                root->allowed_cpuset ,
                                                HWLOC_OBJ_CORE );

      unsigned core_count = 0 ;

      for ( unsigned j = 0 ; j < max_core && core_count < core_per_root ; ++j ) {

        const hwloc_obj_t core =
          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
                                               root->allowed_cpuset ,
                                               HWLOC_OBJ_CORE , j );

        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {

          s_core[ core_count + core_per_root * i ] = core->allowed_cpuset ;

          ++core_count ;
        }
      }
    }
  }

  hwloc_bitmap_free( proc_cpuset_location );

  if ( ! symmetric ) {
    std::cout << "KokkosArray::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
              << std::endl ;
  }
}
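/* A minimal standalone sketch of the core-ownership test used in the
 * constructor above: walk the machine's cores and claim any core whose
 * cpuset *intersects* the process binding, mirroring the documented
 * "any hyperthread implies ownership of the core" assumption. This is an
 * illustration, not part of the library; error handling is elided and the
 * generic core->cpuset field is used in place of allowed_cpuset. */
#include <hwloc.h>
#include <stdio.h>

int main(void)
{
    hwloc_topology_t topo;
    hwloc_bitmap_t binding = hwloc_bitmap_alloc();
    int ncore, j, owned = 0;

    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);
    hwloc_get_cpubind(topo, binding, HWLOC_CPUBIND_PROCESS);

    ncore = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);
    for (j = 0; j < ncore; j++) {
        hwloc_obj_t core = hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, j);
        /* intersection, not inclusion: a binding that covers a single
         * hyperthread still claims the whole core */
        if (hwloc_bitmap_intersects(binding, core->cpuset)) {
            owned++;
        }
    }
    printf("process owns %d of %d cores\n", owned, ncore);

    hwloc_bitmap_free(binding);
    hwloc_topology_destroy(topo);
    return 0;
}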
static int rank_span(orte_job_t *jdata,
                     orte_app_context_t *app,
                     opal_list_t *nodes,
                     hwloc_obj_type_t target,
                     unsigned cache_level)
{
    hwloc_obj_t obj;
    int num_objs, i, j, rc;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_vpid_t vpid;
    int cnt;
    opal_list_item_t *item;
    hwloc_obj_t locale;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:rank_span: for job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* if the ranking is spanned, then we perform the
     * ranking as if it was one big node - i.e., we
     * rank one proc on each object, step to the next object
     * moving across all the nodes, then wrap around to the
     * first object on the first node.
     *
     *        Node 0                Node 1
     *    Obj 0     Obj 1       Obj 0     Obj 1
     *     0 4       1 5         2 6       3 7
     *     8 12      9 13       10 14     11 15
     */

    /* In the interest of getting this committed in finite time,
     * just loop across the nodes and objects until all procs
     * are mapped
     */

    vpid = jdata->num_procs;
    cnt = 0;
    while (cnt < app->num_procs) {
        for (item = opal_list_get_first(nodes);
             item != opal_list_get_end(nodes);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
            /* get the number of objects - only consider those we can actually use */
            num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                          cache_level, OPAL_HWLOC_AVAILABLE);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:rank_span: found %d objects on node %s with %d procs",
                                num_objs, node->name, (int)node->num_procs);
            if (0 == num_objs) {
                return ORTE_ERR_NOT_SUPPORTED;
            }

            /* for each object */
            for (i=0; i < num_objs && cnt < app->num_procs; i++) {
                obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
                                                      cache_level, i, OPAL_HWLOC_AVAILABLE);

                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps:rank_span: working object %d", i);

                /* cycle thru the procs on this node */
                for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_span skipping proc %s - from another job, num_ranked %d",
                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
                        continue;
                    }
                    /* ignore procs that are already assigned */
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        continue;
                    }
                    /* ignore procs from other apps */
                    if (proc->app_idx != app->idx) {
                        continue;
                    }
                    /* protect against bozo case */
                    locale = NULL;
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        ORTE_ERROR_LOG(ORTE_ERROR);
                        return ORTE_ERROR;
                    }
                    /* ignore procs not on this object */
                    if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_span: proc at position %d is not on object %d",
                                            j, i);
                        continue;
                    }

                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:rank_span: assigning vpid %s",
                                        ORTE_VPID_PRINT(vpid));
                    proc->name.vpid = vpid++;
                    if (0 == cnt) {
                        app->first_rank = proc->name.vpid;
                    }
                    cnt++;

                    /* insert the proc into the jdata array - no harm if already there */
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    /* track where the highest vpid landed - this is our
                     * new bookmark
                     */
                    jdata->bookmark = node;
                    /* move to next object */
                    break;
                }
            }
        }
    }

    return ORTE_SUCCESS;
}
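/* Standalone illustration (not part of the component) of the span ordering
 * diagrammed in rank_span() above, for a hypothetical uniform machine:
 * rank r lands on global object (r % total_objs), i.e. round-robin across
 * every object of every node before wrapping. The node/object counts are
 * assumptions for the demo only. */
#include <stdio.h>

int main(void)
{
    const int nnodes = 2, objs_per_node = 2, nprocs = 16;
    int r;

    for (r = 0; r < nprocs; r++) {
        int gobj = r % (nnodes * objs_per_node);  /* global object index */
        printf("rank %2d -> node %d, obj %d\n",
               r, gobj / objs_per_node, gobj % objs_per_node);
    }
    return 0;
}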
static int rank_by(orte_job_t *jdata,
                   orte_app_context_t *app,
                   opal_list_t *nodes,
                   hwloc_obj_type_t target,
                   unsigned cache_level)
{
    hwloc_obj_t obj;
    int num_objs, i, j, rc;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_vpid_t vpid;
    int cnt;
    opal_pointer_array_t objs;
    bool all_done;
    opal_list_item_t *item;
    hwloc_obj_t locale;

    if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
        return rank_span(jdata, app, nodes, target, cache_level);
    } else if (ORTE_RANKING_FILL & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
        return rank_fill(jdata, app, nodes, target, cache_level);
    }

    /* if ranking is not spanned or filled, then we
     * default to assign ranks sequentially across
     * target objects within a node until that node
     * is fully ranked, and then move on to the next
     * node
     *
     *        Node 0                Node 1
     *    Obj 0     Obj 1       Obj 0     Obj 1
     *     0 2       1 3         8 10      9 11
     *     4 6       5 7        12 14     13 15
     */

    /* setup the pointer array */
    OBJ_CONSTRUCT(&objs, opal_pointer_array_t);
    opal_pointer_array_init(&objs, 2, INT_MAX, 2);

    vpid = jdata->num_procs;
    cnt = 0;
    for (item = opal_list_get_first(nodes);
         item != opal_list_get_end(nodes);
         item = opal_list_get_next(item)) {
        node = (orte_node_t*)item;
        /* get the number of objects - only consider those we can actually use */
        num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                      cache_level, OPAL_HWLOC_AVAILABLE);
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rank_by: found %d objects on node %s with %d procs",
                            num_objs, node->name, (int)node->num_procs);
        if (0 == num_objs) {
            return ORTE_ERR_NOT_SUPPORTED;
        }
        /* collect all the objects */
        for (i=0; i < num_objs; i++) {
            obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
                                                  cache_level, i, OPAL_HWLOC_AVAILABLE);
            opal_pointer_array_set_item(&objs, i, obj);
        }

        /* cycle across the objects, assigning a proc to each one,
         * until all procs have been assigned - unfortunately, since
         * more than this job may be mapped onto a node, the number
         * of procs on the node can't be used to tell us when we
         * are done. Instead, we have to just keep going until all
         * procs are ranked - which means we have to make one extra
         * pass thru the loop
         *
         * Perhaps someday someone will come up with a more efficient
         * algorithm, but this works for now.
         */
        all_done = false;
        while (!all_done && cnt < app->num_procs) {
            all_done = true;
            /* cycle across the objects */
            for (i=0; i < num_objs && cnt < app->num_procs; i++) {
                obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);

                /* find the next proc on this object */
                for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d",
                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
                        continue;
                    }
                    /* ignore procs that are already ranked */
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        continue;
                    }
                    /* ignore procs from other apps */
                    if (proc->app_idx != app->idx) {
                        continue;
                    }
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        continue;
                    }
                    /* ignore procs on other objects */
                    if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_by: proc at position %d is not on object %d",
                                            j, i);
                        continue;
                    }

                    proc->name.vpid = vpid++;
                    if (0 == cnt) {
                        app->first_rank = proc->name.vpid;
                    }
                    cnt++;
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:rank_by: assigned rank %s",
                                        ORTE_VPID_PRINT(proc->name.vpid));
                    /* insert the proc into the jdata array - no harm if already there */
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    /* flag that one was mapped */
                    all_done = false;
                    /* track where the highest vpid landed - this is our
                     * new bookmark
                     */
                    jdata->bookmark = node;
                    /* move to next object */
                    break;
                }
            }
        }
    }

    /* cleanup */
    OBJ_DESTRUCT(&objs);
    return ORTE_SUCCESS;
}
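/* Standalone illustration (not part of the component) of the default
 * by-object ordering diagrammed in rank_by() above: ranks round-robin
 * across the objects of one node until that node is fully ranked, then
 * move to the next node. Uniform counts are assumed for the demo. */
#include <stdio.h>

int main(void)
{
    const int objs_per_node = 2, procs_per_obj = 4;
    const int per_node = objs_per_node * procs_per_obj;
    int r;

    for (r = 0; r < 16; r++) {
        int node = r / per_node;   /* finish a node before moving on */
        int w = r % per_node;      /* rank within the node */
        printf("rank %2d -> node %d, obj %d\n", r, node, w % objs_per_node);
    }
    return 0;
}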
static int sm_btl_first_time_init(mca_btl_sm_t *sm_btl,
                                  int32_t my_smp_rank,
                                  int n)
{
    size_t length, length_payload;
    sm_fifo_t *my_fifos;
    int my_mem_node, num_mem_nodes, i, rc;
    mca_mpool_base_resources_t *res = NULL;
    mca_btl_sm_component_t* m = &mca_btl_sm_component;

    /* Assume we don't have hwloc support and fill in dummy info */
    mca_btl_sm_component.mem_node = my_mem_node = 0;
    mca_btl_sm_component.num_mem_nodes = num_mem_nodes = 1;

    /* If we have hwloc support, then get accurate information */
    if (NULL != opal_hwloc_topology) {
        i = opal_hwloc_base_get_nbobjs_by_type(opal_hwloc_topology,
                                               HWLOC_OBJ_NODE, 0,
                                               OPAL_HWLOC_AVAILABLE);

        /* If we find >0 NUMA nodes, then investigate further */
        if (i > 0) {
            int numa=0, w;
            unsigned n_bound=0;
            hwloc_cpuset_t avail;
            hwloc_obj_t obj;

            /* JMS This tells me how many numa nodes are *available*,
               but it's not how many are being used *by this job*.

               Note that this is the value we've previously used (from
               the previous carto-based implementation), but it really
               should be improved to be how many NUMA nodes are being
               used *in this job*. */
            mca_btl_sm_component.num_mem_nodes = num_mem_nodes = i;

            /* if we are not bound, then there is nothing further to do */
            if (NULL != opal_process_info.cpuset) {
                /* count the number of NUMA nodes to which we are bound */
                for (w=0; w < i; w++) {
                    if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology,
                                                                       HWLOC_OBJ_NODE, 0, w,
                                                                       OPAL_HWLOC_AVAILABLE))) {
                        continue;
                    }
                    /* get that NUMA node's available cpus */
                    avail = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj);
                    /* see if we intersect */
                    if (hwloc_bitmap_intersects(avail, opal_hwloc_my_cpuset)) {
                        n_bound++;
                        numa = w;
                    }
                }
                /* if we are located on more than one NUMA, or we didn't find
                 * a NUMA we are on, then not much we can do
                 */
                if (1 == n_bound) {
                    mca_btl_sm_component.mem_node = my_mem_node = numa;
                } else {
                    mca_btl_sm_component.mem_node = my_mem_node = -1;
                }
            }
        }
    }

    if (NULL == (res = calloc(1, sizeof(*res)))) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* lookup shared memory pool */
    mca_btl_sm_component.sm_mpools =
        (mca_mpool_base_module_t **)calloc(num_mem_nodes,
                                           sizeof(mca_mpool_base_module_t *));

    /* Disable memory binding, because each MPI process will claim pages in the
     * mpool for their local NUMA node */
    res->mem_node = -1;

    if (OPAL_SUCCESS != (rc = setup_mpool_base_resources(m, res))) {
        free(res);
        return rc;
    }

    /* now that res is fully populated, create the thing */
    mca_btl_sm_component.sm_mpools[0] =
        mca_mpool_base_module_create(mca_btl_sm_component.sm_mpool_name,
                                     sm_btl, res);

    /* Sanity check to ensure that we found it */
    if (NULL == mca_btl_sm_component.sm_mpools[0]) {
        free(res);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    mca_btl_sm_component.sm_mpool = mca_btl_sm_component.sm_mpools[0];

    mca_btl_sm_component.sm_mpool_base =
        mca_btl_sm_component.sm_mpools[0]->mpool_base(mca_btl_sm_component.sm_mpools[0]);

    /* create a list of peers */
    mca_btl_sm_component.sm_peers = (struct mca_btl_base_endpoint_t**)
        calloc(n, sizeof(struct mca_btl_base_endpoint_t*));
    if (NULL == mca_btl_sm_component.sm_peers) {
        free(res);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* remember that node rank zero is already attached */
    if (0 != my_smp_rank) {
        if (OPAL_SUCCESS != (rc = sm_segment_attach(m))) {
            free(res);
            return rc;
        }
    }

    /* it is now safe to free the mpool resources */
    free(res);

    /* check to make sure number of local procs is within the
     * specified limits */
    if (mca_btl_sm_component.sm_max_procs > 0 &&
        mca_btl_sm_component.num_smp_procs + n >
        mca_btl_sm_component.sm_max_procs) {
        return OPAL_ERROR;
    }

    mca_btl_sm_component.shm_fifo = (volatile sm_fifo_t **)mca_btl_sm_component.sm_seg->module_data_addr;
    mca_btl_sm_component.shm_bases = (char**)(mca_btl_sm_component.shm_fifo + n);
    mca_btl_sm_component.shm_mem_nodes = (uint16_t*)(mca_btl_sm_component.shm_bases + n);

    /* set the base of the shared memory segment */
    mca_btl_sm_component.shm_bases[mca_btl_sm_component.my_smp_rank] =
        (char*)mca_btl_sm_component.sm_mpool_base;
    mca_btl_sm_component.shm_mem_nodes[mca_btl_sm_component.my_smp_rank] =
        (uint16_t)my_mem_node;

    /* initialize the array of fifo's "owned" by this process */
    if (NULL == (my_fifos = (sm_fifo_t*)mpool_calloc(FIFO_MAP_NUM(n), sizeof(sm_fifo_t)))) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    mca_btl_sm_component.shm_fifo[mca_btl_sm_component.my_smp_rank] = my_fifos;

    /* cache the pointer to the 2d fifo array. These addresses
     * are valid in the current process space */
    mca_btl_sm_component.fifo = (sm_fifo_t**)malloc(sizeof(sm_fifo_t*) * n);
    if (NULL == mca_btl_sm_component.fifo) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    mca_btl_sm_component.fifo[mca_btl_sm_component.my_smp_rank] = my_fifos;

    mca_btl_sm_component.mem_nodes = (uint16_t *)malloc(sizeof(uint16_t) * n);
    if (NULL == mca_btl_sm_component.mem_nodes) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* initialize fragment descriptor free lists */
    /* allocation will be for the fragment descriptor and payload buffer */
    length = sizeof(mca_btl_sm_frag1_t);
    length_payload = sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.eager_limit;
    i = opal_free_list_init (&mca_btl_sm_component.sm_frags_eager, length,
                             opal_cache_line_size, OBJ_CLASS(mca_btl_sm_frag1_t),
                             length_payload, opal_cache_line_size,
                             mca_btl_sm_component.sm_free_list_num,
                             mca_btl_sm_component.sm_free_list_max,
                             mca_btl_sm_component.sm_free_list_inc,
                             mca_btl_sm_component.sm_mpool, 0, NULL, NULL, NULL);
    if (OPAL_SUCCESS != i) {
        return i;
    }

    length = sizeof(mca_btl_sm_frag2_t);
    length_payload = sizeof(mca_btl_sm_hdr_t) + mca_btl_sm_component.max_frag_size;
    i = opal_free_list_init (&mca_btl_sm_component.sm_frags_max, length,
                             opal_cache_line_size, OBJ_CLASS(mca_btl_sm_frag2_t),
                             length_payload, opal_cache_line_size,
                             mca_btl_sm_component.sm_free_list_num,
                             mca_btl_sm_component.sm_free_list_max,
                             mca_btl_sm_component.sm_free_list_inc,
                             mca_btl_sm_component.sm_mpool, 0, NULL, NULL, NULL);
    if (OPAL_SUCCESS != i) {
        return i;
    }

    i = opal_free_list_init (&mca_btl_sm_component.sm_frags_user,
                             sizeof(mca_btl_sm_user_t),
                             opal_cache_line_size, OBJ_CLASS(mca_btl_sm_user_t),
                             sizeof(mca_btl_sm_hdr_t), opal_cache_line_size,
                             mca_btl_sm_component.sm_free_list_num,
                             mca_btl_sm_component.sm_free_list_max,
                             mca_btl_sm_component.sm_free_list_inc,
                             mca_btl_sm_component.sm_mpool, 0, NULL, NULL, NULL);
    if (OPAL_SUCCESS != i) {
        return i;
    }

    mca_btl_sm_component.num_outstanding_frags = 0;

    mca_btl_sm_component.num_pending_sends = 0;
    i = opal_free_list_init (&mca_btl_sm_component.pending_send_fl,
                             sizeof(btl_sm_pending_send_item_t), 8,
                             OBJ_CLASS(opal_free_list_item_t),
                             0, 0, 16, -1, 32, NULL, 0, NULL, NULL, NULL);
    if (OPAL_SUCCESS != i) {
        return i;
    }

    /* set flag indicating btl has been inited */
    sm_btl->btl_inited = true;

    return OPAL_SUCCESS;
}
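/* Standalone illustration of how the control segment is carved above:
 * one contiguous region holds n fifo pointers, then n base addresses,
 * then n NUMA-node ids, laid out back to back. Plain calloc stands in
 * for the shared segment; the names mirror the component fields but
 * nothing here is the real BTL API. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const int n = 4;  /* number of local procs (assumed for the demo) */
    void *seg = calloc(1, n * (sizeof(void*) + sizeof(char*) + sizeof(uint16_t)));
    if (NULL == seg) {
        return 1;
    }

    void     **shm_fifo      = (void**)seg;                 /* n fifo pointers */
    char     **shm_bases     = (char**)(shm_fifo + n);      /* n mpool bases   */
    uint16_t  *shm_mem_nodes = (uint16_t*)(shm_bases + n);  /* n NUMA ids      */

    shm_mem_nodes[0] = 0;  /* e.g., local rank 0 reports NUMA node 0 */
    printf("fifo[] at %p, bases[] at %p, mem_nodes[] at %p\n",
           (void*)shm_fifo, (void*)shm_bases, (void*)shm_mem_nodes);

    free(seg);
    return 0;
}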
static int rank_fill(orte_job_t *jdata,
                     hwloc_obj_type_t target,
                     unsigned cache_level)
{
    orte_app_context_t *app;
    hwloc_obj_t obj;
    int num_objs, i, j, m, n, rc;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc, *pptr;
    orte_vpid_t vpid;
    int cnt;
    hwloc_obj_t locale;

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:rank_fill: for job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* if the ranking is fill, then we rank all the procs
     * within a given object before moving on to the next
     *
     *        Node 0                Node 1
     *    Obj 0     Obj 1       Obj 0     Obj 1
     *     0 1       4 5         8 9      12 13
     *     2 3       6 7        10 11     14 15
     */

    vpid = 0;
    for (n=0; n < jdata->apps->size; n++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) {
            continue;
        }

        cnt = 0;
        for (m=0; m < jdata->map->nodes->size; m++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, m))) {
                continue;
            }
            /* get the number of objects - only consider those we can actually use */
            num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology->topo, target,
                                                          cache_level, OPAL_HWLOC_AVAILABLE);
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:rank_fill: found %d objects on node %s with %d procs",
                                num_objs, node->name, (int)node->num_procs);
            if (0 == num_objs) {
                return ORTE_ERR_NOT_SUPPORTED;
            }

            /* for each object */
            for (i=0; i < num_objs && cnt < app->num_procs; i++) {
                obj = opal_hwloc_base_get_obj_by_type(node->topology->topo, target,
                                                      cache_level, i, OPAL_HWLOC_AVAILABLE);

                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps:rank_fill: working object %d", i);

                /* cycle thru the procs on this node */
                for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_fill skipping proc %s - from another job, num_ranked %d",
                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
                        continue;
                    }
                    /* ignore procs that are already assigned */
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        continue;
                    }
                    /* ignore procs from other apps */
                    if (proc->app_idx != app->idx) {
                        continue;
                    }
                    /* protect against bozo case */
                    locale = NULL;
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        ORTE_ERROR_LOG(ORTE_ERROR);
                        return ORTE_ERROR;
                    }
                    /* ignore procs not on this object */
                    if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_fill: proc at position %d is not on object %d",
                                            j, i);
                        continue;
                    }

                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:rank_fill: assigning vpid %s",
                                        ORTE_VPID_PRINT(vpid));
                    proc->name.vpid = vpid++;
                    if (0 == cnt) {
                        app->first_rank = proc->name.vpid;
                    }
                    cnt++;

                    /* insert the proc into the jdata array */
                    if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->name.vpid))) {
                        OBJ_RELEASE(pptr);
                    }
                    OBJ_RETAIN(proc);
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        return rc;
                    }
                    /* track where the highest vpid landed - this is our
                     * new bookmark
                     */
                    jdata->bookmark = node;
                }
            }
        }
    }

    return ORTE_SUCCESS;
}
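/* Standalone illustration (not part of the component) of the fill ordering
 * diagrammed in rank_fill() above: every slot of an object is ranked
 * before the next object is touched. Uniform counts are assumed for the
 * demo. */
#include <stdio.h>

int main(void)
{
    const int objs_per_node = 2, procs_per_obj = 4;
    const int per_node = objs_per_node * procs_per_obj;
    int r;

    for (r = 0; r < 16; r++) {
        int node = r / per_node;   /* finish a node before moving on */
        int w = r % per_node;      /* rank within the node */
        printf("rank %2d -> node %d, obj %d\n", r, node, w / procs_per_obj);
    }
    return 0;
}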