/******************* FUNCTION *********************/
const char* TopoHwloc::getLevelName ( int id ) const
{
	const char * res = hwloc_obj_type_string((hwloc_obj_type_t)id);
	allocAssume(res != NULL,"Failed to convert topological level id ... from hwloc."); //id
	return res;
}
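/*
 * A minimal standalone sketch (not part of the class above, illustration
 * only): hwloc_obj_type_string() returns a pointer to a static string for
 * any hwloc_obj_type_t, so the result must never be freed. Here it is used
 * to name every level of the current machine's topology.
 */
#include <stdio.h>
#include <hwloc.h>

static void print_level_names(void)
{
	hwloc_topology_t topology;
	int depth, topodepth;

	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);
	topodepth = (int) hwloc_topology_get_depth(topology);
	for (depth = 0; depth < topodepth; depth++)
		printf("depth %d: %s\n", depth,
		       hwloc_obj_type_string(hwloc_get_depth_type(topology, depth)));
	hwloc_topology_destroy(topology);
}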
void output_synthetic(hwloc_topology_t topology, const char *filename, int logical __hwloc_attribute_unused, int legend __hwloc_attribute_unused, int verbose_mode __hwloc_attribute_unused)
{
  FILE *output;
  hwloc_obj_t obj = hwloc_get_root_obj(topology);
  int arity;
  if (!obj->symmetric_subtree) {
    fprintf(stderr, "Cannot output asymmetric topology in synthetic format.\n");
    fprintf(stderr, "Adding --no-io may help make the topology symmetric.\n");
    return;
  }
  if (!filename || !strcmp(filename, "-"))
    output = stdout;
  else {
    output = open_file(filename, "w");
    if (!output) {
      fprintf(stderr, "Failed to open %s for writing (%s)\n", filename, strerror(errno));
      return;
    }
  }
  arity = obj->arity;
  while (arity) {
    obj = obj->first_child;
    fprintf(output, "%s:%u ", hwloc_obj_type_string(obj->type), arity);
    arity = obj->arity;
  }
  fprintf(output, "\n");
  if (output != stdout)
    fclose(output);
}
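/*
 * Hedged companion sketch: a synthetic description like the one printed
 * above can be fed back through hwloc_topology_set_synthetic() to rebuild
 * an equivalent topology. The description string below is only an
 * illustrative value, not output captured from the function above.
 */
#include <hwloc.h>

static int load_synthetic_topology(hwloc_topology_t *topo)
{
  hwloc_topology_init(topo);
  /* e.g. 2 NUMA nodes, 4 cores each, 2 PUs per core */
  if (hwloc_topology_set_synthetic(*topo, "node:2 core:4 pu:2") < 0)
    return -1;
  return hwloc_topology_load(*topo);
}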
static void print_obj_info(hwloc_obj_t obj)
{
  unsigned i;
  if (obj->type == HWLOC_OBJ_CACHE)
    dprint(obj->depth, "[%s] L%u cache size: %llu\n", hwloc_obj_type_string(obj->type),
           obj->attr->cache.depth, (unsigned long long) obj->attr->cache.size);
  else {
    if (obj->memory.total_memory || obj->memory.local_memory)
      dprint(obj->depth, "[%s:%u] total memory: %llu; local memory: %llu\n",
             hwloc_obj_type_string(obj->type), obj->os_index,
             (unsigned long long) obj->memory.total_memory,
             (unsigned long long) obj->memory.local_memory);
    else
      dprint(obj->depth, "[%s:%u]\n", hwloc_obj_type_string(obj->type), obj->os_index);
  }
  for (i = 0; i < obj->arity; i++)
    print_obj_info(obj->children[i]);
}
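/*
 * Usage sketch for print_obj_info(): dprint() is assumed to be a local
 * helper that indents by topology depth; a possible stub is given here so
 * the recursive walk from the root object is self-contained.
 */
#include <stdio.h>
#include <stdarg.h>
#include <hwloc.h>

static void dprint(unsigned depth, const char *fmt, ...)
{
  va_list ap;
  printf("%*s", (int)(2 * depth), "");  /* two spaces per level */
  va_start(ap, fmt);
  vprintf(fmt, ap);
  va_end(ap);
}

static void dump_topology(hwloc_topology_t topology)
{
  print_obj_info(hwloc_get_root_obj(topology));
}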
static int chk_mem_bind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, int print)
{
  hwloc_membind_policy_t policy;
  hwloc_bitmap_t checkset = hwloc_bitmap_alloc();
  if (hwloc_get_membind(topology, checkset, &policy, HWLOC_MEMBIND_THREAD|HWLOC_MEMBIND_BYNODESET) == -1) {
    perror("get_membind");
    hwloc_bitmap_free(checkset);
    return -1;
  }
  if (print) {
    const char *policy_name;
    switch (policy) {
    case HWLOC_MEMBIND_DEFAULT:    policy_name = "DEFAULT";    break;
    case HWLOC_MEMBIND_FIRSTTOUCH: policy_name = "FIRSTTOUCH"; break;
    case HWLOC_MEMBIND_BIND:       policy_name = "BIND";       break;
    case HWLOC_MEMBIND_INTERLEAVE: policy_name = "INTERLEAVE"; break;
    case HWLOC_MEMBIND_NEXTTOUCH:  policy_name = "NEXTTOUCH";  break;
    case HWLOC_MEMBIND_MIXED:      policy_name = "MIXED";      break;
    default:                       policy_name = "UNKNOWN";    break; /* avoid printing a NULL string */
    }
    hwloc_obj_t mem_obj = hwloc_get_first_largest_obj_inside_cpuset(topology, checkset);
    printf("membind(%s)=%s:%u\n", policy_name, hwloc_obj_type_string(mem_obj->type), mem_obj->logical_index);
  }
  if (nodeset == NULL) {
    hwloc_bitmap_free(checkset);
    return -1;
  }
  int ret = hwloc_bitmap_isequal(nodeset, checkset);
  hwloc_bitmap_free(checkset);
  return ret ? 0 : -1;
}
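/*
 * Usage sketch for chk_mem_bind(): bind the current thread's allocations
 * to the NUMA node with os_index 0, then verify the binding round-trips.
 * Assumes the hwloc 2.x HWLOC_MEMBIND_BYNODESET flag used above.
 */
#include <hwloc.h>

static int bind_and_check_mem(hwloc_topology_t topology)
{
  int rc;
  hwloc_bitmap_t nodeset = hwloc_bitmap_alloc();
  hwloc_bitmap_set(nodeset, 0);  /* NUMA node os_index 0 */
  if (hwloc_set_membind(topology, nodeset, HWLOC_MEMBIND_BIND,
                        HWLOC_MEMBIND_THREAD | HWLOC_MEMBIND_BYNODESET) == -1) {
    hwloc_bitmap_free(nodeset);
    return -1;
  }
  rc = chk_mem_bind(topology, nodeset, 1);  /* 0 on success */
  hwloc_bitmap_free(nodeset);
  return rc;
}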
int main(void)
{
  hwloc_topology_t topology;
  char *string = NULL;
  hwloc_obj_t obj;
  hwloc_bitmap_t set;

  hwloc_topology_init(&topology);
  hwloc_topology_set_synthetic(topology, SYNTHETIC_TOPOLOGY_DESCRIPTION);
  hwloc_topology_load(topology);

  set = hwloc_bitmap_alloc();
  hwloc_bitmap_sscanf(set, GIVEN_CPUSET_STRING);
  obj = hwloc_get_obj_covering_cpuset(topology, set);
  assert(obj);
  fprintf(stderr, "found covering object type %s covering cpuset %s\n",
          hwloc_obj_type_string(obj->type), GIVEN_CPUSET_STRING);
  assert(hwloc_bitmap_isincluded(set, obj->cpuset));
  hwloc_bitmap_asprintf(&string, obj->cpuset);
  fprintf(stderr, "covering object of %s is %s, expected %s\n",
          GIVEN_CPUSET_STRING, string, EXPECTED_CPUSET_STRING);
  assert(!strcmp(EXPECTED_CPUSET_STRING, string));
  free(string);

  hwloc_bitmap_sscanf(set, GIVEN_LARGESPLIT_CPUSET_STRING);
  obj = hwloc_get_obj_covering_cpuset(topology, set);
  assert(obj == hwloc_get_root_obj(topology));
  fprintf(stderr, "found system as covering object of first+last cpus cpuset %s\n",
          GIVEN_LARGESPLIT_CPUSET_STRING);

  hwloc_bitmap_sscanf(set, GIVEN_TOOLARGE_CPUSET_STRING);
  obj = hwloc_get_obj_covering_cpuset(topology, set);
  assert(!obj);
  fprintf(stderr, "found no covering object for too-large cpuset %s\n",
          GIVEN_TOOLARGE_CPUSET_STRING);

  hwloc_bitmap_free(set);
  hwloc_topology_destroy(topology);
  return EXIT_SUCCESS;
}
static int chk_cpu_bind(hwloc_topology_t topology, hwloc_cpuset_t cpuset, int print)
{
  hwloc_bitmap_t checkset = hwloc_bitmap_alloc();
  if (hwloc_get_cpubind(topology, checkset, HWLOC_CPUBIND_THREAD) == -1) {
    perror("get_cpubind");
    hwloc_bitmap_free(checkset);
    return -1;
  }
  if (print) {
    hwloc_obj_t cpu_obj = hwloc_get_first_largest_obj_inside_cpuset(topology, checkset);
    printf("cpubind=%s:%u\n", hwloc_obj_type_string(cpu_obj->type), cpu_obj->logical_index);
  }
  if (cpuset == NULL) {
    hwloc_bitmap_free(checkset);
    return -1;
  }
  int ret = hwloc_bitmap_isequal(cpuset, checkset);
  hwloc_bitmap_free(checkset);
  return ret ? 0 : -1;
}
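/*
 * Usage sketch for chk_cpu_bind(): pin the calling thread to the first PU
 * and verify that the kernel reports the same cpuset back.
 */
#include <hwloc.h>

static int bind_and_check_cpu(hwloc_topology_t topology)
{
  hwloc_obj_t pu = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0);
  if (!pu)
    return -1;
  if (hwloc_set_cpubind(topology, pu->cpuset, HWLOC_CPUBIND_THREAD) == -1)
    return -1;
  return chk_cpu_bind(topology, pu->cpuset, 1);  /* 0 on success */
}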
static void _add_hwloc_cpuset(hwloc_obj_type_t hwtype, hwloc_obj_type_t req_hwtype,
			      hwloc_obj_t obj, uint32_t taskid, int bind_verbose,
			      hwloc_bitmap_t cpuset)
{
	struct hwloc_obj *pobj;

	/* If the requested binding overlaps the granularity,
	 * use the ancestor cpuset instead of the object one. */
	if (hwloc_compare_types(hwtype, req_hwtype) > 0) {
		/* Get the parent object of req_hwtype or the one just
		 * above if not found (meaning of >0)
		 * (useful for ldoms binding with !NUMA nodes) */
		pobj = obj->parent;
		while (pobj != NULL &&
		       hwloc_compare_types(pobj->type, req_hwtype) > 0)
			pobj = pobj->parent;
		if (pobj != NULL) {
			if (bind_verbose)
				info("task/cgroup: task[%u] higher level %s "
				     "found", taskid,
				     hwloc_obj_type_string(pobj->type));
			hwloc_bitmap_or(cpuset, cpuset, pobj->allowed_cpuset);
		} else {
			/* should not be executed */
			if (bind_verbose)
				info("task/cgroup: task[%u] no higher level "
				     "found", taskid);
			hwloc_bitmap_or(cpuset, cpuset, obj->allowed_cpuset);
		}
	} else {
		hwloc_bitmap_or(cpuset, cpuset, obj->allowed_cpuset);
	}
}
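/*
 * The ancestor walk above is close to what the hwloc helper
 * hwloc_get_ancestor_obj_by_type() provides, except that the SLURM code
 * also accepts the level just above req_hwtype when no exact match exists.
 * A sketch of the exact-match case (illustration only):
 */
#include <hwloc.h>

static hwloc_const_cpuset_t ancestor_cpuset(hwloc_topology_t topology,
					    hwloc_obj_t obj,
					    hwloc_obj_type_t req_hwtype)
{
	hwloc_obj_t anc = hwloc_get_ancestor_obj_by_type(topology, req_hwtype, obj);
	return anc ? anc->cpuset : obj->cpuset;
}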
/* user to have to play with the cgroup hierarchy to modify it */ extern int task_cgroup_cpuset_set_task_affinity(slurmd_job_t *job) { int fstatus = SLURM_ERROR;
#ifndef HAVE_HWLOC
error("task/cgroup: plugin not compiled with hwloc support, " "skipping affinity."); return fstatus;
#else
hwloc_obj_type_t socket_or_node; uint32_t nldoms; uint32_t nsockets; uint32_t ncores; uint32_t npus; uint32_t nobj; uint32_t taskid = job->envtp->localid; uint32_t jntasks = job->node_tasks; uint32_t jnpus = jntasks * job->cpus_per_task; pid_t pid = job->envtp->task_pid; cpu_bind_type_t bind_type; int bind_verbose = 0; hwloc_topology_t topology; hwloc_bitmap_t cpuset; hwloc_obj_type_t hwtype; hwloc_obj_type_t req_hwtype; size_t tssize; cpu_set_t ts; bind_type = job->cpu_bind_type ; if (conf->task_plugin_param & CPU_BIND_VERBOSE || bind_type & CPU_BIND_VERBOSE) bind_verbose = 1 ; /* Allocate and initialize hwloc objects */ hwloc_topology_init(&topology); cpuset = hwloc_bitmap_alloc(); hwloc_topology_load(topology); if ( hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) > hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET) ) { /* One socket contains multiple NUMA-nodes * like AMD Opteron 6000 series etc. * In such case, use NUMA-node instead of socket. */ socket_or_node = HWLOC_OBJ_NODE; } else { socket_or_node = HWLOC_OBJ_SOCKET; } if (bind_type & CPU_BIND_NONE) { if (bind_verbose) info("task/cgroup: task[%u] is requesting no affinity", taskid); return 0; } else if (bind_type & CPU_BIND_TO_THREADS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "thread level binding",taskid); req_hwtype = HWLOC_OBJ_PU; } else if (bind_type & CPU_BIND_TO_CORES) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "core level binding",taskid); req_hwtype = HWLOC_OBJ_CORE; } else if (bind_type & CPU_BIND_TO_SOCKETS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "socket level binding",taskid); req_hwtype = socket_or_node; } else if (bind_type & CPU_BIND_TO_LDOMS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "ldom level binding",taskid); req_hwtype = HWLOC_OBJ_NODE; } else { if (bind_verbose) info("task/cgroup: task[%u] using core level binding" " by default",taskid); req_hwtype = HWLOC_OBJ_CORE; } /* * Perform the topology detection. It will only get allowed PUs. * Detect in the same time the granularity to use for binding. * The granularity can be relaxed from threads to cores if enough * cores are available as with hyperthread support, ntasks-per-core * param can let us have access to more threads per core for each * task * Revert back to machine granularity if no finer-grained granularity * matching the request is found. This will result in no affinity * applied. * The detected granularity will be used to find where to best place * the task, then the cpu_bind option will be used to relax the * affinity constraint and use more PUs. (i.e. use a core granularity * to dispatch the tasks across the sockets and then provide access * to each task to the cores of its socket.) 
*/ npus = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology, socket_or_node); nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE); hwtype = HWLOC_OBJ_MACHINE; nobj = 1; if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) { hwtype = HWLOC_OBJ_PU; nobj = npus; } if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) { hwtype = HWLOC_OBJ_CORE; nobj = ncores; } if (nsockets >= jntasks && bind_type & CPU_BIND_TO_SOCKETS) { hwtype = socket_or_node; nobj = nsockets; } /* * HWLOC returns all the NUMA nodes available regardless of the * number of underlying sockets available (regardless of the allowed * resources). So there is no guarantee that each ldom will be populated * with usable sockets. So add a simple check that at least ensure that * we have as many sockets as ldoms before moving to ldoms granularity */ if (nldoms >= jntasks && nsockets >= nldoms && bind_type & CPU_BIND_TO_LDOMS) { hwtype = HWLOC_OBJ_NODE; nobj = nldoms; } /* * Bind the detected object to the taskid, respecting the * granularity, using the designated or default distribution * method (block or cyclic). * If not enough objects to do the job, revert to no affinity mode */ if (hwloc_compare_types(hwtype,HWLOC_OBJ_MACHINE) == 0) { info("task/cgroup: task[%u] disabling affinity because of %s " "granularity",taskid,hwloc_obj_type_string(hwtype)); } else if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0 && jnpus > nobj) { info("task/cgroup: task[%u] not enough %s objects, disabling " "affinity",taskid,hwloc_obj_type_string(hwtype)); } else { char *str; if (bind_verbose) { info("task/cgroup: task[%u] using %s granularity", taskid,hwloc_obj_type_string(hwtype)); } /* There are two "distributions," controlled by the * -m option of srun and friends. The first is the * distribution of tasks to nodes. The second is the * distribution of allocated cpus to tasks for * binding. This code is handling the second * distribution. Here's how the values get set, based * on the value of -m * * SLURM_DIST_CYCLIC = srun -m cyclic * SLURM_DIST_BLOCK = srun -m block * SLURM_DIST_CYCLIC_CYCLIC = srun -m cyclic:cyclic * SLURM_DIST_BLOCK_CYCLIC = srun -m block:cyclic * * In the first two cases, the user only specified the * first distribution. The second distribution * defaults to cyclic. In the second two cases, the * user explicitly requested a second distribution of * cyclic. So all these four cases correspond to a * second distribution of cyclic. So we want to call * _task_cgroup_cpuset_dist_cyclic. * * If the user explicitly specifies a second * distribution of block, or if * CR_CORE_DEFAULT_DIST_BLOCK is configured and the * user does not explicitly specify a second * distribution of cyclic, the second distribution is * block, and we need to call * _task_cgroup_cpuset_dist_block. In these cases, * task_dist would be set to SLURM_DIST_CYCLIC_BLOCK * or SLURM_DIST_BLOCK_BLOCK. * * You can see the equivalent code for the * task/affinity plugin in * src/plugins/task/affinity/dist_tasks.c, around line 384. 
*/ switch (job->task_dist) { case SLURM_DIST_CYCLIC: case SLURM_DIST_BLOCK: case SLURM_DIST_CYCLIC_CYCLIC: case SLURM_DIST_BLOCK_CYCLIC: _task_cgroup_cpuset_dist_cyclic( topology, hwtype, req_hwtype, job, bind_verbose, cpuset); break; default: _task_cgroup_cpuset_dist_block( topology, hwtype, req_hwtype, nobj, job, bind_verbose, cpuset); } hwloc_bitmap_asprintf(&str, cpuset); tssize = sizeof(cpu_set_t); if (hwloc_cpuset_to_glibc_sched_affinity(topology,cpuset, &ts,tssize) == 0) { fstatus = SLURM_SUCCESS; if (sched_setaffinity(pid,tssize,&ts)) { error("task/cgroup: task[%u] unable to set " "taskset '%s'",taskid,str); fstatus = SLURM_ERROR; } else if (bind_verbose) { info("task/cgroup: task[%u] taskset '%s' is set" ,taskid,str); } } else { error("task/cgroup: task[%u] unable to build " "taskset '%s'",taskid,str); fstatus = SLURM_ERROR; } free(str); } /* Destroy hwloc objects */ hwloc_bitmap_free(cpuset); hwloc_topology_destroy(topology); return fstatus;
#endif
}
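/*
 * Minimal standalone sketch of the cpuset -> cpu_set_t -> sched_setaffinity()
 * sequence used by the function above, without the SLURM plumbing.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <hwloc.h>
#include <hwloc/glibc-sched.h>

static int apply_cpuset(hwloc_topology_t topology,
			hwloc_const_cpuset_t cpuset, pid_t pid)
{
	cpu_set_t ts;
	if (hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset,
						 &ts, sizeof(ts)))
		return -1;	/* conversion failed */
	return sched_setaffinity(pid, sizeof(ts), &ts);
}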
/* user to have to play with the cgroup hierarchy to modify it */ extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job) { int fstatus = SLURM_ERROR;
#ifndef HAVE_HWLOC
error("task/cgroup: plugin not compiled with hwloc support, " "skipping affinity."); return fstatus;
#else
char mstr[1 + CPU_SETSIZE / 4]; cpu_bind_type_t bind_type; cpu_set_t ts; hwloc_obj_t obj; hwloc_obj_type_t socket_or_node; hwloc_topology_t topology; hwloc_bitmap_t cpuset; hwloc_obj_type_t hwtype; hwloc_obj_type_t req_hwtype; int bind_verbose = 0; int rc = SLURM_SUCCESS, match; pid_t pid = job->envtp->task_pid; size_t tssize; uint32_t nldoms; uint32_t nsockets; uint32_t ncores; uint32_t npus; uint32_t nobj; uint32_t taskid = job->envtp->localid; uint32_t jntasks = job->node_tasks; uint32_t jnpus; /* Allocate and initialize hwloc objects */ hwloc_topology_init(&topology); hwloc_topology_load(topology); cpuset = hwloc_bitmap_alloc(); int spec_threads = 0; if (job->batch) { jnpus = job->cpus; job->cpus_per_task = job->cpus; } else jnpus = jntasks * job->cpus_per_task; bind_type = job->cpu_bind_type; if ((conf->task_plugin_param & CPU_BIND_VERBOSE) || (bind_type & CPU_BIND_VERBOSE)) bind_verbose = 1 ; if ( hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) > hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET) ) { /* One socket contains multiple NUMA-nodes * like AMD Opteron 6000 series etc. * In such case, use NUMA-node instead of socket. */ socket_or_node = HWLOC_OBJ_NODE; } else { socket_or_node = HWLOC_OBJ_SOCKET; } if (bind_type & CPU_BIND_NONE) { if (bind_verbose) info("task/cgroup: task[%u] is requesting no affinity", taskid); return 0; } else if (bind_type & CPU_BIND_TO_THREADS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "thread level binding",taskid); req_hwtype = HWLOC_OBJ_PU; } else if (bind_type & CPU_BIND_TO_CORES) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "core level binding",taskid); req_hwtype = HWLOC_OBJ_CORE; } else if (bind_type & CPU_BIND_TO_SOCKETS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "socket level binding",taskid); req_hwtype = socket_or_node; } else if (bind_type & CPU_BIND_TO_LDOMS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "ldom level binding",taskid); req_hwtype = HWLOC_OBJ_NODE; } else if (bind_type & CPU_BIND_TO_BOARDS) { if (bind_verbose) info("task/cgroup: task[%u] is requesting " "board level binding",taskid); req_hwtype = HWLOC_OBJ_GROUP; } else if (bind_type & bind_mode_ldom) { req_hwtype = HWLOC_OBJ_NODE; } else { if (bind_verbose) info("task/cgroup: task[%u] using core level binding" " by default",taskid); req_hwtype = HWLOC_OBJ_CORE; } /* * Perform the topology detection. It will only get allowed PUs. * Detect in the same time the granularity to use for binding. * The granularity can be relaxed from threads to cores if enough * cores are available as with hyperthread support, ntasks-per-core * param can let us have access to more threads per core for each * task * Revert back to machine granularity if no finer-grained granularity * matching the request is found. This will result in no affinity * applied. * The detected granularity will be used to find where to best place * the task, then the cpu_bind option will be used to relax the * affinity constraint and use more PUs. (i.e. use a core granularity * to dispatch the tasks across the sockets and then provide access * to each task to the cores of its socket.) 
*/ npus = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology, socket_or_node); nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE);
//info("PU:%d CORE:%d SOCK:%d LDOM:%d", npus, ncores, nsockets, nldoms);
hwtype = HWLOC_OBJ_MACHINE; nobj = 1; if ((job->job_core_spec != (uint16_t) NO_VAL) && (job->job_core_spec & CORE_SPEC_THREAD) && (job->job_core_spec != CORE_SPEC_THREAD)) { spec_threads = job->job_core_spec & (~CORE_SPEC_THREAD); } if (npus >= (jnpus + spec_threads) || bind_type & CPU_BIND_TO_THREADS) { hwtype = HWLOC_OBJ_PU; nobj = npus; } if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) { hwtype = HWLOC_OBJ_CORE; nobj = ncores; } if (nsockets >= jntasks && bind_type & CPU_BIND_TO_SOCKETS) { hwtype = socket_or_node; nobj = nsockets; } /* * HWLOC returns all the NUMA nodes available regardless of the * number of underlying sockets available (regardless of the allowed * resources). So there is no guarantee that each ldom will be populated * with usable sockets. So add a simple check that at least ensure that * we have as many sockets as ldoms before moving to ldoms granularity */ if (nldoms >= jntasks && nsockets >= nldoms && bind_type & (CPU_BIND_TO_LDOMS | bind_mode_ldom)) { hwtype = HWLOC_OBJ_NODE; nobj = nldoms; } /* * If not enough objects to do the job, revert to no affinity mode */ if (hwloc_compare_types(hwtype, HWLOC_OBJ_MACHINE) == 0) { info("task/cgroup: task[%u] disabling affinity because of %s " "granularity",taskid, hwloc_obj_type_string(hwtype)); } else if ((hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) && (nobj < jnpus)) { info("task/cgroup: task[%u] not enough %s objects (%d < %d), " "disabling affinity", taskid, hwloc_obj_type_string(hwtype), nobj, jnpus); } else if (bind_type & bind_mode) { /* Explicit binding mode specified by the user * Bind the taskid in accordance with the specified mode */ obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_MACHINE, 0); match = hwloc_bitmap_isequal(obj->complete_cpuset, obj->allowed_cpuset); if ((job->job_core_spec == (uint16_t) NO_VAL) && !match) { info("task/cgroup: entire node must be allocated, " "disabling affinity, task[%u]", taskid); fprintf(stderr, "Requested cpu_bind option requires " "entire node to be allocated; disabling " "affinity\n"); } else { if (bind_verbose) { info("task/cgroup: task[%u] is requesting " "explicit binding mode", taskid); } _get_sched_cpuset(topology, hwtype, req_hwtype, &ts, job); tssize = sizeof(cpu_set_t); fstatus = SLURM_SUCCESS; if (job->job_core_spec != (uint16_t) NO_VAL) _validate_mask(taskid, obj, &ts); if ((rc = sched_setaffinity(pid, tssize, &ts))) { error("task/cgroup: task[%u] unable to set " "mask 0x%s", taskid, cpuset_to_str(&ts, mstr)); error("sched_setaffinity rc = %d", rc); fstatus = SLURM_ERROR; } else if (bind_verbose) { info("task/cgroup: task[%u] mask 0x%s", taskid, cpuset_to_str(&ts, mstr)); } _slurm_chkaffinity(&ts, job, rc); } } else { /* Bind the detected object to the taskid, respecting the * granularity, using the designated or default distribution * method (block or cyclic). */ char *str; if (bind_verbose) { info("task/cgroup: task[%u] using %s granularity dist %u", taskid, hwloc_obj_type_string(hwtype), job->task_dist); } /* See srun man page for detailed information on --distribution * option. 
* * You can see the equivalent code for the * task/affinity plugin in * src/plugins/task/affinity/dist_tasks.c, around line 368 */ switch (job->task_dist & SLURM_DIST_NODESOCKMASK) { case SLURM_DIST_BLOCK_BLOCK: case SLURM_DIST_CYCLIC_BLOCK: case SLURM_DIST_PLANE: /* tasks are distributed in blocks within a plane */ _task_cgroup_cpuset_dist_block(topology, hwtype, req_hwtype, nobj, job, bind_verbose, cpuset); break; case SLURM_DIST_ARBITRARY: case SLURM_DIST_BLOCK: case SLURM_DIST_CYCLIC: case SLURM_DIST_UNKNOWN: if (slurm_get_select_type_param() & CR_CORE_DEFAULT_DIST_BLOCK) { _task_cgroup_cpuset_dist_block(topology, hwtype, req_hwtype, nobj, job, bind_verbose, cpuset); break; } /* We want to fall through here if we aren't doing a default dist block. */ default: _task_cgroup_cpuset_dist_cyclic(topology, hwtype, req_hwtype, job, bind_verbose, cpuset); break; } hwloc_bitmap_asprintf(&str, cpuset); tssize = sizeof(cpu_set_t); if (hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset, &ts, tssize) == 0) { fstatus = SLURM_SUCCESS; if ((rc = sched_setaffinity(pid, tssize, &ts))) { error("task/cgroup: task[%u] unable to set " "taskset '%s'", taskid, str); fstatus = SLURM_ERROR; } else if (bind_verbose) { info("task/cgroup: task[%u] set taskset '%s'", taskid, str); } _slurm_chkaffinity(&ts, job, rc); } else { error("task/cgroup: task[%u] unable to build " "taskset '%s'",taskid,str); fstatus = SLURM_ERROR; } free(str); } /* Destroy hwloc objects */ hwloc_bitmap_free(cpuset); hwloc_topology_destroy(topology); return fstatus;
#endif
}
/* mapping by hwloc object looks a lot like mapping by node, * but has the added complication of possibly having different * numbers of objects on each node */ int orte_rmaps_rr_byobj(orte_job_t *jdata, orte_app_context_t *app, opal_list_t *node_list, orte_std_cntr_t num_slots, orte_vpid_t num_procs, hwloc_obj_type_t target, unsigned cache_level) { int i, j, nprocs_mapped; orte_node_t *node; orte_proc_t *proc; opal_list_item_t *item; int num_procs_to_assign, nperobj, nprocs, nxtra_objs=0; int extra_procs_to_assign=0, nxtra_nodes=0, idx; hwloc_obj_t obj=NULL; unsigned int nobjs; float balance; bool add_one=false; /* there are two modes for mapping by object: span and not-span. The * span mode essentially operates as if there was just a single * "super-node" in the system - i.e., it balances the load across * all objects of the indicated type regardless of their location. * In essence, it acts as if we placed one proc on each object, cycling * across all objects on all nodes, and then wrapped around to place * another proc on each object, doing so until all procs were placed. * * In contrast, the non-span mode operates similar to byslot mapping. * All slots on each node are filled, assigning each proc to an object * on that node in a balanced fashion, and then the mapper moves on * to the next node. Thus, procs tend to be "front loaded" onto the * list of nodes, as opposed to being "load balanced" in the span mode */ if (ORTE_MAPPING_SPAN & jdata->map->mapping) { return byobj_span(jdata, app, node_list, num_slots, num_procs, target, cache_level); } opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping no-span by %s for job %s slots %d num_procs %lu", hwloc_obj_type_string(target), ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs); /* quick check to see if we can map all the procs - can't * do more because we don't know how many total objects exist * across all the nodes */ if (num_slots < (app->num_procs * orte_rmaps_base.cpus_per_rank)) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app); return ORTE_ERR_SILENT; } /* compute how many extra procs to put on each node */ if (1 == opal_list_get_size(node_list)) { /* if there is only one node, then they all have to go on it */ extra_procs_to_assign = app->num_procs; } else { balance = (float)(((jdata->num_procs + app->num_procs)*orte_rmaps_base.cpus_per_rank) - num_slots) / (float)opal_list_get_size(node_list); extra_procs_to_assign = (int)balance; if (0 < (balance - (float)extra_procs_to_assign)) { /* compute how many nodes need an extra proc */ nxtra_nodes = ((jdata->num_procs + app->num_procs)*orte_rmaps_base.cpus_per_rank) - num_slots - (extra_procs_to_assign * opal_list_get_size(node_list)); /* add one so that we add an extra proc to the first nodes * until all procs are mapped */ extra_procs_to_assign++; /* flag that we added one */ add_one = true; } } } opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping no-span by %s extra_procs %d extra_nodes %d", hwloc_obj_type_string(target), extra_procs_to_assign, nxtra_nodes); nprocs_mapped = 0; for (item = opal_list_get_first(node_list); item != opal_list_get_end(node_list); item = opal_list_get_next(item)) { node = (orte_node_t*)item; /* bozo check */ if (NULL == node->topology) { orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); 
return ORTE_ERR_SILENT; } /* add this node to the map, if reqd */ if (!node->mapped) { if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(idx); return idx; } node->mapped = true; OBJ_RETAIN(node); /* maintain accounting on object */ ++(jdata->map->num_nodes); } /* compute the number of procs to go on this node */ if (add_one) { if (0 == nxtra_nodes) { --extra_procs_to_assign; add_one = false; } else { --nxtra_nodes; } } if (node->slots <= node->slots_inuse) { /* everybody takes at least the extras */ num_procs_to_assign = extra_procs_to_assign; } else { num_procs_to_assign = (node->slots - node->slots_inuse)/orte_rmaps_base.cpus_per_rank + extra_procs_to_assign; if (app->num_procs < num_procs_to_assign) { /* might have more slots than procs */ num_procs_to_assign = app->num_procs; } } /* get the number of objects of this type on this node */ nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE); opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:byobj: nprocs-to-assign %d for %d objs on node %s", num_procs_to_assign, nobjs, node->name); /* if there are no objects of this type, then report the error * and abort - this can happen, for example, on systems that * don't report "sockets" as an independent object. However, IF * this object is the default one - i.e., not specified by the * user - then we can fall back to mapping by slot */ if (0 == nobjs) { if (ORTE_MAPPING_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects", true, hwloc_obj_type_string(target), node->name); return ORTE_ERR_SILENT; } else { /* this was the default mapping policy, so clear the map * of any prior work and indicate that map-by slot is reqd */ for (i=0; i < jdata->map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) { continue; } for (idx=0; idx < node->procs->size; idx++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, idx))) { continue; } if (proc->name.jobid != jdata->jobid) { continue; } --node->num_procs; OBJ_RELEASE(proc); opal_pointer_array_set_item(node->procs, idx, NULL); } if (0 == node->num_procs) { node->mapped = false; OBJ_RELEASE(node); opal_pointer_array_set_item(jdata->map->nodes, i, NULL); } } return ORTE_ERR_NOT_SUPPORTED; } } /* compute the number of procs to go on each object */ nperobj = num_procs_to_assign / nobjs; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:byobj: placing %d procs on each object", nperobj); if ((int)(nperobj * nobjs) < num_procs_to_assign) { /* compute how many objs need an extra proc */ nxtra_objs = num_procs_to_assign - nperobj * nobjs; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:byobj: adding 1 extra proc to the first %d objects, if needed", nxtra_objs); } /* loop through the number of objects */ for (i=0; i < (int)nobjs && nprocs_mapped < (int)app->num_procs; i++) { /* get the hwloc object */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology, target, cache_level, i, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } /* map the reqd number of procs */ if (0 < nxtra_objs) { nprocs = nperobj + 1; --nxtra_objs; } else { nprocs = nperobj; } for (j=0; j < nprocs && nprocs_mapped < app->num_procs; j++) { if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, 
app->idx))) { return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; proc->locale = obj; } } /* not all nodes are equal, so only set oversubscribed for * this node if it is in that state */ if (node->slots < (int)node->num_procs) { /* flag the node as oversubscribed so that sched-yield gets * properly set */ node->oversubscribed = true; } if (nprocs_mapped == app->num_procs) { /* we are done */ break; } } return ORTE_SUCCESS; }
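/*
 * The per-object arithmetic used by the mapper above, isolated: n procs
 * over k objects gives floor(n/k) per object, with the remainder spread
 * one-per-object over the first (n mod k) objects.
 */
static void split_procs(int num_procs_to_assign, int nobjs,
                        int *nperobj, int *nxtra_objs)
{
    *nperobj = num_procs_to_assign / nobjs;
    *nxtra_objs = num_procs_to_assign - *nperobj * nobjs;
    /* e.g. 10 procs over 4 objects: 2 per object, first 2 objects get 3 */
}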
static int byobj_span(orte_job_t *jdata, orte_app_context_t *app, opal_list_t *node_list, orte_std_cntr_t num_slots, orte_vpid_t num_procs, hwloc_obj_type_t target, unsigned cache_level) { int i, j, nprocs_mapped, lag, delta, navg; orte_node_t *node; orte_proc_t *proc; opal_list_item_t *item; int num_procs_to_assign, nperobj, nprocs, nxtra_objs=0; int extra_procs_to_assign=0, nxtra_nodes=0, idx; hwloc_obj_t obj=NULL; unsigned int nobjs; float balance; bool add_one=false; bool oversubscribed=false; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping span by %s for job %s slots %d num_procs %lu", hwloc_obj_type_string(target), ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs); /* quick check to see if we can map all the procs - can't * do more because we don't know how many total objects exist * across all the nodes */ if (num_slots < (int)app->num_procs) { if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) { orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error", true, app->num_procs, app->app); return ORTE_ERR_SILENT; } oversubscribed = true; } /* divide the procs evenly across all nodes - this is the * average we have to maintain as we go, but we adjust * the number on each node to reflect its available slots. * Obviously, if all nodes have the same number of slots, * then the avg is what we get on each node - this is * the most common situation. */ navg = app->num_procs / opal_list_get_size(node_list); if (0 == navg) { /* if there are less procs than nodes, we have to * place at least one/node */ navg = 1; } /* compute how many extra procs to put on each node */ balance = (float)((jdata->num_procs + app->num_procs) - (navg * opal_list_get_size(node_list))) / (float)opal_list_get_size(node_list); extra_procs_to_assign = (int)balance; if (0 < (balance - (float)extra_procs_to_assign)) { /* compute how many nodes need an extra proc */ nxtra_nodes = (jdata->num_procs + app->num_procs) - ((navg + extra_procs_to_assign) * opal_list_get_size(node_list)); /* add one so that we add an extra proc to the first nodes * until all procs are mapped */ extra_procs_to_assign++; /* flag that we added one */ add_one = true; } opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr: mapping by %s navg %d extra_procs %d extra_nodes %d", hwloc_obj_type_string(target), navg, extra_procs_to_assign, nxtra_nodes); nprocs_mapped = 0; lag = 0; for (item = opal_list_get_first(node_list); item != opal_list_get_end(node_list); item = opal_list_get_next(item)) { node = (orte_node_t*)item; /* bozo check */ if (NULL == node->topology) { orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing", true, node->name); return ORTE_ERR_SILENT; } /* add this node to the map, if reqd */ if (!node->mapped) { if (ORTE_SUCCESS > (idx = opal_pointer_array_add(jdata->map->nodes, (void*)node))) { ORTE_ERROR_LOG(idx); return idx; } node->mapped = true; OBJ_RETAIN(node); /* maintain accounting on object */ ++(jdata->map->num_nodes); } /* compute the number of procs to go on this node */ if (add_one) { if (0 == nxtra_nodes) { --extra_procs_to_assign; add_one = false; } else { --nxtra_nodes; } } if (oversubscribed) { /* everybody just takes their share */ num_procs_to_assign = navg + extra_procs_to_assign; } else { /* if we are not oversubscribed, then there are enough * slots to handle all the procs. 
However, not every * node will have the same number of slots, so we * have to track how many procs to "shift" elsewhere * to make up the difference */ if (node->slots <= node->slots_inuse) { /* if there are no extras to take, then we can * safely remove this node as we don't need it */ if (0 == extra_procs_to_assign) { opal_pointer_array_set_item(jdata->map->nodes, node->index, NULL); OBJ_RELEASE(node); --(jdata->map->num_nodes); /* update how many we are lagging behind */ lag += navg; continue; } /* everybody has to take at least the extras */ num_procs_to_assign = extra_procs_to_assign; /* update how many we are lagging behind */ lag += navg; } else { /* if slots < avg, then take all */ if ((node->slots - node->slots_inuse) < navg) { num_procs_to_assign = (node->slots - node->slots_inuse) + extra_procs_to_assign; /* update how many we are lagging behind */ lag += navg - (node->slots - node->slots_inuse); } else { /* take the avg plus as much of the "lag" as we can */ delta = 0; if (0 < lag) { delta = (node->slots - node->slots_inuse) - navg; if (lag < delta) { delta = lag; } lag -= delta; } num_procs_to_assign = navg + delta + extra_procs_to_assign; } } } /* get the number of objects of this type on this node */ nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target, cache_level, OPAL_HWLOC_AVAILABLE); opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:byobj: found %d objs on node %s", nobjs, node->name); /* compute the number of procs to go on each object */ nperobj = num_procs_to_assign / nobjs; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:byobj: placing %d procs on each object", nperobj); if ((int)(nperobj * nobjs) < num_procs_to_assign) { /* compute how many objs need an extra proc */ nxtra_objs = num_procs_to_assign - nperobj * nobjs; opal_output_verbose(2, orte_rmaps_base_framework.framework_output, "mca:rmaps:rr:byobj: adding 1 extra proc to the first %d objects, if needed", nxtra_objs); } /* loop through the number of objects */ for (i=0; i < (int)nobjs && nprocs_mapped < (int)app->num_procs; i++) { /* get the hwloc object */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(node->topology, target, cache_level, i, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); return ORTE_ERR_NOT_FOUND; } /* map the reqd number of procs */ if (0 < nxtra_objs) { nprocs = nperobj + 1; --nxtra_objs; } else { nprocs = nperobj; } for (j=0; j < nprocs && nprocs_mapped < app->num_procs; j++) { if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) { return ORTE_ERR_OUT_OF_RESOURCE; } nprocs_mapped++; proc->locale = obj; } /* keep track of the node we last used */ jdata->bookmark = node; } /* not all nodes are equal, so only set oversubscribed for * this node if it is in that state */ if (node->slots < (int)node->num_procs) { /* flag the node as oversubscribed so that sched-yield gets * properly set */ node->oversubscribed = true; } if (nprocs_mapped == app->num_procs) { /* we are done */ break; } } return ORTE_SUCCESS; }
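/*
 * A simplified, integer-only restatement of the span-mode share
 * computation above (the original works with a float "balance"): spread
 * total_procs across nnodes, and give the first nxtra_nodes nodes one
 * extra proc when the division is not exact.
 */
static void span_shares(int total_procs, int nnodes,
                        int *navg, int *nxtra_nodes)
{
    *navg = total_procs / nnodes;
    if (*navg == 0)
        *navg = 1;                 /* at least one proc per node */
    *nxtra_nodes = total_procs - *navg * nnodes;
    /* e.g. 10 procs over 4 nodes: navg 2, first 2 nodes take 3 */
}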
static int bind_in_place(orte_job_t *jdata, hwloc_obj_type_t target, unsigned cache_level) { /* traverse the hwloc topology tree on each node downwards * until we find an unused object of type target - and then bind * the process to that target */ int i, j; orte_job_map_t *map; orte_node_t *node; orte_proc_t *proc; hwloc_cpuset_t cpus; unsigned int idx, ncpus; struct hwloc_topology_support *support; opal_hwloc_obj_data_t *data; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bind in place for job %s with bindings %s", ORTE_JOBID_PRINT(jdata->jobid), opal_hwloc_base_print_binding(jdata->map->binding)); /* initialize */ map = jdata->map; for (i=0; i < map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { continue; } if (!orte_do_not_launch) { /* if we don't want to launch, then we are just testing the system, * so ignore questions about support capabilities */ support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology); /* check if topology supports cpubind - have to be careful here * as Linux doesn't currently support thread-level binding. This * may change in the future, though, and it isn't clear how hwloc * interprets the current behavior. So check both flags to be sure. */ if (!support->cpubind->set_thisproc_cpubind && !support->cpubind->set_thisthread_cpubind) { if (!OPAL_BINDING_REQUIRED(map->binding) || !OPAL_BINDING_POLICY_IS_SET(map->binding)) { /* we are not required to bind, so ignore this */ continue; } orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name); return ORTE_ERR_SILENT; } /* check if topology supports membind - have to be careful here * as hwloc treats this differently than I (at least) would have * expected. Per hwloc, Linux memory binding is at the thread, * and not process, level. Thus, hwloc sets the "thisproc" flag * to "false" on all Linux systems, and uses the "thisthread" flag * to indicate binding capability - don't warn if the user didn't * specifically request binding */ if (!support->membind->set_thisproc_membind && !support->membind->set_thisthread_membind && OPAL_BINDING_POLICY_IS_SET(map->binding)) { if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name); membind_warned = true; } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name); return ORTE_ERR_SILENT; } } } /* some systems do not report cores, and so we can get a situation where our * default binding policy will fail for no necessary reason. 
So if we are * computing a binding due to our default policy, and no cores are found * on this node, just silently skip it - we will not bind */ if (!OPAL_BINDING_POLICY_IS_SET(map->binding) && HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "Unable to bind-to core by default on node %s as no cores detected", node->name); continue; } /* we share topologies in order * to save space, so we need to reset the usage info to reflect * our own current state */ reset_usage(node, jdata->jobid); /* cycle thru the procs */ for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } /* ignore procs from other jobs */ if (proc->name.jobid != jdata->jobid) { continue; } /* ignore procs that have already been bound - should * never happen, but safer */ if (NULL != proc->cpu_bitmap) { continue; } /* get the index of this location */ if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, proc->locale, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_SILENT; } /* track the number bound */ data = (opal_hwloc_obj_data_t*)proc->locale->userdata; data->num_bound++; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "BINDING PROC %s TO %s NUMBER %u", ORTE_NAME_PRINT(&proc->name), hwloc_obj_type_string(proc->locale->type), idx); /* get the number of cpus under this location */ if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, proc->locale))) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name); return ORTE_ERR_SILENT; } /* error out if adding a proc would cause overload and that wasn't allowed, * and it wasn't a default binding policy (i.e., the user requested it) */ if (ncpus < data->num_bound && !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) && OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, opal_hwloc_base_print_binding(map->binding), node->name, data->num_bound, ncpus); return ORTE_ERR_SILENT; } /* bind the proc here */ cpus = opal_hwloc_base_get_available_cpus(node->topology, proc->locale); hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus); /* record the location */ proc->bind_location = proc->locale; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s BOUND PROC %s TO %s[%s:%u] on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), proc->cpu_bitmap, hwloc_obj_type_string(proc->locale->type), idx, node->name); } } return ORTE_SUCCESS; }
static void hwloc_info_show_obj(hwloc_obj_t obj, const char *type, const char *prefix, int verbose) { char s[128]; unsigned i; if (verbose < 0) return; printf("%s type = %s\n", prefix, hwloc_obj_type_string(obj->type)); printf("%s full type = %s\n", prefix, type); printf("%s logical index = %u\n", prefix, obj->logical_index); if (obj->os_index != (unsigned) -1) printf("%s os index = %u\n", prefix, obj->os_index); if (obj->name) printf("%s name = %s\n", prefix, obj->name); if (obj->depth != (unsigned) -1) printf("%s depth = %u\n", prefix, obj->depth); printf("%s sibling rank = %u\n", prefix, obj->sibling_rank); printf("%s children = %u\n", prefix, obj->arity); if (obj->memory.local_memory) printf("%s local memory = %llu\n", prefix, (unsigned long long) obj->memory.local_memory); if (obj->memory.total_memory) printf("%s total memory = %llu\n", prefix, (unsigned long long) obj->memory.total_memory); if (obj->cpuset) { hwloc_bitmap_snprintf(s, sizeof(s), obj->cpuset); printf("%s cpuset = %s\n", prefix, s); } if (obj->complete_cpuset) { hwloc_bitmap_snprintf(s, sizeof(s), obj->complete_cpuset); printf("%s complete cpuset = %s\n", prefix, s); } if (obj->online_cpuset) { hwloc_bitmap_snprintf(s, sizeof(s), obj->online_cpuset); printf("%s online cpuset = %s\n", prefix, s); } if (obj->allowed_cpuset) { hwloc_bitmap_snprintf(s, sizeof(s), obj->allowed_cpuset); printf("%s allowed cpuset = %s\n", prefix, s); } if (obj->nodeset) { hwloc_bitmap_snprintf(s, sizeof(s), obj->nodeset); printf("%s nodeset = %s\n", prefix, s); } if (obj->complete_nodeset) { hwloc_bitmap_snprintf(s, sizeof(s), obj->complete_nodeset); printf("%s complete nodeset = %s\n", prefix, s); } if (obj->allowed_nodeset) { hwloc_bitmap_snprintf(s, sizeof(s), obj->allowed_nodeset); printf("%s allowed nodeset = %s\n", prefix, s); } switch (obj->type) { case HWLOC_OBJ_CACHE: printf("%s attr cache depth = %u\n", prefix, obj->attr->cache.depth); switch (obj->attr->cache.type) { case HWLOC_OBJ_CACHE_UNIFIED: printf("%s attr cache type = Unified\n", prefix); break; case HWLOC_OBJ_CACHE_DATA: printf("%s attr cache type = Data\n", prefix); break; case HWLOC_OBJ_CACHE_INSTRUCTION: printf("%s attr cache type = Instruction\n", prefix); break; } printf("%s attr cache size = %llu\n", prefix, (unsigned long long) obj->attr->cache.size); printf("%s attr cache line size = %u\n", prefix, obj->attr->cache.linesize); if (obj->attr->cache.associativity == -1) printf("%s attr cache ways = Fully-associative\n", prefix); else if (obj->attr->cache.associativity != 0) printf("%s attr cache ways = %d\n", prefix, obj->attr->cache.associativity); break; case HWLOC_OBJ_GROUP: printf("%s attr group depth = %u\n", prefix, obj->attr->group.depth); break; case HWLOC_OBJ_BRIDGE: switch (obj->attr->bridge.upstream_type) { case HWLOC_OBJ_BRIDGE_HOST: printf("%s attr bridge upstream type = Host\n", prefix); break; case HWLOC_OBJ_BRIDGE_PCI: printf("%s attr bridge upstream type = PCI\n", prefix); printf("%s attr PCI bus id = %04x:%02x:%02x.%01x\n", prefix, obj->attr->pcidev.domain, obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func); printf("%s attr PCI class = %04x\n", prefix, obj->attr->pcidev.class_id); printf("%s attr PCI id = %04x:%04x\n", prefix, obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id); if (obj->attr->pcidev.linkspeed) printf("%s attr PCI linkspeed = %f GB/s\n", prefix, obj->attr->pcidev.linkspeed); break; } switch (obj->attr->bridge.downstream_type) { case HWLOC_OBJ_BRIDGE_HOST: assert(0); case HWLOC_OBJ_BRIDGE_PCI: printf("%s attr 
bridge downstream type = PCI\n", prefix); printf("%s attr PCI secondary bus = %02x\n", prefix, obj->attr->bridge.downstream.pci.secondary_bus); printf("%s attr PCI subordinate bus = %02x\n", prefix, obj->attr->bridge.downstream.pci.subordinate_bus); break; } break; case HWLOC_OBJ_PCI_DEVICE: printf("%s attr PCI bus id = %04x:%02x:%02x.%01x\n", prefix, obj->attr->pcidev.domain, obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func); printf("%s attr PCI class = %04x\n", prefix, obj->attr->pcidev.class_id); printf("%s attr PCI id = %04x:%04x\n", prefix, obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id); if (obj->attr->pcidev.linkspeed) printf("%s attr PCI linkspeed = %f GB/s\n", prefix, obj->attr->pcidev.linkspeed); break; case HWLOC_OBJ_OS_DEVICE: printf("%s attr osdev type = %s\n", prefix, type); break; default: /* nothing to show */ break; } for(i=0; i<obj->infos_count; i++) { printf("%s info %s = %s\n", prefix, obj->infos[i].name, obj->infos[i].value); } }
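/*
 * Companion sketch for the info key/value dump at the end of the function
 * above: a single key can also be fetched directly. "Backend" is one key
 * hwloc commonly sets on the root object; the available keys vary by
 * system and hwloc version.
 */
#include <stdio.h>
#include <hwloc.h>

static void show_backend(hwloc_topology_t topology)
{
  hwloc_obj_t root = hwloc_get_root_obj(topology);
  const char *v = hwloc_obj_get_info_by_name(root, "Backend");
  printf("Backend = %s\n", v ? v : "(not set)");
}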
/* user to have to play with the cgroup hierarchy to modify it */ extern int task_cgroup_cpuset_set_task_affinity(slurmd_job_t *job) { int fstatus = SLURM_ERROR;
#ifndef HAVE_HWLOC
error("task/cgroup: plugin not compiled with hwloc support, " "skipping affinity."); return fstatus;
#else
uint32_t i; uint32_t nldoms; uint32_t nsockets; uint32_t ncores; uint32_t npus; uint32_t nobj; uint32_t pfirst,plast; uint32_t taskid = job->envtp->localid; uint32_t jntasks = job->node_tasks; uint32_t jnpus = jntasks * job->cpus_per_task; pid_t pid = job->envtp->task_pid; cpu_bind_type_t bind_type; int verbose = 0; hwloc_topology_t topology;
#if HWLOC_API_VERSION <= 0x00010000
hwloc_cpuset_t cpuset,ct;
#else
hwloc_bitmap_t cpuset,ct;
#endif
hwloc_obj_t obj; struct hwloc_obj *pobj; hwloc_obj_type_t hwtype; hwloc_obj_type_t req_hwtype; int hwdepth; size_t tssize; cpu_set_t ts; bind_type = job->cpu_bind_type ; if (conf->task_plugin_param & CPU_BIND_VERBOSE || bind_type & CPU_BIND_VERBOSE) verbose = 1 ; if (bind_type & CPU_BIND_NONE) { if (verbose) info("task/cgroup: task[%u] is requesting no affinity", taskid); return 0; } else if (bind_type & CPU_BIND_TO_THREADS) { if (verbose) info("task/cgroup: task[%u] is requesting " "thread level binding",taskid); req_hwtype = HWLOC_OBJ_PU; } else if (bind_type & CPU_BIND_TO_CORES) { if (verbose) info("task/cgroup: task[%u] is requesting " "core level binding",taskid); req_hwtype = HWLOC_OBJ_CORE; } else if (bind_type & CPU_BIND_TO_SOCKETS) { if (verbose) info("task/cgroup: task[%u] is requesting " "socket level binding",taskid); req_hwtype = HWLOC_OBJ_SOCKET; } else if (bind_type & CPU_BIND_TO_LDOMS) { if (verbose) info("task/cgroup: task[%u] is requesting " "ldom level binding",taskid); req_hwtype = HWLOC_OBJ_NODE; } else { if (verbose) info("task/cgroup: task[%u] using core level binding" " by default",taskid); req_hwtype = HWLOC_OBJ_CORE; } /* Allocate and initialize hwloc objects */ hwloc_topology_init(&topology);
#if HWLOC_API_VERSION <= 0x00010000
cpuset = hwloc_cpuset_alloc() ;
#else
cpuset = hwloc_bitmap_alloc() ;
#endif
/* * Perform the topology detection. It will only get allowed PUs. * Detect in the same time the granularity to use for binding. * The granularity can be relaxed from threads to cores if enough * cores are available as with hyperthread support, ntasks-per-core * param can let us have access to more threads per core for each * task * Revert back to machine granularity if no finer-grained granularity * matching the request is found. This will result in no affinity * applied. * The detected granularity will be used to find where to best place * the task, then the cpu_bind option will be used to relax the * affinity constraint and use more PUs. (i.e. use a core granularity * to dispatch the tasks across the sockets and then provide access * to each task to the cores of its socket.) 
*/ hwloc_topology_load(topology); npus = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE); nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_SOCKET); nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE); hwtype = HWLOC_OBJ_MACHINE; nobj = 1; if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) { hwtype = HWLOC_OBJ_PU; nobj = npus; } if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) { hwtype = HWLOC_OBJ_CORE; nobj = ncores; } if (nsockets >= jntasks && bind_type & CPU_BIND_TO_SOCKETS) { hwtype = HWLOC_OBJ_SOCKET; nobj = nsockets; } /* * HWLOC returns all the NUMA nodes available regardless of the * number of underlying sockets available (regardless of the allowed * resources). So there is no guarantee that each ldom will be populated * with usable sockets. So add a simple check that at least ensure that * we have as many sockets as ldoms before moving to ldoms granularity */ if (nldoms >= jntasks && nsockets >= nldoms && bind_type & CPU_BIND_TO_LDOMS) { hwtype = HWLOC_OBJ_NODE; nobj = nldoms; } /* * Perform a block binding on the detected object respecting the * granularity. * If not enough objects to do the job, revert to no affinity mode */ if (hwloc_compare_types(hwtype,HWLOC_OBJ_MACHINE) == 0) { info("task/cgroup: task[%u] disabling affinity because of %s " "granularity",taskid,hwloc_obj_type_string(hwtype)); } else if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0 && jnpus > nobj) { info("task/cgroup: task[%u] not enough %s objects, disabling " "affinity",taskid,hwloc_obj_type_string(hwtype)); } else { if (verbose) { info("task/cgroup: task[%u] using %s granularity", taskid,hwloc_obj_type_string(hwtype)); } if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) { /* cores or threads granularity */ pfirst = taskid * job->cpus_per_task ; plast = pfirst + job->cpus_per_task - 1; } else { /* sockets or ldoms granularity */ pfirst = taskid; plast = pfirst; } hwdepth = hwloc_get_type_depth(topology,hwtype); for (i = pfirst; i <= plast && i < nobj ; i++) { obj = hwloc_get_obj_by_depth(topology,hwdepth,(int)i); /* if requested binding overlap the granularity */ /* use the ancestor cpuset instead of the object one */ if (hwloc_compare_types(hwtype,req_hwtype) > 0) { /* Get the parent object of req_hwtype or the */ /* one just above if not found (meaning of >0)*/ /* (useful for ldoms binding with !NUMA nodes)*/ pobj = obj->parent; while (pobj != NULL && hwloc_compare_types(pobj->type, req_hwtype) > 0) pobj = pobj->parent; if (pobj != NULL) { if (verbose) info("task/cgroup: task[%u] " "higher level %s found", taskid, hwloc_obj_type_string( pobj->type));
#if HWLOC_API_VERSION <= 0x00010000
ct = hwloc_cpuset_dup(pobj->allowed_cpuset); hwloc_cpuset_or(cpuset,cpuset,ct); hwloc_cpuset_free(ct);
#else
ct = hwloc_bitmap_dup(pobj->allowed_cpuset); hwloc_bitmap_or(cpuset,cpuset,ct); hwloc_bitmap_free(ct);
#endif
} else { /* should not be executed */ if (verbose) info("task/cgroup: task[%u] " "no higher level found", taskid);
#if HWLOC_API_VERSION <= 0x00010000
ct = hwloc_cpuset_dup(obj->allowed_cpuset); hwloc_cpuset_or(cpuset,cpuset,ct); hwloc_cpuset_free(ct);
#else
ct = hwloc_bitmap_dup(obj->allowed_cpuset); hwloc_bitmap_or(cpuset,cpuset,ct); hwloc_bitmap_free(ct);
#endif
} } else {
#if HWLOC_API_VERSION <= 0x00010000
ct = hwloc_cpuset_dup(obj->allowed_cpuset); hwloc_cpuset_or(cpuset,cpuset,ct); hwloc_cpuset_free(ct);
#else
ct = hwloc_bitmap_dup(obj->allowed_cpuset); 
hwloc_bitmap_or(cpuset,cpuset,ct); hwloc_bitmap_free(ct);
#endif
} } char *str;
#if HWLOC_API_VERSION <= 0x00010000
hwloc_cpuset_asprintf(&str,cpuset);
#else
hwloc_bitmap_asprintf(&str,cpuset);
#endif
tssize = sizeof(cpu_set_t); if (hwloc_cpuset_to_glibc_sched_affinity(topology,cpuset, &ts,tssize) == 0) { fstatus = SLURM_SUCCESS; if (sched_setaffinity(pid,tssize,&ts)) { error("task/cgroup: task[%u] unable to set " "taskset '%s'",taskid,str); fstatus = SLURM_ERROR; } else if (verbose) { info("task/cgroup: task[%u] taskset '%s' is set" ,taskid,str); } } else { error("task/cgroup: task[%u] unable to build " "taskset '%s'",taskid,str); fstatus = SLURM_ERROR; } free(str); } /* Destroy hwloc objects */
#if HWLOC_API_VERSION <= 0x00010000
hwloc_cpuset_free(cpuset);
#else
hwloc_bitmap_free(cpuset);
#endif
hwloc_topology_destroy(topology); return fstatus;
#endif
}
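/*
 * The HWLOC_API_VERSION pattern used throughout the function above,
 * reduced to its core: hwloc 1.1 renamed the hwloc_cpuset_* bitmap calls
 * to hwloc_bitmap_*. The local alias names below are hypothetical.
 */
#include <hwloc.h>

#if HWLOC_API_VERSION <= 0x00010000
typedef hwloc_cpuset_t compat_bitmap_t;		/* hypothetical alias */
# define compat_bitmap_alloc hwloc_cpuset_alloc
# define compat_bitmap_free  hwloc_cpuset_free
#else
typedef hwloc_bitmap_t compat_bitmap_t;
# define compat_bitmap_alloc hwloc_bitmap_alloc
# define compat_bitmap_free  hwloc_bitmap_free
#endif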
static void look_rset(int sdl, hwloc_obj_type_t type, struct hwloc_topology *topology, int level) { rsethandle_t rset, rad; int i,maxcpus,j; int nbnodes; struct hwloc_obj *obj; if ((topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM)) rset = rs_alloc(RS_ALL); else rset = rs_alloc(RS_PARTITION); rad = rs_alloc(RS_EMPTY); nbnodes = rs_numrads(rset, sdl, 0); if (nbnodes == -1) { perror("rs_numrads"); return; } for (i = 0; i < nbnodes; i++) { hwloc_bitmap_t cpuset; unsigned os_index = (unsigned) -1; /* no os_index except for PU and NUMANODE below */ if (rs_getrad(rset, rad, sdl, i, 0)) { fprintf(stderr,"rs_getrad(%d) failed: %s\n", i, strerror(errno)); continue; } if (!rs_getinfo(rad, R_NUMPROCS, 0)) continue; maxcpus = rs_getinfo(rad, R_MAXPROCS, 0); cpuset = hwloc_bitmap_alloc(); for (j = 0; j < maxcpus; j++) { if (rs_op(RS_TESTRESOURCE, rad, NULL, R_PROCS, j)) hwloc_bitmap_set(cpuset, j); } if (type == HWLOC_OBJ_PU) { os_index = hwloc_bitmap_first(cpuset); hwloc_debug("Found PU #%u inside node %d for sdl %d\n", os_index, i, sdl); assert(hwloc_bitmap_weight(cpuset) == 1); } else if (type == HWLOC_OBJ_NUMANODE) { /* NUMA node os_index isn't used for binding, just use the rad number to get unique values. * Note that we'll use that fact in hwloc_aix_prepare_membind(). */ os_index = i; hwloc_debug("Using os_index #%u for NUMA node inside node %d for sdl %d\n", os_index, i, sdl); } obj = hwloc_alloc_setup_object(type, os_index); obj->cpuset = cpuset; obj->os_level = sdl; switch(type) { case HWLOC_OBJ_NUMANODE: obj->nodeset = hwloc_bitmap_alloc(); hwloc_bitmap_set(obj->nodeset, i); obj->memory.local_memory = 0; /* TODO: odd, rs_getinfo(rad, R_MEMSIZE, 0) << 10 returns the total memory ... */ obj->memory.page_types_len = 2; obj->memory.page_types = malloc(2*sizeof(*obj->memory.page_types)); memset(obj->memory.page_types, 0, 2*sizeof(*obj->memory.page_types)); obj->memory.page_types[0].size = hwloc_getpagesize();
#ifdef HAVE__SC_LARGE_PAGESIZE
obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
#endif
/* TODO: obj->memory.page_types[1].count = rs_getinfo(rset, R_LGPGFREE, 0) / hugepagesize */ break; case HWLOC_OBJ_CACHE: obj->attr->cache.size = _system_configuration.L2_cache_size; obj->attr->cache.associativity = _system_configuration.L2_cache_asc; obj->attr->cache.linesize = 0; /* unknown by default */ if (__power_pc()) if (__power_4() || __power_5() || __power_6() || __power_7()) obj->attr->cache.linesize = 128; obj->attr->cache.depth = 2; obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED; /* OK for power[4567], unknown for others */ break; case HWLOC_OBJ_GROUP: obj->attr->group.depth = level; break; case HWLOC_OBJ_CORE: { hwloc_obj_t obj2, obj3; obj2 = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, i); obj2->cpuset = hwloc_bitmap_dup(obj->cpuset); obj2->attr->cache.size = _system_configuration.dcache_size; obj2->attr->cache.associativity = _system_configuration.dcache_asc; obj2->attr->cache.linesize = _system_configuration.dcache_line; obj2->attr->cache.depth = 1; if (_system_configuration.cache_attrib & (1<<30)) { /* Unified cache */ obj2->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED; hwloc_debug("Adding an L1u cache for core %d\n", i); hwloc_insert_object_by_cpuset(topology, obj2); } else { /* Separate Instruction and Data caches */ obj2->attr->cache.type = HWLOC_OBJ_CACHE_DATA; hwloc_debug("Adding an L1d cache for core %d\n", i); hwloc_insert_object_by_cpuset(topology, obj2); obj3 = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, i); obj3->cpuset = hwloc_bitmap_dup(obj->cpuset); 
obj3->attr->cache.size = _system_configuration.icache_size; obj3->attr->cache.associativity = _system_configuration.icache_asc; obj3->attr->cache.linesize = _system_configuration.icache_line; obj3->attr->cache.depth = 1; obj3->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION; hwloc_debug("Adding an L1i cache for core %d\n", i); hwloc_insert_object_by_cpuset(topology, obj3); } break; } default: break; } hwloc_debug_2args_bitmap("%s %d has cpuset %s\n", hwloc_obj_type_string(type), i, obj->cpuset); hwloc_insert_object_by_cpuset(topology, obj); } rs_free(rset); rs_free(rad); }
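/*
 * The rset-to-cpuset translation above reduces to the generic hwloc bitmap
 * pattern below: set one bit per discovered resource, then query the
 * result with hwloc_bitmap_weight() / hwloc_bitmap_first().
 */
#include <hwloc.h>

static hwloc_bitmap_t build_cpuset(const int *os_indexes, int n)
{
  int j;
  hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
  for (j = 0; j < n; j++)
    hwloc_bitmap_set(cpuset, os_indexes[j]);
  return cpuset;  /* weight == number of distinct indexes set */
}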
int orte_ess_base_proc_binding(void) { hwloc_obj_t node, obj; hwloc_cpuset_t cpus, nodeset; hwloc_obj_type_t target; unsigned int cache_level = 0; struct hwloc_topology_support *support; char *map; int ret; char *error=NULL; hwloc_cpuset_t mycpus; /* Determine if we were pre-bound or not */ if (NULL != getenv(OPAL_MCA_PREFIX"orte_bound_at_launch")) { orte_proc_is_bound = true; if (NULL != (map = getenv(OPAL_MCA_PREFIX"orte_base_applied_binding"))) { orte_proc_applied_binding = hwloc_bitmap_alloc(); if (0 != (ret = hwloc_bitmap_list_sscanf(orte_proc_applied_binding, map))) { error = "applied_binding parse"; goto error; } } } /* see if we were bound when launched */ if (!orte_proc_is_bound) { OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Not bound at launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* we were not bound at launch */ if (NULL == opal_hwloc_topology) { /* there is nothing we can do, so just return */ return ORTE_SUCCESS; } support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology); /* get our node object */ node = hwloc_get_root_obj(opal_hwloc_topology); nodeset = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, node); /* get our bindings */ cpus = hwloc_bitmap_alloc(); if (hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS) < 0) { /* we are NOT bound if get_cpubind fails, nor can we be bound - the * environment does not support it */ hwloc_bitmap_free(cpus); OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Binding not supported", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto MOVEON; } /* we are bound if the two cpusets are not equal, * or if there is only ONE cpu available to us */ if (0 != hwloc_bitmap_compare(cpus, nodeset) || opal_hwloc_base_single_cpu(nodeset) || opal_hwloc_base_single_cpu(cpus)) { /* someone external set it - indicate it is set * so that we know */ orte_proc_is_bound = true; hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus); hwloc_bitmap_free(cpus); OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process was externally bound", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else if (support->cpubind->set_thisproc_cpubind && OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) && OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { /* the system is capable of doing processor affinity, but it * has not yet been set - see if a slot_list was given */ hwloc_bitmap_zero(cpus); if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { if (OPAL_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list, opal_hwloc_topology, OPAL_HWLOC_LOGICAL, cpus))) { error = "Setting processor affinity failed"; hwloc_bitmap_free(cpus); goto error; } if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) { error = "Setting processor affinity failed"; hwloc_bitmap_free(cpus); goto error; } hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus); hwloc_bitmap_free(cpus); orte_proc_is_bound = true; OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process bound according to slot_list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else { /* cleanup */ hwloc_bitmap_free(cpus); /* get the node rank */ if (ORTE_NODE_RANK_INVALID == orte_process_info.my_node_rank) { /* this is not an error - could be due to being * direct launched - so just ignore and leave * us unbound */ OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process not bound - no node rank available", 
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto MOVEON; } /* if the binding policy is hwthread, then we bind to the nrank-th * hwthread on this node */ if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU, 0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) { ret = ORTE_ERR_NOT_FOUND; error = "Getting hwthread object"; goto error; } cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj); if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) { ret = ORTE_ERROR; error = "Setting processor affinity failed"; goto error; } hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus); hwloc_bitmap_free(cpus); OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process bound to hwthread", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { /* if the binding policy is core, then we bind to the nrank-th * core on this node */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE, 0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) { ret = ORTE_ERR_NOT_FOUND; error = "Getting core object"; goto error; } cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj); if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) { error = "Setting processor affinity failed"; ret = ORTE_ERROR; goto error; } hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus); OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process bound to core", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else { /* for all higher binding policies, we bind to the specified * object that the nrank-th core belongs to */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE, 0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) { ret = ORTE_ERR_NOT_FOUND; error = "Getting core object"; goto error; } if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_CACHE; cache_level = 1; } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_CACHE; cache_level = 2; } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_CACHE; cache_level = 3; } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_SOCKET; } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_NODE; } else { ret = ORTE_ERR_NOT_FOUND; error = "Binding policy not known"; goto error; } for (obj = obj->parent; NULL != obj; obj = obj->parent) { if (target == obj->type) { if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) { continue; } /* this is the place! 
*/ cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj); if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) { ret = ORTE_ERROR; error = "Setting processor affinity failed"; goto error; } hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus); orte_proc_is_bound = true; OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process bound to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hwloc_obj_type_string(target))); break; } } if (!orte_proc_is_bound) { ret = ORTE_ERROR; error = "Setting processor affinity failed"; goto error; } } } } } else { OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process bound at launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } MOVEON: /* get or update our local cpuset - it will get used multiple * times, so it's more efficient to keep a global copy */ opal_hwloc_base_get_local_cpuset(); /* get the cpus we are bound to */ mycpus = hwloc_bitmap_alloc(); if (hwloc_get_cpubind(opal_hwloc_topology, mycpus, HWLOC_CPUBIND_PROCESS) < 0) { if (NULL != orte_process_info.cpuset) { free(orte_process_info.cpuset); orte_process_info.cpuset = NULL; } if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) { opal_output(0, "MCW rank %d is not bound", ORTE_PROC_MY_NAME->vpid); } } else { /* store/update the string representation of our local binding */ if (NULL != orte_process_info.cpuset) { free(orte_process_info.cpuset); orte_process_info.cpuset = NULL; } hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, mycpus); /* report the binding, if requested */ if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) { char tmp1[1024], tmp2[1024]; if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1), opal_hwloc_topology, mycpus)) { opal_output(0, "MCW rank %d is not bound (or bound to all available processors)", ORTE_PROC_MY_NAME->vpid); } else { opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), opal_hwloc_topology, mycpus); opal_output(0, "MCW rank %d bound to %s: %s", ORTE_PROC_MY_NAME->vpid, tmp1, tmp2); } } } hwloc_bitmap_free(mycpus); /* push our cpuset so others can calculate our locality */ if (NULL != orte_process_info.cpuset) { OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_CPUSET, orte_process_info.cpuset, OPAL_STRING); } return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret) { orte_show_help("help-orte-runtime", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ORTE_ERR_SILENT; }
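/* Standalone sketch of the binding query performed in the MOVEON block above:
 * ask hwloc where the current process is bound and print it as a cpu list.
 * A hypothetical helper, not part of ORTE. */
#include <hwloc.h>
#include <stdio.h>
#include <stdlib.h>

static void print_my_binding(hwloc_topology_t topology)
{
  hwloc_bitmap_t set = hwloc_bitmap_alloc();
  char *str = NULL;
  if (hwloc_get_cpubind(topology, set, HWLOC_CPUBIND_PROCESS) < 0) {
    /* binding is unsupported or unavailable, as handled above */
    printf("process binding not supported/available\n");
  } else {
    hwloc_bitmap_list_asprintf(&str, set);
    printf("bound to cpus: %s\n", str);
    free(str);
  }
  hwloc_bitmap_free(set);
}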
int main(int argc, char *argv[]) { netloc_map_t map; netloc_map_server_t srcserver, dstserver; hwloc_topology_t srctopo, dsttopo; hwloc_obj_t srcobj, dstobj; netloc_map_paths_t paths; unsigned nr_paths, nr_edges, i, j; struct netloc_map_edge_s *edges; unsigned flags = 0x3; char *path; int err; if (argc > 2) { if (!strcmp(argv[1], "--flags")) { flags = (unsigned) strtoul(argv[2], NULL, 0); argc -= 2; argv += 2; } } if (argc < 6) { fprintf(stderr, "%s [options] <datadir> <srcserver> <srcpu> <dstserver> <dstpu>\n", argv[0]); fprintf(stderr, "Example: %s mynetlocdata server2 1 server3 7\n", argv[0]); fprintf(stderr, " Loads netloc map from 'mynetlocdata' directory and displays all paths\n"); fprintf(stderr, " from server 'server2' PU #1 to server 'server3' PU #7.\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " --flags N Use value N as map paths flags. Default is 3 which displays all edges.\n"); exit(EXIT_FAILURE); } err = netloc_map_create(&map); if (err) { fprintf(stderr, "Failed to create the map\n"); exit(EXIT_FAILURE); } asprintf(&path, "%s/hwloc", argv[1]); err = netloc_map_load_hwloc_data(map, path); free(path); if (err) { fprintf(stderr, "Failed to load hwloc data\n"); return -1; } asprintf(&path, "file://%s/netloc", argv[1]); err = netloc_map_load_netloc_data(map, path); free(path); if (err) { fprintf(stderr, "Failed to load netloc data\n"); return -1; } err = netloc_map_build(map, 0); if (err) { fprintf(stderr, "Failed to build map data\n"); return -1; } err = netloc_map_name2server(map, argv[2], &srcserver); if (err) { fprintf(stderr, "Could not find src server %s\n", argv[2]); return -1; } err = netloc_map_server2hwloc(srcserver, &srctopo); if (err) { fprintf(stderr, "Could not find src server %s hwloc topology\n", argv[2]); return -1; } srcobj = hwloc_get_obj_by_type(srctopo, HWLOC_OBJ_PU, atoi(argv[3])); if (!srcobj) { fprintf(stderr, "Could not find src server %s PU #%s\n", argv[2], argv[3]); return -1; } err = netloc_map_name2server(map, argv[4], &dstserver); if (err) { fprintf(stderr, "Could not find dst server %s\n", argv[4]); return -1; } err = netloc_map_server2hwloc(dstserver, &dsttopo); if (err) { fprintf(stderr, "Could not find dst server %s hwloc topology\n", argv[4]); return -1; } dstobj = hwloc_get_obj_by_type(dsttopo, HWLOC_OBJ_PU, atoi(argv[5])); if (!dstobj) { fprintf(stderr, "Could not find dst server %s PU #%s\n", argv[4], argv[5]); return -1; } err = netloc_map_paths_build(map, srctopo, srcobj, dsttopo, dstobj, flags, &paths, &nr_paths); if (err < 0) { fprintf(stderr, "Failed to build paths\n"); return -1; } printf("got %u paths\n", nr_paths); for(i=0; i<nr_paths; i++) { printf(" path #%u:\n", i); err = netloc_map_paths_get(paths, i, &edges, &nr_edges); assert(!err); printf(" %u edges\n", nr_edges); for(j=0; j<nr_edges; j++) { struct netloc_map_edge_s *edge = &edges[j]; printf(" edge #%u type %d: ", j, edge->type); switch (edge->type) { case NETLOC_MAP_EDGE_TYPE_NETLOC: printf("netloc from %s to %s in subnet type %s id %s\n", edge->netloc.edge->src_node_id, edge->netloc.edge->dest_node_id, netloc_decode_network_type(netloc_access_network_ref(edge->netloc.topology)->network_type), netloc_access_network_ref(edge->netloc.topology)->subnet_id); break; case NETLOC_MAP_EDGE_TYPE_HWLOC_PARENT: printf("hwloc UP from %s:%u (%s) to parent %s:%u (%s) weight %u\n", hwloc_obj_type_string(edge->hwloc.src_obj->type), edge->hwloc.src_obj->logical_index, edge->hwloc.src_obj->name ?
: "<unnamed>", hwloc_obj_type_string(edge->hwloc.dest_obj->type), edge->hwloc.dest_obj->logical_index, edge->hwloc.dest_obj->name ? : "<unnamed>", edge->hwloc.weight); break; case NETLOC_MAP_EDGE_TYPE_HWLOC_HORIZONTAL: printf("hwloc HORIZONTAL from %s:%u (%s) to cousin %s:%u (%s) weight %u\n", hwloc_obj_type_string(edge->hwloc.src_obj->type), edge->hwloc.src_obj->logical_index, edge->hwloc.src_obj->name ? : "<unnamed>", hwloc_obj_type_string(edge->hwloc.dest_obj->type), edge->hwloc.dest_obj->logical_index, edge->hwloc.dest_obj->name ? : "<unnamed>", edge->hwloc.weight); break; case NETLOC_MAP_EDGE_TYPE_HWLOC_CHILD: printf("hwloc DOWN from %s:%u (%s) to child %s:%u (%s) weight %u\n", hwloc_obj_type_string(edge->hwloc.src_obj->type), edge->hwloc.src_obj->logical_index, edge->hwloc.src_obj->name ? : "<unnamed>", hwloc_obj_type_string(edge->hwloc.dest_obj->type), edge->hwloc.dest_obj->logical_index, edge->hwloc.dest_obj->name ? : "<unnamed>", edge->hwloc.weight); break; case NETLOC_MAP_EDGE_TYPE_HWLOC_PCI: printf("hwloc PCI from %s:%u (%s) to child %s:%u (%s) weight %u\n", hwloc_obj_type_string(edge->hwloc.src_obj->type), edge->hwloc.src_obj->logical_index, edge->hwloc.src_obj->name ? : "<unnamed>", hwloc_obj_type_string(edge->hwloc.dest_obj->type), edge->hwloc.dest_obj->logical_index, edge->hwloc.dest_obj->name ? : "<unnamed>", edge->hwloc.weight); break; } } } netloc_map_paths_destroy(paths); netloc_map_put_hwloc(map, srctopo); netloc_map_put_hwloc(map, dsttopo); netloc_map_destroy(map); return 0; }
void output_console(struct lstopo_output *loutput, const char *filename) { hwloc_topology_t topology = loutput->topology; unsigned topodepth; int verbose_mode = loutput->verbose_mode; int logical = loutput->logical; FILE *output; output = open_output(filename, loutput->overwrite); if (!output) { fprintf(stderr, "Failed to open %s for writing (%s)\n", filename, strerror(errno)); return; } topodepth = hwloc_topology_get_depth(topology); /* * if verbose_mode == 0, only print the summary. * if verbose_mode == 1, only print the topology tree. * if verbose_mode > 1, print both. */ if (lstopo_show_only != (hwloc_obj_type_t)-1) { if (verbose_mode > 1) fprintf(output, "Only showing %s objects\n", hwloc_obj_type_string(lstopo_show_only)); output_only (topology, hwloc_get_root_obj(topology), output, logical, verbose_mode); } else if (verbose_mode >= 1) { output_topology (topology, hwloc_get_root_obj(topology), NULL, output, 0, logical, verbose_mode); fprintf(output, "\n"); } if ((verbose_mode > 1 || !verbose_mode) && lstopo_show_only == (hwloc_obj_type_t)-1) { hwloc_lstopo_show_summary(output, topology); } if (verbose_mode > 1 && lstopo_show_only == (hwloc_obj_type_t)-1) { const struct hwloc_distances_s * distances; unsigned depth; for (depth = 0; depth < topodepth; depth++) { distances = hwloc_get_whole_distance_matrix_by_depth(topology, depth); if (!distances || !distances->latency) continue; fprintf(output, "relative latency matrix between %ss (depth %u) by %s indexes:\n", hwloc_obj_type_string(hwloc_get_depth_type(topology, depth)), depth, logical ? "logical" : "physical"); hwloc_utils_print_distance_matrix(output, topology, hwloc_get_root_obj(topology), distances->nbobjs, depth, distances->latency, logical); } } if (verbose_mode > 1 && lstopo_show_only == (hwloc_obj_type_t)-1) { hwloc_const_bitmap_t complete = hwloc_topology_get_complete_cpuset(topology); hwloc_const_bitmap_t topo = hwloc_topology_get_topology_cpuset(topology); hwloc_const_bitmap_t allowed = hwloc_topology_get_allowed_cpuset(topology); if (!hwloc_bitmap_isequal(topo, complete)) { hwloc_bitmap_t unknown = hwloc_bitmap_alloc(); char *unknownstr; hwloc_bitmap_copy(unknown, complete); hwloc_bitmap_andnot(unknown, unknown, topo); hwloc_bitmap_asprintf(&unknownstr, unknown); fprintf (output, "%d processors not represented in topology: %s\n", hwloc_bitmap_weight(unknown), unknownstr); free(unknownstr); hwloc_bitmap_free(unknown); } if (!hwloc_bitmap_isequal(topo, allowed)) { hwloc_bitmap_t disallowed = hwloc_bitmap_alloc(); char *disallowedstr; hwloc_bitmap_copy(disallowed, topo); hwloc_bitmap_andnot(disallowed, disallowed, allowed); hwloc_bitmap_asprintf(&disallowedstr, disallowed); fprintf(output, "%d processors represented but not allowed: %s\n", hwloc_bitmap_weight(disallowed), disallowedstr); free(disallowedstr); hwloc_bitmap_free(disallowed); } if (!hwloc_topology_is_thissystem(topology)) fprintf (output, "Topology not from this system\n"); } if (output != stdout) fclose(output); }
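/* Minimal sketch of consuming the same hwloc 1.x distances API used above:
 * fetch the whole matrix at a given depth and read one relative latency.
 * The function name is illustrative only. */
#include <hwloc.h>
#include <stdio.h>

static void show_latency(hwloc_topology_t topology, unsigned depth,
                         unsigned i, unsigned j)
{
  const struct hwloc_distances_s *d =
    hwloc_get_whole_distance_matrix_by_depth(topology, depth);
  if (!d || !d->latency) {
    printf("no latency matrix at depth %u\n", depth);
    return;
  }
  /* latency is a row-major nbobjs*nbobjs array of relative latencies */
  if (i < d->nbobjs && j < d->nbobjs)
    printf("relative latency [%u][%u] = %f\n", i, j,
           d->latency[i * d->nbobjs + j]);
}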
static int bind_upwards(orte_job_t *jdata, hwloc_obj_type_t target, unsigned cache_level) { /* traverse the hwloc topology tree on each node upwards * until we find an object of type target - and then bind * the process to that target */ int i, j; orte_job_map_t *map; orte_node_t *node; orte_proc_t *proc; hwloc_obj_t obj; hwloc_cpuset_t cpus; unsigned int idx, ncpus; struct hwloc_topology_support *support; opal_hwloc_obj_data_t *data; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bind upwards for job %s with bindings %s", ORTE_JOBID_PRINT(jdata->jobid), opal_hwloc_base_print_binding(jdata->map->binding)); /* initialize */ map = jdata->map; for (i=0; i < map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { continue; } if (!orte_do_not_launch) { /* if we don't want to launch, then we are just testing the system, * so ignore questions about support capabilities */ support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology); /* check if topology supports cpubind - have to be careful here * as Linux doesn't currently support thread-level binding. This * may change in the future, though, and it isn't clear how hwloc * interprets the current behavior. So check both flags to be sure. */ if (!support->cpubind->set_thisproc_cpubind && !support->cpubind->set_thisthread_cpubind) { if (!OPAL_BINDING_REQUIRED(opal_hwloc_binding_policy)) { /* we are not required to bind, so ignore this */ continue; } orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name); return ORTE_ERR_SILENT; } /* check if topology supports membind - have to be careful here * as hwloc treats this differently than I (at least) would have * expected. Per hwloc, Linux memory binding is at the thread, * and not process, level. 
Thus, hwloc sets the "thisproc" flag * to "false" on all Linux systems, and uses the "thisthread" flag * to indicate binding capability */ if (!support->membind->set_thisproc_membind && !support->membind->set_thisthread_membind) { if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name); membind_warned = true; } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name); return ORTE_ERR_SILENT; } } } /* clear the topology of any prior usage numbers */ opal_hwloc_base_clear_usage(node->topology); /* cycle thru the procs */ for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } /* ignore procs from other jobs */ if (proc->name.jobid != jdata->jobid) { continue; } /* ignore procs that have already been bound - should * never happen, but safer */ if (NULL != proc->cpu_bitmap) { continue; } /* bozo check */ if (NULL == proc->locale) { opal_output(0, "BIND UPWARDS: LOCALE FOR PROC %s IS NULL", ORTE_NAME_PRINT(&proc->name)); return ORTE_ERR_SILENT; } /* starting at the locale, move up thru the parents * to find the target object type */ for (obj = proc->locale->parent; NULL != obj; obj = obj->parent) { opal_output(0, "%s bind:upward target %s type %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hwloc_obj_type_string(target), hwloc_obj_type_string(obj->type)); if (target == obj->type) { if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) { continue; } /* get its index */ if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj, OPAL_HWLOC_AVAILABLE))) { return ORTE_ERR_SILENT; } /* track the number bound */ data = (opal_hwloc_obj_data_t*)obj->userdata; data->num_bound++; /* get the number of cpus under this location */ if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name); return ORTE_ERR_SILENT; } /* error out if adding a proc would cause overload and that wasn't allowed */ if (ncpus < data->num_bound && !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, opal_hwloc_base_print_binding(map->binding), node->name, data->num_bound, ncpus); return ORTE_ERR_SILENT; } /* bind it here */ cpus = opal_hwloc_base_get_available_cpus(node->topology, obj); hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s BOUND PROC %s TO %s[%s:%u] on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), proc->cpu_bitmap, hwloc_obj_type_string(target), idx, node->name); break; } } if (NULL == proc->cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) { /* didn't find anyone to bind to - this is an error * unless the user specified if-supported */ orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found", true, opal_hwloc_base_print_binding(map->binding), node->name); return ORTE_ERR_SILENT; } } } return ORTE_SUCCESS; }
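/* The upward walk above can also lean on hwloc's ancestor helper; here is a
 * sketch (hypothetical helper name). Caches still need the manual loop,
 * since several cache levels share the HWLOC_OBJ_CACHE type in hwloc 1.x. */
#include <hwloc.h>

static hwloc_obj_t find_target_above(hwloc_topology_t topology,
                                     hwloc_obj_t start,
                                     hwloc_obj_type_t target,
                                     unsigned cache_level)
{
  hwloc_obj_t obj;
  if (target != HWLOC_OBJ_CACHE)
    return hwloc_get_ancestor_obj_by_type(topology, target, start);
  /* walk up until a cache of the requested level is found */
  for (obj = start->parent; obj; obj = obj->parent)
    if (obj->type == HWLOC_OBJ_CACHE && obj->attr->cache.depth == cache_level)
      return obj;
  return NULL;
}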
static void hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *data, unsigned curleveldepth, int verbose) { struct hwloc_synthetic_level_data_s *curlevel = &data->level[curleveldepth]; unsigned long total = curlevel->totalwidth; const char *attr = curlevel->index_string; unsigned long length = curlevel->index_string_length; unsigned *array = NULL; struct hwloc_synthetic_intlv_loop_s * loops = NULL; size_t i; if (!attr) return; array = calloc(total, sizeof(*array)); if (!array) { if (verbose) fprintf(stderr, "Failed to allocate synthetic index array of size %lu\n", total); goto out; } i = strspn(attr, "0123456789,"); if (i == length) { /* explicit array of indexes */ for(i=0; i<total; i++) { const char *next; unsigned idx = strtoul(attr, (char **) &next, 10); if (next == attr) { if (verbose) fprintf(stderr, "Failed to read synthetic index #%lu at '%s'\n", (unsigned long) i, attr); goto out_with_array; } array[i] = idx; if (i != total-1) { if (*next != ',') { if (verbose) fprintf(stderr, "Missing comma after synthetic index #%lu at '%s'\n", (unsigned long) i, attr); goto out_with_array; } attr = next+1; } else { attr = next; } } curlevel->index_array = array; } else { /* interleaving */ unsigned nr_loops = 1, cur_loop; unsigned minstep = total; unsigned long nbs = 1; unsigned j, mul; const char *tmp; tmp = attr; while (tmp) { tmp = strchr(tmp, ':'); if (!tmp || tmp >= attr+length) break; nr_loops++; tmp++; } /* nr_loops colon-separated fields, but we may need one more at the end */ loops = malloc((nr_loops+1)*sizeof(*loops)); if (!loops) { if (verbose) fprintf(stderr, "Failed to allocate synthetic index interleave loop array of size %u\n", nr_loops); goto out_with_array; } if (*attr >= '0' && *attr <= '9') { /* interleaving as x*y:z*t:... */ unsigned step, nb; tmp = attr; cur_loop = 0; while (tmp) { char *tmp2, *tmp3; step = (unsigned) strtol(tmp, &tmp2, 0); if (tmp2 == tmp || *tmp2 != '*') { if (verbose) fprintf(stderr, "Failed to read synthetic index interleaving loop '%s' without number before '*'\n", tmp); goto out_with_loops; } if (!step) { if (verbose) fprintf(stderr, "Invalid interleaving loop with step 0 at '%s'\n", tmp); goto out_with_loops; } tmp2++; nb = (unsigned) strtol(tmp2, &tmp3, 0); if (tmp3 == tmp2 || (*tmp3 && *tmp3 != ':' && *tmp3 != ')' && *tmp3 != ' ')) { if (verbose) fprintf(stderr, "Failed to read synthetic index interleaving loop '%s' without number between '*' and ':'\n", tmp); goto out_with_loops; } if (!nb) { if (verbose) fprintf(stderr, "Invalid interleaving loop with number 0 at '%s'\n", tmp2); goto out_with_loops; } loops[cur_loop].step = step; loops[cur_loop].nb = nb; if (step < minstep) minstep = step; nbs *= nb; cur_loop++; if (*tmp3 == ')' || *tmp3 == ' ') break; tmp = (const char*) (tmp3+1); } } else { /* interleaving as type1:type2:... 
*/ hwloc_obj_type_t type; hwloc_obj_cache_type_t cachetypeattr; int depthattr; int err; /* find level depths for each interleaving loop */ tmp = attr; cur_loop = 0; while (tmp) { err = hwloc_obj_type_sscanf(tmp, &type, &depthattr, &cachetypeattr, sizeof(cachetypeattr)); if (err < 0) { if (verbose) fprintf(stderr, "Failed to read synthetic index interleaving loop type '%s'\n", tmp); goto out_with_loops; } if (type == HWLOC_OBJ_MISC || type == HWLOC_OBJ_BRIDGE || type == HWLOC_OBJ_PCI_DEVICE || type == HWLOC_OBJ_OS_DEVICE) { if (verbose) fprintf(stderr, "Misc object type disallowed in synthetic index interleaving loop type '%s'\n", tmp); goto out_with_loops; } for(i=0; i<curleveldepth; i++) { if (type != data->level[i].type) continue; if ((type == HWLOC_OBJ_GROUP || type == HWLOC_OBJ_CACHE) && depthattr != -1 && (unsigned) depthattr != data->level[i].depth) continue; if (type == HWLOC_OBJ_CACHE && cachetypeattr != (hwloc_obj_cache_type_t) -1 && cachetypeattr != data->level[i].cachetype) continue; loops[cur_loop].level_depth = (unsigned)i; break; } if (i == curleveldepth) { if (verbose) fprintf(stderr, "Failed to find level for synthetic index interleaving loop type '%s' above '%s'\n", tmp, hwloc_obj_type_string(curlevel->type)); goto out_with_loops; } tmp = strchr(tmp, ':'); if (!tmp || tmp >= attr+length) break; tmp++; cur_loop++; } /* compute actual loop step/nb */ for(cur_loop=0; cur_loop<nr_loops; cur_loop++) { unsigned mydepth = loops[cur_loop].level_depth; unsigned prevdepth = 0; unsigned step, nb; for(i=0; i<nr_loops; i++) { if (loops[i].level_depth == mydepth && i != cur_loop) { if (verbose) fprintf(stderr, "Invalid duplicate interleaving loop type in synthetic index '%s'\n", attr); goto out_with_loops; } if (loops[i].level_depth < mydepth && loops[i].level_depth > prevdepth) prevdepth = loops[i].level_depth; } step = curlevel->totalwidth / data->level[mydepth].totalwidth; /* number of objects below us */ nb = data->level[mydepth].totalwidth / data->level[prevdepth].totalwidth; /* number of us within parent */ loops[cur_loop].step = step; loops[cur_loop].nb = nb; assert(nb); assert(step); if (step < minstep) minstep = step; nbs *= nb; } } assert(nbs); if (nbs != total) { /* one loop of total/nbs steps is missing, add it if it's just the smallest one */ if (minstep == total/nbs) { loops[nr_loops].step = 1; loops[nr_loops].nb = total/nbs; nr_loops++; } else { if (verbose) fprintf(stderr, "Invalid index interleaving total width %lu instead of %lu\n", nbs, total); goto out_with_loops; } } /* generate the array of indexes */ mul = 1; for(i=0; i<nr_loops; i++) { unsigned step = loops[i].step; unsigned nb = loops[i].nb; for(j=0; j<total; j++) array[j] += ((j / step) % nb) * mul; mul *= nb; } /* check that we have the right values (cannot pass total, cannot give duplicate 0) */ for(j=0; j<total; j++) { if (array[j] >= total) { if (verbose) fprintf(stderr, "Invalid index interleaving generates out-of-range index %u\n", array[j]); goto out_with_loops; } if (!array[j] && j) { if (verbose) fprintf(stderr, "Invalid index interleaving generates duplicate index values\n"); goto out_with_loops; } } free(loops); curlevel->index_array = array; } return; out_with_loops: free(loops); out_with_array: free(array); out: return; }
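/* Sketch of feeding the parser above: a synthetic description whose PU level
 * carries an explicit "indexes=" attribute (the string parsed by
 * hwloc_synthetic_process_level_indexes). Assumes the hwloc >= 1.10
 * synthetic attribute syntax; the helper name is illustrative. */
#include <hwloc.h>

static int load_interleaved(hwloc_topology_t *topologyp)
{
  hwloc_topology_init(topologyp);
  /* 4 PUs whose OS indexes are listed explicitly, out of order */
  if (hwloc_topology_set_synthetic(*topologyp,
                                   "core:2 pu:2(indexes=0,2,1,3)") < 0)
    return -1;
  return hwloc_topology_load(*topologyp);
}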
static int bind_upwards(orte_job_t *jdata, orte_node_t *node, hwloc_obj_type_t target, unsigned cache_level) { /* traverse the hwloc topology tree on each node upwards * until we find an object of type target - and then bind * the process to that target */ int j; orte_job_map_t *map; orte_proc_t *proc; hwloc_obj_t obj; hwloc_cpuset_t cpus; unsigned int idx, ncpus; opal_hwloc_obj_data_t *data; hwloc_obj_t locale; char *cpu_bitmap; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bind upwards for job %s with bindings %s", ORTE_JOBID_PRINT(jdata->jobid), opal_hwloc_base_print_binding(jdata->map->binding)); /* initialize */ map = jdata->map; /* cycle thru the procs */ for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } /* ignore procs from other jobs */ if (proc->name.jobid != jdata->jobid) { continue; } /* bozo check */ if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name)); return ORTE_ERR_SILENT; } /* starting at the locale, move up thru the parents * to find the target object type */ cpu_bitmap = NULL; for (obj = locale->parent; NULL != obj; obj = obj->parent) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s bind:upward target %s type %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hwloc_obj_type_string(target), hwloc_obj_type_string(obj->type)); if (target == obj->type) { if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) { continue; } /* get its index */ if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_SILENT; } /* track the number bound */ data = (opal_hwloc_obj_data_t*)obj->userdata; data->num_bound++; /* get the number of cpus under this location */ if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name); return ORTE_ERR_SILENT; } /* error out if adding a proc would cause overload and that wasn't allowed, * and it wasn't a default binding policy (i.e., the user requested it) */ if (ncpus < data->num_bound && !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) { if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { /* if the user specified a binding policy, then we cannot meet * it since overload isn't allowed, so error out - have the * message indicate that setting overload allowed will remove * this restriction */ orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, opal_hwloc_base_print_binding(map->binding), node->name, data->num_bound, ncpus); return ORTE_ERR_SILENT; } else { /* if we have the default binding policy, then just don't bind */ OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE); unbind_procs(jdata); return ORTE_SUCCESS; } } /* bind it here */ cpus = opal_hwloc_base_get_available_cpus(node->topology, obj); hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus); orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING); /* record the location */ orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, obj, OPAL_PTR); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s BOUND PROC %s TO %s[%s:%u] on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), cpu_bitmap, 
hwloc_obj_type_string(target), idx, node->name); break; } } if (NULL == cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) { /* didn't find anyone to bind to - this is an error * unless the user specified if-supported */ orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found", true, opal_hwloc_base_print_binding(map->binding), node->name); return ORTE_ERR_SILENT; } if (NULL != cpu_bitmap) { free(cpu_bitmap); } } return ORTE_SUCCESS; }
static int bind_upwards(orte_job_t *jdata, orte_node_t *node, hwloc_obj_type_t target, unsigned cache_level) { /* traverse the hwloc topology tree on each node upwards * until we find an object of type target - and then bind * the process to that target */ int j; orte_job_map_t *map; orte_proc_t *proc; hwloc_obj_t obj; hwloc_cpuset_t cpus; unsigned int idx, ncpus; opal_hwloc_obj_data_t *data; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bind upwards for job %s with bindings %s", ORTE_JOBID_PRINT(jdata->jobid), opal_hwloc_base_print_binding(jdata->map->binding)); /* initialize */ map = jdata->map; /* cycle thru the procs */ for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } /* ignore procs from other jobs */ if (proc->name.jobid != jdata->jobid) { continue; } /* ignore procs that have already been bound - should * never happen, but safer */ if (NULL != proc->cpu_bitmap) { continue; } /* bozo check */ if (NULL == proc->locale) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "BIND UPWARDS: LOCALE FOR PROC %s IS NULL", ORTE_NAME_PRINT(&proc->name)); return ORTE_ERR_SILENT; } /* starting at the locale, move up thru the parents * to find the target object type */ for (obj = proc->locale->parent; NULL != obj; obj = obj->parent) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s bind:upward target %s type %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hwloc_obj_type_string(target), hwloc_obj_type_string(obj->type)); if (target == obj->type) { if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) { continue; } /* get its index */ if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, obj, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_SILENT; } /* track the number bound */ data = (opal_hwloc_obj_data_t*)obj->userdata; data->num_bound++; /* get the number of cpus under this location */ if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, obj))) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name); return ORTE_ERR_SILENT; } /* error out if adding a proc would cause overload and that wasn't allowed, * and it wasn't a default binding policy (i.e., the user requested it) */ if (ncpus < data->num_bound && !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding) && OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, opal_hwloc_base_print_binding(map->binding), node->name, data->num_bound, ncpus); return ORTE_ERR_SILENT; } /* bind it here */ cpus = opal_hwloc_base_get_available_cpus(node->topology, obj); hwloc_bitmap_list_asprintf(&proc->cpu_bitmap, cpus); /* record the location */ proc->bind_location = obj; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s BOUND PROC %s TO %s[%s:%u] on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), proc->cpu_bitmap, hwloc_obj_type_string(target), idx, node->name); break; } } if (NULL == proc->cpu_bitmap && OPAL_BINDING_REQUIRED(jdata->map->binding)) { /* didn't find anyone to bind to - this is an error * unless the user specified if-supported */ orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-target-not-found", true, opal_hwloc_base_print_binding(map->binding), node->name); return ORTE_ERR_SILENT; } } return ORTE_SUCCESS; }
static int bind_in_place(orte_job_t *jdata, hwloc_obj_type_t target, unsigned cache_level) { /* traverse the hwloc topology tree on each node downwards * until we find an unused object of type target - and then bind * the process to that target */ int i, j; orte_job_map_t *map; orte_node_t *node; orte_proc_t *proc; hwloc_cpuset_t cpus; unsigned int idx, ncpus; struct hwloc_topology_support *support; opal_hwloc_obj_data_t *data; hwloc_obj_t locale, sib; char *cpu_bitmap; bool found; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bind in place for job %s with bindings %s", ORTE_JOBID_PRINT(jdata->jobid), opal_hwloc_base_print_binding(jdata->map->binding)); /* initialize */ map = jdata->map; for (i=0; i < map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) { continue; } if (!orte_do_not_launch) { /* if we don't want to launch, then we are just testing the system, * so ignore questions about support capabilities */ support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology); /* check if topology supports cpubind - have to be careful here * as Linux doesn't currently support thread-level binding. This * may change in the future, though, and it isn't clear how hwloc * interprets the current behavior. So check both flags to be sure. */ if (!support->cpubind->set_thisproc_cpubind && !support->cpubind->set_thisthread_cpubind) { if (!OPAL_BINDING_REQUIRED(map->binding) || !OPAL_BINDING_POLICY_IS_SET(map->binding)) { /* we are not required to bind, so ignore this */ continue; } orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name); return ORTE_ERR_SILENT; } /* check if topology supports membind - have to be careful here * as hwloc treats this differently than I (at least) would have * expected. Per hwloc, Linux memory binding is at the thread, * and not process, level. Thus, hwloc sets the "thisproc" flag * to "false" on all Linux systems, and uses the "thisthread" flag * to indicate binding capability - don't warn if the user didn't * specifically request binding */ if (!support->membind->set_thisproc_membind && !support->membind->set_thisthread_membind && OPAL_BINDING_POLICY_IS_SET(map->binding)) { if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name); membind_warned = true; } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name); return ORTE_ERR_SILENT; } } } /* some systems do not report cores, and so we can get a situation where our * default binding policy will fail for no necessary reason. 
So if we are * computing a binding due to our default policy, and no cores are found * on this node, just silently skip it - we will not bind */ if (!OPAL_BINDING_POLICY_IS_SET(map->binding) && HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "Unable to bind-to core by default on node %s as no cores detected", node->name); continue; } /* we share topologies in order * to save space, so we need to reset the usage info to reflect * our own current state */ reset_usage(node, jdata->jobid); /* cycle thru the procs */ for (j=0; j < node->procs->size; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) { continue; } /* ignore procs from other jobs */ if (proc->name.jobid != jdata->jobid) { continue; } /* bozo check */ if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-locale", true, ORTE_NAME_PRINT(&proc->name)); return ORTE_ERR_SILENT; } /* get the index of this location */ if (UINT_MAX == (idx = opal_hwloc_base_get_obj_idx(node->topology, locale, OPAL_HWLOC_AVAILABLE))) { ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_SILENT; } data = (opal_hwloc_obj_data_t*)locale->userdata; /* get the number of cpus under this location */ if (0 == (ncpus = opal_hwloc_base_get_npus(node->topology, locale))) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-available-cpus", true, node->name); return ORTE_ERR_SILENT; } /* if we don't have enough cpus to support this additional proc, try * shifting the location to a cousin that can support it - the important * thing is that we maintain the same level in the topology */ if (ncpus < (data->num_bound+1)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s bind_in_place: searching right", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); sib = locale; found = false; while (NULL != (sib = sib->next_cousin)) { data = (opal_hwloc_obj_data_t*)sib->userdata; ncpus = opal_hwloc_base_get_npus(node->topology, sib); if (data->num_bound < ncpus) { found = true; locale = sib; break; } } if (!found) { /* try the other direction */ opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s bind_in_place: searching left", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); sib = locale; while (NULL != (sib = sib->prev_cousin)) { data = (opal_hwloc_obj_data_t*)sib->userdata; ncpus = opal_hwloc_base_get_npus(node->topology, sib); if (data->num_bound < ncpus) { found = true; locale = sib; break; } } } if (!found) { /* no place to put this - see if overload is allowed */ if (!OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)) { if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { /* if the user specified a binding policy, then we cannot meet * it since overload isn't allowed, so error out - have the * message indicate that setting overload allowed will remove * this restriction */ orte_show_help("help-orte-rmaps-base.txt", "rmaps:binding-overload", true, opal_hwloc_base_print_binding(map->binding), node->name, data->num_bound, ncpus); return ORTE_ERR_SILENT; } else { /* if we have the default binding policy, then just don't bind */ OPAL_SET_BINDING_POLICY(map->binding, OPAL_BIND_TO_NONE); unbind_procs(jdata); return ORTE_SUCCESS; } } } } /* track the number bound */ data = (opal_hwloc_obj_data_t*)locale->userdata; // just in case it changed data->num_bound++; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "BINDING 
PROC %s TO %s NUMBER %u", ORTE_NAME_PRINT(&proc->name), hwloc_obj_type_string(locale->type), idx); /* bind the proc here */ cpus = opal_hwloc_base_get_available_cpus(node->topology, locale); hwloc_bitmap_list_asprintf(&cpu_bitmap, cpus); orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING); /* update the location, in case it changed */ orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND, ORTE_ATTR_LOCAL, locale, OPAL_PTR); opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s BOUND PROC %s TO %s[%s:%u] on node %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&proc->name), cpu_bitmap, hwloc_obj_type_string(locale->type), idx, node->name); if (NULL != cpu_bitmap) { free(cpu_bitmap); } } } return ORTE_SUCCESS; }
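/* The sibling search in bind_in_place() relies on hwloc's cousin links; as a
 * standalone sketch, this walks one whole level via next_cousin (hypothetical
 * helper; use a single-depth type such as HWLOC_OBJ_CORE). */
#include <hwloc.h>
#include <stdio.h>

static void walk_level(hwloc_topology_t topology, hwloc_obj_type_t type)
{
  hwloc_obj_t obj;
  for (obj = hwloc_get_obj_by_type(topology, type, 0);
       obj != NULL;
       obj = obj->next_cousin)
    printf("%s logical index %u\n",
           hwloc_obj_type_string(obj->type), obj->logical_index);
}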
int orte_rmaps_base_compute_bindings(orte_job_t *jdata) { hwloc_obj_type_t hwb, hwm; unsigned clvl=0, clvm=0; opal_binding_policy_t bind; orte_mapping_policy_t map; orte_node_t *node; int i, rc; struct hwloc_topology_support *support; bool force_down = false; hwloc_cpuset_t totalcpuset; int bind_depth, map_depth; opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: compute bindings for job %s with policy %s", ORTE_JOBID_PRINT(jdata->jobid), opal_hwloc_base_print_binding(jdata->map->binding)); map = ORTE_GET_MAPPING_POLICY(jdata->map->mapping); bind = OPAL_GET_BINDING_POLICY(jdata->map->binding); if (ORTE_MAPPING_BYUSER == map) { /* user specified binding by rankfile - nothing for us to do */ return ORTE_SUCCESS; } if (OPAL_BIND_TO_CPUSET == bind) { int rc; /* cpuset was given - setup the bindings */ if (ORTE_SUCCESS != (rc = bind_to_cpuset(jdata))) { ORTE_ERROR_LOG(rc); } return rc; } if (OPAL_BIND_TO_NONE == bind) { /* no binding requested */ return ORTE_SUCCESS; } if (OPAL_BIND_TO_BOARD == bind) { /* doesn't do anything at this time */ return ORTE_SUCCESS; } /* binding requested - convert the binding level to the hwloc obj type */ switch (bind) { case OPAL_BIND_TO_NUMA: hwb = HWLOC_OBJ_NODE; break; case OPAL_BIND_TO_SOCKET: hwb = HWLOC_OBJ_SOCKET; break; case OPAL_BIND_TO_L3CACHE: hwb = HWLOC_OBJ_CACHE; clvl = 3; break; case OPAL_BIND_TO_L2CACHE: hwb = HWLOC_OBJ_CACHE; clvl = 2; break; case OPAL_BIND_TO_L1CACHE: hwb = HWLOC_OBJ_CACHE; clvl = 1; break; case OPAL_BIND_TO_CORE: hwb = HWLOC_OBJ_CORE; break; case OPAL_BIND_TO_HWTHREAD: hwb = HWLOC_OBJ_PU; break; default: ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } /* do the same for the mapping policy */ switch (map) { case ORTE_MAPPING_BYNODE: case ORTE_MAPPING_BYSLOT: case ORTE_MAPPING_SEQ: hwm = HWLOC_OBJ_MACHINE; break; case ORTE_MAPPING_BYDIST: case ORTE_MAPPING_BYNUMA: hwm = HWLOC_OBJ_NODE; break; case ORTE_MAPPING_BYSOCKET: hwm = HWLOC_OBJ_SOCKET; break; case ORTE_MAPPING_BYL3CACHE: hwm = HWLOC_OBJ_CACHE; clvm = 3; break; case ORTE_MAPPING_BYL2CACHE: hwm = HWLOC_OBJ_CACHE; clvm = 2; break; case ORTE_MAPPING_BYL1CACHE: hwm = HWLOC_OBJ_CACHE; clvm = 1; break; case ORTE_MAPPING_BYCORE: hwm = HWLOC_OBJ_CORE; break; case ORTE_MAPPING_BYHWTHREAD: hwm = HWLOC_OBJ_PU; break; default: ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } /* if the job was mapped by the corresponding target, then * we bind in place * * otherwise, we have to bind either up or down the hwloc * tree. If we are binding upwards (e.g., mapped to hwthread * but binding to core), then we just climb the tree to find * the first matching object. * * if we are binding downwards (e.g., mapped to node and binding * to core), then we have to do a round-robin assignment of * procs to the resources below. 
*/ if (ORTE_MAPPING_BYDIST == map) { int rc = ORTE_SUCCESS; if (OPAL_BIND_TO_NUMA == bind) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - dist to numa", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, HWLOC_OBJ_NODE, 0))) { ORTE_ERROR_LOG(rc); } } else if (OPAL_BIND_TO_NUMA < bind) { /* bind every proc downwards */ force_down = true; goto execute; } /* if the binding policy is less than numa, then we are unbound - so * just ignore this and return (should have been caught in prior * tests anyway as the only options meeting that criterion are "none" * and "board") */ return rc; } /* now deal with the remaining binding policies based on hardware */ if (bind == map) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "mca:rmaps: bindings for job %s - bind in place", ORTE_JOBID_PRINT(jdata->jobid)); if (ORTE_SUCCESS != (rc = bind_in_place(jdata, hwb, clvl))) { ORTE_ERROR_LOG(rc); } return rc; } /* we need to handle the remaining binding options on a per-node * basis because different nodes could potentially have different * topologies, with different relative depths for the two levels */ execute: /* initialize */ totalcpuset = hwloc_bitmap_alloc(); for (i=0; i < jdata->map->nodes->size; i++) { if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) { continue; } if (!orte_do_not_launch) { /* if we don't want to launch, then we are just testing the system, * so ignore questions about support capabilities */ support = (struct hwloc_topology_support*)hwloc_topology_get_support(node->topology); /* check if topology supports cpubind - have to be careful here * as Linux doesn't currently support thread-level binding. This * may change in the future, though, and it isn't clear how hwloc * interprets the current behavior. So check both flags to be sure. */ if (!support->cpubind->set_thisproc_cpubind && !support->cpubind->set_thisthread_cpubind) { if (!OPAL_BINDING_REQUIRED(jdata->map->binding) || !OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { /* we are not required to bind, so ignore this */ continue; } orte_show_help("help-orte-rmaps-base.txt", "rmaps:cpubind-not-supported", true, node->name); hwloc_bitmap_free(totalcpuset); return ORTE_ERR_SILENT; } /* check if topology supports membind - have to be careful here * as hwloc treats this differently than I (at least) would have * expected. Per hwloc, Linux memory binding is at the thread, * and not process, level. Thus, hwloc sets the "thisproc" flag * to "false" on all Linux systems, and uses the "thisthread" flag * to indicate binding capability - don't warn if the user didn't * specifically request binding */ if (!support->membind->set_thisproc_membind && !support->membind->set_thisthread_membind && OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) { if (OPAL_HWLOC_BASE_MBFA_WARN == opal_hwloc_base_mbfa && !membind_warned) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported", true, node->name); membind_warned = true; } else if (OPAL_HWLOC_BASE_MBFA_ERROR == opal_hwloc_base_mbfa) { orte_show_help("help-orte-rmaps-base.txt", "rmaps:membind-not-supported-fatal", true, node->name); hwloc_bitmap_free(totalcpuset); return ORTE_ERR_SILENT; } } } /* some systems do not report cores, and so we can get a situation where our * default binding policy will fail for no necessary reason. 
So if we are * computing a binding due to our default policy, and no cores are found * on this node, just silently skip it - we will not bind */ if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding) && HWLOC_TYPE_DEPTH_UNKNOWN == hwloc_get_type_depth(node->topology, HWLOC_OBJ_CORE)) { opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "Unable to bind-to core by default on node %s as no cores detected", node->name); continue; } /* we share topologies in order * to save space, so we need to reset the usage info to reflect * our own current state */ reset_usage(node, jdata->jobid); if (force_down) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) { ORTE_ERROR_LOG(rc); return rc; } } else { /* determine the relative depth on this node */ if (HWLOC_OBJ_CACHE == hwb) { /* must use a unique function because blasted hwloc * just doesn't deal with caches very well...sigh */ bind_depth = hwloc_get_cache_type_depth(node->topology, clvl, -1); } else { bind_depth = hwloc_get_type_depth(node->topology, hwb); } if (0 > bind_depth) { /* didn't find such an object */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects", true, hwloc_obj_type_string(hwb), node->name); return ORTE_ERR_SILENT; } if (HWLOC_OBJ_CACHE == hwm) { /* must use a unique function because blasted hwloc * just doesn't deal with caches very well...sigh */ map_depth = hwloc_get_cache_type_depth(node->topology, clvm, -1); } else { map_depth = hwloc_get_type_depth(node->topology, hwm); } if (0 > map_depth) { /* didn't find such an object */ orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:no-objects", true, hwloc_obj_type_string(hwm), node->name); return ORTE_ERR_SILENT; } opal_output_verbose(5, orte_rmaps_base_framework.framework_output, "%s bind_depth: %d map_depth %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), bind_depth, map_depth); if (bind_depth > map_depth) { if (ORTE_SUCCESS != (rc = bind_downwards(jdata, node, hwb, clvl))) { ORTE_ERROR_LOG(rc); return rc; } } else { if (ORTE_SUCCESS != (rc = bind_upwards(jdata, node, hwb, clvl))) { ORTE_ERROR_LOG(rc); return rc; } } } } return ORTE_SUCCESS; }
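/* Sketch of the depth comparison driving the bind direction above: larger
 * depth values are further from the root, so a bind target deeper than the
 * mapping target means binding downwards. Hypothetical helper; cache targets
 * would instead need hwloc_get_cache_type_depth() as in the code above. */
#include <hwloc.h>

static int bind_direction(hwloc_topology_t topology,
                          hwloc_obj_type_t map_type,
                          hwloc_obj_type_t bind_type)
{
  int map_depth = hwloc_get_type_depth(topology, map_type);
  int bind_depth = hwloc_get_type_depth(topology, bind_type);
  if (map_depth < 0 || bind_depth < 0)
    return -1; /* type missing, or at multiple depths (e.g. caches) */
  return bind_depth > map_depth ? 1 /* downwards */ : 0 /* upwards/in place */;
}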
void output_console(hwloc_topology_t topology, const char *filename, int logical, int legend __hwloc_attribute_unused, int verbose_mode) { unsigned topodepth; FILE *output; if (!filename || !strcmp(filename, "-")) output = stdout; else { output = open_file(filename, "w"); if (!output) { fprintf(stderr, "Failed to open %s for writing (%s)\n", filename, strerror(errno)); return; } } topodepth = hwloc_topology_get_depth(topology); /* * if verbose_mode == 0, only print the summary. * if verbose_mode == 1, only print the topology tree. * if verbose_mode > 1, print both. */ if (lstopo_show_only != (hwloc_obj_type_t)-1) { if (verbose_mode > 1) fprintf(output, "Only showing %s objects\n", hwloc_obj_type_string(lstopo_show_only)); output_only (topology, hwloc_get_root_obj(topology), output, logical, verbose_mode); } else if (verbose_mode >= 1) { output_topology (topology, hwloc_get_root_obj(topology), NULL, output, 0, logical, verbose_mode); fprintf(output, "\n"); } if ((verbose_mode > 1 || !verbose_mode) && lstopo_show_only == (hwloc_obj_type_t)-1) { hwloc_lstopo_show_summary(output, topology); } if (verbose_mode > 1 && lstopo_show_only == (hwloc_obj_type_t)-1) { const struct hwloc_distances_s * distances; unsigned depth; for (depth = 0; depth < topodepth; depth++) { distances = hwloc_get_whole_distance_matrix_by_depth(topology, depth); if (!distances || !distances->latency) continue; printf("latency matrix between %ss (depth %u) by %s indexes:\n", hwloc_obj_type_string(hwloc_get_depth_type(topology, depth)), depth, logical ? "logical" : "physical"); hwloc_utils_print_distance_matrix(topology, hwloc_get_root_obj(topology), distances->nbobjs, depth, distances->latency, logical); } } if (verbose_mode > 1 && lstopo_show_only == (hwloc_obj_type_t)-1) { hwloc_const_bitmap_t complete = hwloc_topology_get_complete_cpuset(topology); hwloc_const_bitmap_t topo = hwloc_topology_get_topology_cpuset(topology); hwloc_const_bitmap_t online = hwloc_topology_get_online_cpuset(topology); hwloc_const_bitmap_t allowed = hwloc_topology_get_allowed_cpuset(topology); if (complete && !hwloc_bitmap_isequal(topo, complete)) { hwloc_bitmap_t unknown = hwloc_bitmap_alloc(); char *unknownstr; hwloc_bitmap_copy(unknown, complete); hwloc_bitmap_andnot(unknown, unknown, topo); hwloc_bitmap_asprintf(&unknownstr, unknown); fprintf (output, "%d processors not represented in topology: %s\n", hwloc_bitmap_weight(unknown), unknownstr); free(unknownstr); hwloc_bitmap_free(unknown); } if (complete && !hwloc_bitmap_isequal(online, complete)) { hwloc_bitmap_t offline = hwloc_bitmap_alloc(); char *offlinestr; hwloc_bitmap_copy(offline, complete); hwloc_bitmap_andnot(offline, offline, online); hwloc_bitmap_asprintf(&offlinestr, offline); fprintf (output, "%d processors offline: %s\n", hwloc_bitmap_weight(offline), offlinestr); free(offlinestr); hwloc_bitmap_free(offline); } if (complete && !hwloc_bitmap_isequal(allowed, online)) { if (!hwloc_bitmap_isincluded(online, allowed)) { hwloc_bitmap_t forbidden = hwloc_bitmap_alloc(); char *forbiddenstr; hwloc_bitmap_copy(forbidden, online); hwloc_bitmap_andnot(forbidden, forbidden, allowed); hwloc_bitmap_asprintf(&forbiddenstr, forbidden); fprintf(output, "%d processors online but not allowed: %s\n", hwloc_bitmap_weight(forbidden), forbiddenstr); free(forbiddenstr); hwloc_bitmap_free(forbidden); } if (!hwloc_bitmap_isincluded(allowed, online)) { hwloc_bitmap_t potential = hwloc_bitmap_alloc(); char *potentialstr; hwloc_bitmap_copy(potential, allowed); hwloc_bitmap_andnot(potential, potential, 
online); hwloc_bitmap_asprintf(&potentialstr, potential); fprintf(output, "%d processors allowed but not online: %s\n", hwloc_bitmap_weight(potential), potentialstr); free(potentialstr); hwloc_bitmap_free(potential); } } if (!hwloc_topology_is_thissystem(topology)) fprintf (output, "Topology not from this system\n"); } if (output != stdout) fclose(output); }
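/* Minimal sketch of the cpuset bookkeeping used above: processors present in
 * the complete set but absent from the topology set were not represented.
 * Hypothetical helper using only public bitmap calls. */
#include <hwloc.h>
#include <stdio.h>
#include <stdlib.h>

static void print_unknown_cpus(hwloc_topology_t topology)
{
  hwloc_const_bitmap_t complete = hwloc_topology_get_complete_cpuset(topology);
  hwloc_const_bitmap_t topo = hwloc_topology_get_topology_cpuset(topology);
  hwloc_bitmap_t unknown = hwloc_bitmap_alloc();
  char *str;
  hwloc_bitmap_andnot(unknown, complete, topo); /* unknown = complete \ topo */
  hwloc_bitmap_asprintf(&str, unknown);
  printf("%d processors not represented in topology: %s\n",
         hwloc_bitmap_weight(unknown), str);
  free(str);
  hwloc_bitmap_free(unknown);
}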
static void look_rset(int sdl, hwloc_obj_type_t type, struct hwloc_topology *topology, int level) { rsethandle_t rset, rad; int i,maxcpus,j; int nbnodes; struct hwloc_obj *obj; if ((topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM)) rset = rs_alloc(RS_ALL); else rset = rs_alloc(RS_PARTITION); rad = rs_alloc(RS_EMPTY); nbnodes = rs_numrads(rset, sdl, 0); if (nbnodes == -1) { perror("rs_numrads"); return; } for (i = 0; i < nbnodes; i++) { if (rs_getrad(rset, rad, sdl, i, 0)) { fprintf(stderr,"rs_getrad(%d) failed: %s\n", i, strerror(errno)); continue; } if (!rs_getinfo(rad, R_NUMPROCS, 0)) continue; /* It seems logical processors are numbered from 1 here, while the * bindprocessor functions number them from 0... */ obj = hwloc_alloc_setup_object(type, i - (type == HWLOC_OBJ_PU)); obj->cpuset = hwloc_bitmap_alloc(); obj->os_level = sdl; maxcpus = rs_getinfo(rad, R_MAXPROCS, 0); for (j = 0; j < maxcpus; j++) { if (rs_op(RS_TESTRESOURCE, rad, NULL, R_PROCS, j)) hwloc_bitmap_set(obj->cpuset, j); } switch(type) { case HWLOC_OBJ_NODE: obj->nodeset = hwloc_bitmap_alloc(); hwloc_bitmap_set(obj->nodeset, i); obj->memory.local_memory = 0; /* TODO: odd, rs_getinfo(rad, R_MEMSIZE, 0) << 10 returns the total memory ... */ obj->memory.page_types_len = 2; obj->memory.page_types = malloc(2*sizeof(*obj->memory.page_types)); memset(obj->memory.page_types, 0, 2*sizeof(*obj->memory.page_types)); obj->memory.page_types[0].size = hwloc_getpagesize(); #ifdef HAVE__SC_LARGE_PAGESIZE obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE); #endif /* TODO: obj->memory.page_types[1].count = rs_getinfo(rset, R_LGPGFREE, 0) / hugepagesize */ break; case HWLOC_OBJ_CACHE: obj->attr->cache.size = _system_configuration.L2_cache_size; obj->attr->cache.associativity = _system_configuration.L2_cache_asc; obj->attr->cache.linesize = 0; /* TODO: ? */ obj->attr->cache.depth = 2; obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED; /* FIXME? */ break; case HWLOC_OBJ_GROUP: obj->attr->group.depth = level; break; case HWLOC_OBJ_CORE: { hwloc_obj_t obj2, obj3; obj2 = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, i); obj2->cpuset = hwloc_bitmap_dup(obj->cpuset); obj2->attr->cache.size = _system_configuration.dcache_size; obj2->attr->cache.associativity = _system_configuration.dcache_asc; obj2->attr->cache.linesize = _system_configuration.dcache_line; obj2->attr->cache.depth = 1; if (_system_configuration.cache_attrib & (1<<30)) { /* Unified cache */ obj2->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED; hwloc_debug("Adding an L1u cache for core %d\n", i); hwloc_insert_object_by_cpuset(topology, obj2); } else { /* Separate Instruction and Data caches */ obj2->attr->cache.type = HWLOC_OBJ_CACHE_DATA; hwloc_debug("Adding an L1d cache for core %d\n", i); hwloc_insert_object_by_cpuset(topology, obj2); obj3 = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, i); obj3->cpuset = hwloc_bitmap_dup(obj->cpuset); obj3->attr->cache.size = _system_configuration.icache_size; obj3->attr->cache.associativity = _system_configuration.icache_asc; obj3->attr->cache.linesize = _system_configuration.icache_line; obj3->attr->cache.depth = 1; obj3->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION; hwloc_debug("Adding an L1i cache for core %d\n", i); hwloc_insert_object_by_cpuset(topology, obj3); } break; } default: break; } hwloc_debug_2args_bitmap("%s %d has cpuset %s\n", hwloc_obj_type_string(type), i, obj->cpuset); hwloc_insert_object_by_cpuset(topology, obj); } rs_free(rset); rs_free(rad); }
static int hwloc_look_windows(struct hwloc_backend *backend)
{
  struct hwloc_topology *topology = backend->topology;
  hwloc_bitmap_t groups_pu_set = NULL;
  SYSTEM_INFO SystemInfo;
  DWORD length;

  if (topology->levels[0][0]->cpuset)
    /* somebody discovered things */
    return 0;

  hwloc_alloc_obj_cpusets(topology->levels[0][0]);

  GetSystemInfo(&SystemInfo);

  if (!GetLogicalProcessorInformationExProc && GetLogicalProcessorInformationProc) {
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION procInfo, tmpprocInfo;
    unsigned id;
    unsigned i;
    struct hwloc_obj *obj;
    hwloc_obj_type_t type;

    length = 0;
    procInfo = NULL;

    while (1) {
      if (GetLogicalProcessorInformationProc(procInfo, &length))
        break;
      if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
        /* avoid leaking the buffer on hard failure */
        free(procInfo);
        return -1;
      }
      tmpprocInfo = realloc(procInfo, length);
      if (!tmpprocInfo) {
        free(procInfo);
        goto out;
      }
      procInfo = tmpprocInfo;
    }

    assert(!length || procInfo);

    for (i = 0; i < length / sizeof(*procInfo); i++) {
      /* Ignore unknown caches */
      if (procInfo[i].Relationship == RelationCache
          && procInfo[i].Cache.Type != CacheUnified
          && procInfo[i].Cache.Type != CacheData
          && procInfo[i].Cache.Type != CacheInstruction)
        continue;

      id = -1;
      switch (procInfo[i].Relationship) {
        case RelationNumaNode:
          type = HWLOC_OBJ_NUMANODE;
          id = procInfo[i].NumaNode.NodeNumber;
          break;
        case RelationProcessorPackage:
          type = HWLOC_OBJ_PACKAGE;
          break;
        case RelationCache:
          type = HWLOC_OBJ_CACHE;
          break;
        case RelationProcessorCore:
          type = HWLOC_OBJ_CORE;
          break;
        case RelationGroup:
        default:
          type = HWLOC_OBJ_GROUP;
          break;
      }

      obj = hwloc_alloc_setup_object(type, id);
      obj->cpuset = hwloc_bitmap_alloc();
      hwloc_debug("%s#%u mask %lx\n", hwloc_obj_type_string(type), id, procInfo[i].ProcessorMask);
      /* ProcessorMask is a ULONG_PTR */
      hwloc_bitmap_set_ith_ULONG_PTR(obj->cpuset, 0, procInfo[i].ProcessorMask);
      hwloc_debug_2args_bitmap("%s#%u bitmap %s\n", hwloc_obj_type_string(type), id, obj->cpuset);

      switch (type) {
        case HWLOC_OBJ_NUMANODE:
        {
          ULONGLONG avail;
          obj->nodeset = hwloc_bitmap_alloc();
          hwloc_bitmap_set(obj->nodeset, id);
          if ((GetNumaAvailableMemoryNodeExProc && GetNumaAvailableMemoryNodeExProc(id, &avail))
              || (GetNumaAvailableMemoryNodeProc && GetNumaAvailableMemoryNodeProc(id, &avail)))
            obj->memory.local_memory = avail;
          obj->memory.page_types = malloc(2 * sizeof(*obj->memory.page_types));
          memset(obj->memory.page_types, 0, 2 * sizeof(*obj->memory.page_types));
          obj->memory.page_types_len = 1;
          obj->memory.page_types[0].size = SystemInfo.dwPageSize;
#ifdef HAVE__SC_LARGE_PAGESIZE
          obj->memory.page_types_len++;
          obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
#endif
          break;
        }
        case HWLOC_OBJ_CACHE:
          obj->attr->cache.size = procInfo[i].Cache.Size;
          obj->attr->cache.associativity = procInfo[i].Cache.Associativity == CACHE_FULLY_ASSOCIATIVE ? -1 : procInfo[i].Cache.Associativity;
          obj->attr->cache.linesize = procInfo[i].Cache.LineSize;
          obj->attr->cache.depth = procInfo[i].Cache.Level;
          switch (procInfo[i].Cache.Type) {
            case CacheUnified:
              obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
              break;
            case CacheData:
              obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
              break;
            case CacheInstruction:
              obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
              break;
            default:
              hwloc_free_unlinked_object(obj);
              continue;
          }
          break;
        case HWLOC_OBJ_GROUP:
          obj->attr->group.depth = procInfo[i].Relationship == RelationGroup;
          break;
        default:
          break;
      }
      hwloc_insert_object_by_cpuset(topology, obj);
    }
    free(procInfo);
  }

  if (GetLogicalProcessorInformationExProc) {
    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX procInfoTotal, tmpprocInfoTotal, procInfo;
    unsigned id;
    struct hwloc_obj *obj;
    hwloc_obj_type_t type;

    length = 0;
    procInfoTotal = NULL;

    while (1) {
      if (GetLogicalProcessorInformationExProc(RelationAll, procInfoTotal, &length))
        break;
      if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
        /* avoid leaking the buffer on hard failure */
        free(procInfoTotal);
        return -1;
      }
      tmpprocInfoTotal = realloc(procInfoTotal, length);
      if (!tmpprocInfoTotal) {
        free(procInfoTotal);
        goto out;
      }
      procInfoTotal = tmpprocInfoTotal;
    }

    for (procInfo = procInfoTotal;
         (void*) procInfo < (void*) ((uintptr_t) procInfoTotal + length);
         procInfo = (void*) ((uintptr_t) procInfo + procInfo->Size)) {
      unsigned num, i;
      GROUP_AFFINITY *GroupMask;

      /* Ignore unknown caches */
      if (procInfo->Relationship == RelationCache
          && procInfo->Cache.Type != CacheUnified
          && procInfo->Cache.Type != CacheData
          && procInfo->Cache.Type != CacheInstruction)
        continue;

      id = -1;
      switch (procInfo->Relationship) {
        case RelationNumaNode:
          type = HWLOC_OBJ_NUMANODE;
          num = 1;
          GroupMask = &procInfo->NumaNode.GroupMask;
          id = procInfo->NumaNode.NodeNumber;
          break;
        case RelationProcessorPackage:
          type = HWLOC_OBJ_PACKAGE;
          num = procInfo->Processor.GroupCount;
          GroupMask = procInfo->Processor.GroupMask;
          break;
        case RelationCache:
          type = HWLOC_OBJ_CACHE;
          num = 1;
          GroupMask = &procInfo->Cache.GroupMask;
          break;
        case RelationProcessorCore:
          type = HWLOC_OBJ_CORE;
          num = procInfo->Processor.GroupCount;
          GroupMask = procInfo->Processor.GroupMask;
          break;
        case RelationGroup:
          /* So strange an interface... */
          for (id = 0; id < procInfo->Group.ActiveGroupCount; id++) {
            KAFFINITY mask;
            obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, id);
            obj->cpuset = hwloc_bitmap_alloc();
            mask = procInfo->Group.GroupInfo[id].ActiveProcessorMask;
            hwloc_debug("group %u %d cpus mask %lx\n", id, procInfo->Group.GroupInfo[id].ActiveProcessorCount, mask);
            /* KAFFINITY is ULONG_PTR */
            hwloc_bitmap_set_ith_ULONG_PTR(obj->cpuset, id, mask);
            hwloc_debug_2args_bitmap("group %u %d bitmap %s\n", id, procInfo->Group.GroupInfo[id].ActiveProcessorCount, obj->cpuset);
            /* save the set of PUs so that we can create them at the end */
            if (!groups_pu_set)
              groups_pu_set = hwloc_bitmap_alloc();
            hwloc_bitmap_or(groups_pu_set, groups_pu_set, obj->cpuset);
            hwloc_insert_object_by_cpuset(topology, obj);
          }
          continue;
        default:
          /* Don't know how to get the mask. */
          hwloc_debug("unknown relation %d\n", procInfo->Relationship);
          continue;
      }

      obj = hwloc_alloc_setup_object(type, id);
      obj->cpuset = hwloc_bitmap_alloc();
      for (i = 0; i < num; i++) {
        hwloc_debug("%s#%u %d: mask %d:%lx\n", hwloc_obj_type_string(type), id, i, GroupMask[i].Group, GroupMask[i].Mask);
        /* GROUP_AFFINITY.Mask is KAFFINITY, which is ULONG_PTR */
        hwloc_bitmap_set_ith_ULONG_PTR(obj->cpuset, GroupMask[i].Group, GroupMask[i].Mask);
      }
      hwloc_debug_2args_bitmap("%s#%u bitmap %s\n", hwloc_obj_type_string(type), id, obj->cpuset);

      switch (type) {
        case HWLOC_OBJ_NUMANODE:
        {
          ULONGLONG avail;
          obj->nodeset = hwloc_bitmap_alloc();
          hwloc_bitmap_set(obj->nodeset, id);
          if ((GetNumaAvailableMemoryNodeExProc && GetNumaAvailableMemoryNodeExProc(id, &avail))
              || (GetNumaAvailableMemoryNodeProc && GetNumaAvailableMemoryNodeProc(id, &avail)))
            obj->memory.local_memory = avail;
          obj->memory.page_types = malloc(2 * sizeof(*obj->memory.page_types));
          memset(obj->memory.page_types, 0, 2 * sizeof(*obj->memory.page_types));
          obj->memory.page_types_len = 1;
          obj->memory.page_types[0].size = SystemInfo.dwPageSize;
#ifdef HAVE__SC_LARGE_PAGESIZE
          obj->memory.page_types_len++;
          obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
#endif
          break;
        }
        case HWLOC_OBJ_CACHE:
          obj->attr->cache.size = procInfo->Cache.CacheSize;
          obj->attr->cache.associativity = procInfo->Cache.Associativity == CACHE_FULLY_ASSOCIATIVE ? -1 : procInfo->Cache.Associativity;
          obj->attr->cache.linesize = procInfo->Cache.LineSize;
          obj->attr->cache.depth = procInfo->Cache.Level;
          switch (procInfo->Cache.Type) {
            case CacheUnified:
              obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
              break;
            case CacheData:
              obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
              break;
            case CacheInstruction:
              obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
              break;
            default:
              hwloc_free_unlinked_object(obj);
              continue;
          }
          break;
        default:
          break;
      }
      hwloc_insert_object_by_cpuset(topology, obj);
    }
    free(procInfoTotal);
  }

  if (groups_pu_set) {
    /* The system supports multiple Groups.
     * PU indexes may be discontiguous, especially if Groups contain fewer than 64 procs. */
    hwloc_obj_t obj;
    unsigned idx;
    hwloc_bitmap_foreach_begin(idx, groups_pu_set) {
      obj = hwloc_alloc_setup_object(HWLOC_OBJ_PU, idx);
      obj->cpuset = hwloc_bitmap_alloc();
      hwloc_bitmap_only(obj->cpuset, idx);
      hwloc_debug_1arg_bitmap("cpu %u has cpuset %s\n", idx, obj->cpuset);
      hwloc_insert_object_by_cpuset(topology, obj);
    } hwloc_bitmap_foreach_end();
    hwloc_bitmap_free(groups_pu_set);
  } else {