static int _get_ldom_sched_cpuset(hwloc_topology_t topology,
				  hwloc_obj_type_t hwtype,
				  hwloc_obj_type_t req_hwtype,
				  uint32_t ldom, cpu_set_t *mask)
{
	hwloc_obj_t obj;
	hwloc_bitmap_t cpuset;
	int hwdepth;

	cpuset = hwloc_bitmap_alloc();
	hwdepth = hwloc_get_type_depth(topology, hwtype);
	obj = hwloc_get_obj_by_depth(topology, hwdepth, ldom);
	_add_hwloc_cpuset(hwtype, req_hwtype, obj, 0, 0, cpuset);
	hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset,
					     mask, sizeof(cpu_set_t));
	hwloc_bitmap_free(cpuset);
	return true;
}
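/*
 * Hedged usage sketch (not part of the original source): one way a caller
 * might use _get_ldom_sched_cpuset() above to pin the calling process to
 * locality domain 0. The helper name and its contract come from the code
 * above; the wrapper _bind_self_to_ldom0 and its error handling are
 * illustrative assumptions. Assumes _GNU_SOURCE and <sched.h> for
 * sched_setaffinity().
 */
static int _bind_self_to_ldom0(hwloc_topology_t topology)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	/* Build a glibc cpu_set_t covering ldom 0 at NUMA-node granularity */
	if (!_get_ldom_sched_cpuset(topology, HWLOC_OBJ_NODE, HWLOC_OBJ_NODE,
				    0, &mask))
		return -1;
	/* pid 0 means "the calling process" */
	return sched_setaffinity(0, sizeof(mask), &mask);
}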
int main(void)
{
	hwloc_topology_t topology;
#ifdef HWLOC_HAVE_CPU_SET
	unsigned depth;
	hwloc_bitmap_t hwlocset;
	cpu_set_t schedset;
	hwloc_obj_t obj;
	int err;
#endif /* HWLOC_HAVE_CPU_SET */

	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);

#ifdef HWLOC_HAVE_CPU_SET
	depth = hwloc_topology_get_depth(topology);

	hwlocset = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset(topology));
	hwloc_cpuset_to_glibc_sched_affinity(topology, hwlocset,
					     &schedset, sizeof(schedset));
#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
	err = sched_setaffinity(0, sizeof(schedset));
#else
	err = sched_setaffinity(0, sizeof(schedset), &schedset);
#endif
	assert(!err);
	hwloc_bitmap_free(hwlocset);

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
	err = sched_getaffinity(0, sizeof(schedset));
#else
	err = sched_getaffinity(0, sizeof(schedset), &schedset);
#endif
	assert(!err);
	hwlocset = hwloc_bitmap_alloc();
	hwloc_cpuset_from_glibc_sched_affinity(topology, hwlocset,
					       &schedset, sizeof(schedset));
	/* every PU we got back must be known to the topology ... */
	assert(hwloc_bitmap_isincluded(hwlocset,
				       hwloc_topology_get_complete_cpuset(topology)));
	/* ... and be either online or allowed */
	hwloc_bitmap_andnot(hwlocset, hwlocset,
			    hwloc_topology_get_online_cpuset(topology));
	hwloc_bitmap_andnot(hwlocset, hwlocset,
			    hwloc_topology_get_allowed_cpuset(topology));
	assert(hwloc_bitmap_iszero(hwlocset));
	hwloc_bitmap_free(hwlocset);

	/* grab the last PU (deepest level, last object) and bind to it */
	obj = hwloc_get_obj_by_depth(topology, depth - 1,
				     hwloc_get_nbobjs_by_depth(topology,
							       depth - 1) - 1);
	assert(obj);
	assert(obj->type == HWLOC_OBJ_PU);

	hwlocset = hwloc_bitmap_dup(obj->cpuset);
	hwloc_cpuset_to_glibc_sched_affinity(topology, hwlocset,
					     &schedset, sizeof(schedset));
#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
	err = sched_setaffinity(0, sizeof(schedset));
#else
	err = sched_setaffinity(0, sizeof(schedset), &schedset);
#endif
	assert(!err);
	hwloc_bitmap_free(hwlocset);

#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
	err = sched_getaffinity(0, sizeof(schedset));
#else
	err = sched_getaffinity(0, sizeof(schedset), &schedset);
#endif
	assert(!err);
	hwlocset = hwloc_bitmap_alloc();
	hwloc_cpuset_from_glibc_sched_affinity(topology, hwlocset,
					       &schedset, sizeof(schedset));
	assert(hwloc_bitmap_isequal(hwlocset, obj->cpuset));
	hwloc_bitmap_free(hwlocset);
#endif /* HWLOC_HAVE_CPU_SET */

	hwloc_topology_destroy(topology);
	return 0;
}
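/*
 * Minimal standalone sketch (not part of the test above): bind the current
 * process to its first PU using the same hwloc <-> glibc conversion API the
 * test exercises. The function name bind_to_first_pu is illustrative; the
 * hwloc calls are real API.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <hwloc.h>
#include <hwloc/glibc-sched.h>

int bind_to_first_pu(void)
{
	hwloc_topology_t topology;
	hwloc_obj_t pu;
	cpu_set_t schedset;
	int err = -1;

	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);

	/* first processing unit in the topology */
	pu = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0);
	if (pu) {
		hwloc_cpuset_to_glibc_sched_affinity(topology, pu->cpuset,
						     &schedset,
						     sizeof(schedset));
		err = sched_setaffinity(0, sizeof(schedset), &schedset);
	}
	hwloc_topology_destroy(topology);
	return err;
}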
/* user to have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(slurmd_job_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	uint32_t i;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;
	uint32_t pfirst, plast;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus = jntasks * job->cpus_per_task;
	pid_t    pid = job->envtp->task_pid;

	cpu_bind_type_t bind_type;
	int verbose = 0;	/* must be initialized: it is only set below
				 * when verbosity is actually requested */

	hwloc_topology_t topology;
#if HWLOC_API_VERSION <= 0x00010000
	hwloc_cpuset_t cpuset, ct;
#else
	hwloc_bitmap_t cpuset, ct;
#endif
	hwloc_obj_t obj;
	struct hwloc_obj *pobj;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;
	int hwdepth;

	size_t tssize;
	cpu_set_t ts;

	bind_type = job->cpu_bind_type;
	if (conf->task_plugin_param & CPU_BIND_VERBOSE ||
	    bind_type & CPU_BIND_VERBOSE)
		verbose = 1;

	if (bind_type & CPU_BIND_NONE) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding", taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding", taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding", taskid);
		req_hwtype = HWLOC_OBJ_SOCKET;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding", taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default", taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);
#if HWLOC_API_VERSION <= 0x00010000
	cpuset = hwloc_cpuset_alloc();
#else
	cpuset = hwloc_bitmap_alloc();
#endif

	/*
	 * Perform the topology detection. It will only get allowed PUs.
	 * Detect at the same time the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available: with hyperthread support, the ntasks-per-core
	 * param can give each task access to more threads per core.
	 * Revert back to machine granularity if no finer-grained granularity
	 * matching the request is found. This will result in no affinity
	 * being applied.
	 * The detected granularity will be used to find where to best place
	 * the task, then the cpu_bind option will be used to relax the
	 * affinity constraint and use more PUs (i.e. use a core granularity
	 * to dispatch the tasks across the sockets and then give each task
	 * access to the cores of its socket).
	 */
	hwloc_topology_load(topology);
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE);

	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks &&
	    bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = HWLOC_OBJ_SOCKET;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources). So there is no guarantee that each ldom will be
	 * populated with usable sockets. So add a simple check to ensure
	 * that we have at least as many sockets as ldoms before moving to
	 * ldoms granularity.
	 */
	if (nldoms >= jntasks &&
	    nsockets >= nldoms &&
	    bind_type & CPU_BIND_TO_LDOMS) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * Perform a block binding on the detected object, respecting the
	 * granularity.
	 * If there are not enough objects to do the job, revert to no
	 * affinity mode.
	 */
	if (hwloc_compare_types(hwtype, HWLOC_OBJ_MACHINE) == 0) {
		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity", taskid, hwloc_obj_type_string(hwtype));
	} else if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0 &&
		   jnpus > nobj) {
		info("task/cgroup: task[%u] not enough %s objects, disabling "
		     "affinity", taskid, hwloc_obj_type_string(hwtype));
	} else {
		if (verbose) {
			info("task/cgroup: task[%u] using %s granularity",
			     taskid, hwloc_obj_type_string(hwtype));
		}
		if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
			/* cores or threads granularity */
			pfirst = taskid * job->cpus_per_task;
			plast = pfirst + job->cpus_per_task - 1;
		} else {
			/* sockets or ldoms granularity */
			pfirst = taskid;
			plast = pfirst;
		}

		hwdepth = hwloc_get_type_depth(topology, hwtype);
		for (i = pfirst; i <= plast && i < nobj; i++) {
			obj = hwloc_get_obj_by_depth(topology, hwdepth,
						     (int)i);

			/* if the requested binding overlaps the granularity,
			 * use the ancestor cpuset instead of the object one */
			if (hwloc_compare_types(hwtype, req_hwtype) > 0) {

				/* Get the parent object of req_hwtype or the
				 * one just above if not found (meaning of >0)
				 * (useful for ldoms binding with !NUMA nodes) */
				pobj = obj->parent;
				while (pobj != NULL &&
				       hwloc_compare_types(pobj->type,
							   req_hwtype) > 0)
					pobj = pobj->parent;

				if (pobj != NULL) {
					if (verbose)
						info("task/cgroup: task[%u] "
						     "higher level %s found",
						     taskid,
						     hwloc_obj_type_string(
							     pobj->type));
#if HWLOC_API_VERSION <= 0x00010000
					ct = hwloc_cpuset_dup(pobj->
							      allowed_cpuset);
					hwloc_cpuset_or(cpuset, cpuset, ct);
					hwloc_cpuset_free(ct);
#else
					ct = hwloc_bitmap_dup(pobj->
							      allowed_cpuset);
					hwloc_bitmap_or(cpuset, cpuset, ct);
					hwloc_bitmap_free(ct);
#endif
				} else {
					/* should not be executed */
					if (verbose)
						info("task/cgroup: task[%u] "
						     "no higher level found",
						     taskid);
#if HWLOC_API_VERSION <= 0x00010000
					ct = hwloc_cpuset_dup(obj->
							      allowed_cpuset);
					hwloc_cpuset_or(cpuset, cpuset, ct);
					hwloc_cpuset_free(ct);
#else
					ct = hwloc_bitmap_dup(obj->
							      allowed_cpuset);
					hwloc_bitmap_or(cpuset, cpuset, ct);
					hwloc_bitmap_free(ct);
#endif
				}

			} else {
#if HWLOC_API_VERSION <= 0x00010000
				ct = hwloc_cpuset_dup(obj->allowed_cpuset);
				hwloc_cpuset_or(cpuset, cpuset, ct);
				hwloc_cpuset_free(ct);
#else
				ct = hwloc_bitmap_dup(obj->allowed_cpuset);
				hwloc_bitmap_or(cpuset, cpuset, ct);
				hwloc_bitmap_free(ct);
#endif
			}
		}

		char *str;
#if HWLOC_API_VERSION <= 0x00010000
		hwloc_cpuset_asprintf(&str, cpuset);
#else
		hwloc_bitmap_asprintf(&str, cpuset);
#endif
		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset,
							 &ts, tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if (sched_setaffinity(pid, tssize, &ts)) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'", taskid, str);
				fstatus = SLURM_ERROR;
			} else if (verbose) {
				info("task/cgroup: task[%u] taskset '%s' is set",
				     taskid, str);
			}
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'", taskid, str);
			fstatus = SLURM_ERROR;
		}
		free(str);
	}

	/* Destroy hwloc objects */
#if HWLOC_API_VERSION <= 0x00010000
	hwloc_cpuset_free(cpuset);
#else
	hwloc_bitmap_free(cpuset);
#endif
	hwloc_topology_destroy(topology);

	return fstatus;
#endif
}
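/*
 * Illustrative sketch (not in the original source): the block mapping used
 * above at core/thread granularity, factored out with a worked example.
 * With cpus_per_task = 2, task 0 covers object indexes 0-1, task 1 covers
 * 2-3, and so on; any index beyond nobj is simply skipped by the loop
 * above. The helper name _block_range is an assumption.
 */
static void _block_range(uint32_t taskid, uint32_t cpus_per_task,
			 uint32_t *pfirst, uint32_t *plast)
{
	*pfirst = taskid * cpus_per_task;	/* first object index */
	*plast = *pfirst + cpus_per_task - 1;	/* last object index */
}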
/* user to have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(slurmd_job_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	hwloc_obj_type_t socket_or_node;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus = jntasks * job->cpus_per_task;
	pid_t    pid = job->envtp->task_pid;

	cpu_bind_type_t bind_type;
	int bind_verbose = 0;

	hwloc_topology_t topology;
	hwloc_bitmap_t cpuset;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;

	size_t tssize;
	cpu_set_t ts;

	bind_type = job->cpu_bind_type;
	if (conf->task_plugin_param & CPU_BIND_VERBOSE ||
	    bind_type & CPU_BIND_VERBOSE)
		bind_verbose = 1;

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);
	cpuset = hwloc_bitmap_alloc();
	hwloc_topology_load(topology);

	if (hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
	    hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET)) {
		/* One socket contains multiple NUMA nodes
		 * (e.g. the AMD Opteron 6000 series).
		 * In that case, use the NUMA node instead of the socket. */
		socket_or_node = HWLOC_OBJ_NODE;
	} else {
		socket_or_node = HWLOC_OBJ_SOCKET;
	}

	if (bind_type & CPU_BIND_NONE) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding", taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding", taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding", taskid);
		req_hwtype = socket_or_node;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding", taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (bind_verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default", taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/*
	 * Perform the topology detection. It will only get allowed PUs.
	 * Detect at the same time the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available: with hyperthread support, the ntasks-per-core
	 * param can give each task access to more threads per core.
	 * Revert back to machine granularity if no finer-grained granularity
	 * matching the request is found. This will result in no affinity
	 * being applied.
	 * The detected granularity will be used to find where to best place
	 * the task, then the cpu_bind option will be used to relax the
	 * affinity constraint and use more PUs (i.e. use a core granularity
	 * to dispatch the tasks across the sockets and then give each task
	 * access to the cores of its socket).
	 */
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       socket_or_node);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE);

	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks &&
	    bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = socket_or_node;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources). So there is no guarantee that each ldom will be
	 * populated with usable sockets. So add a simple check to ensure
	 * that we have at least as many sockets as ldoms before moving to
	 * ldoms granularity.
	 */
	if (nldoms >= jntasks &&
	    nsockets >= nldoms &&
	    bind_type & CPU_BIND_TO_LDOMS) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * Bind the detected object to the taskid, respecting the
	 * granularity, using the designated or default distribution
	 * method (block or cyclic).
	 * If there are not enough objects to do the job, revert to no
	 * affinity mode.
	 */
	if (hwloc_compare_types(hwtype, HWLOC_OBJ_MACHINE) == 0) {
		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity", taskid, hwloc_obj_type_string(hwtype));
	} else if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0 &&
		   jnpus > nobj) {
		info("task/cgroup: task[%u] not enough %s objects, disabling "
		     "affinity", taskid, hwloc_obj_type_string(hwtype));
	} else {
		char *str;

		if (bind_verbose) {
			info("task/cgroup: task[%u] using %s granularity",
			     taskid, hwloc_obj_type_string(hwtype));
		}

		/* There are two "distributions," controlled by the
		 * -m option of srun and friends. The first is the
		 * distribution of tasks to nodes. The second is the
		 * distribution of allocated cpus to tasks for
		 * binding. This code handles the second
		 * distribution. Here's how the values get set, based
		 * on the value of -m:
		 *
		 * SLURM_DIST_CYCLIC        = srun -m cyclic
		 * SLURM_DIST_BLOCK         = srun -m block
		 * SLURM_DIST_CYCLIC_CYCLIC = srun -m cyclic:cyclic
		 * SLURM_DIST_BLOCK_CYCLIC  = srun -m block:cyclic
		 *
		 * In the first two cases, the user only specified the
		 * first distribution, and the second distribution
		 * defaults to cyclic. In the last two cases, the user
		 * explicitly requested a second distribution of
		 * cyclic. So all four cases correspond to a second
		 * distribution of cyclic, and we want to call
		 * _task_cgroup_cpuset_dist_cyclic.
		 *
		 * If the user explicitly specifies a second
		 * distribution of block, or if
		 * CR_CORE_DEFAULT_DIST_BLOCK is configured and the
		 * user does not explicitly specify a second
		 * distribution of cyclic, the second distribution is
		 * block, and we need to call
		 * _task_cgroup_cpuset_dist_block. In these cases,
		 * task_dist would be set to SLURM_DIST_CYCLIC_BLOCK
		 * or SLURM_DIST_BLOCK_BLOCK.
		 *
		 * You can see the equivalent code for the
		 * task/affinity plugin in
		 * src/plugins/task/affinity/dist_tasks.c, around line 384.
		 */
		switch (job->task_dist) {
		case SLURM_DIST_CYCLIC:
		case SLURM_DIST_BLOCK:
		case SLURM_DIST_CYCLIC_CYCLIC:
		case SLURM_DIST_BLOCK_CYCLIC:
			_task_cgroup_cpuset_dist_cyclic(
				topology, hwtype, req_hwtype,
				job, bind_verbose, cpuset);
			break;
		default:
			_task_cgroup_cpuset_dist_block(
				topology, hwtype, req_hwtype,
				nobj, job, bind_verbose, cpuset);
		}

		hwloc_bitmap_asprintf(&str, cpuset);

		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset,
							 &ts, tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if (sched_setaffinity(pid, tssize, &ts)) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'", taskid, str);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] taskset '%s' is set",
				     taskid, str);
			}
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'", taskid, str);
			fstatus = SLURM_ERROR;
		}
		free(str);
	}

	/* Destroy hwloc objects */
	hwloc_bitmap_free(cpuset);
	hwloc_topology_destroy(topology);

	return fstatus;
#endif
}
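/*
 * Hedged sketch (assumption): _task_cgroup_cpuset_dist_cyclic() is called
 * above but not shown here. The real implementation lives in
 * task_cgroup_cpuset.c and works from the full job layout; this simplified
 * version only illustrates the round-robin idea, where slot j of a task
 * maps to object (taskid + j * ntasks) % nobj. The function and parameter
 * names here are illustrative, not the plugin's actual signature.
 */
static void _dist_cyclic_sketch(hwloc_topology_t topology,
				hwloc_obj_type_t hwtype, uint32_t nobj,
				uint32_t taskid, uint32_t ntasks,
				uint32_t cpus_per_task, hwloc_bitmap_t cpuset)
{
	int depth = hwloc_get_type_depth(topology, hwtype);
	uint32_t j;

	for (j = 0; j < cpus_per_task; j++) {
		hwloc_obj_t obj = hwloc_get_obj_by_depth(
			topology, depth, (taskid + j * ntasks) % nobj);
		if (obj)	/* accumulate this object's allowed PUs */
			hwloc_bitmap_or(cpuset, cpuset, obj->allowed_cpuset);
	}
}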
/* user to have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	char mstr[1 + CPU_SETSIZE / 4];
	cpu_bind_type_t bind_type;
	cpu_set_t ts;
	hwloc_obj_t obj;
	hwloc_obj_type_t socket_or_node;
	hwloc_topology_t topology;
	hwloc_bitmap_t cpuset;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;
	int bind_verbose = 0;
	int rc = SLURM_SUCCESS, match;
	pid_t    pid = job->envtp->task_pid;
	size_t tssize;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus;
	int spec_threads = 0;

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);
	cpuset = hwloc_bitmap_alloc();

	if (job->batch) {
		jnpus = job->cpus;
		job->cpus_per_task = job->cpus;
	} else
		jnpus = jntasks * job->cpus_per_task;

	bind_type = job->cpu_bind_type;
	if ((conf->task_plugin_param & CPU_BIND_VERBOSE) ||
	    (bind_type & CPU_BIND_VERBOSE))
		bind_verbose = 1;

	if (hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
	    hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET)) {
		/* One socket contains multiple NUMA nodes
		 * (e.g. the AMD Opteron 6000 series).
		 * In that case, use the NUMA node instead of the socket. */
		socket_or_node = HWLOC_OBJ_NODE;
	} else {
		socket_or_node = HWLOC_OBJ_SOCKET;
	}

	if (bind_type & CPU_BIND_NONE) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding", taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding", taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding", taskid);
		req_hwtype = socket_or_node;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding", taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else if (bind_type & CPU_BIND_TO_BOARDS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "board level binding", taskid);
		req_hwtype = HWLOC_OBJ_GROUP;
	} else if (bind_type & bind_mode_ldom) {
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (bind_verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default", taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/*
	 * Perform the topology detection. It will only get allowed PUs.
	 * Detect at the same time the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available: with hyperthread support, the ntasks-per-core
	 * param can give each task access to more threads per core.
	 * Revert back to machine granularity if no finer-grained granularity
	 * matching the request is found. This will result in no affinity
	 * being applied.
	 * The detected granularity will be used to find where to best place
	 * the task, then the cpu_bind option will be used to relax the
	 * affinity constraint and use more PUs (i.e. use a core granularity
	 * to dispatch the tasks across the sockets and then give each task
	 * access to the cores of its socket).
	 */
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       socket_or_node);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE);
	//info("PU:%d CORE:%d SOCK:%d LDOM:%d", npus, ncores, nsockets, nldoms);

	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if ((job->job_core_spec != (uint16_t) NO_VAL) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)) {
		spec_threads = job->job_core_spec & (~CORE_SPEC_THREAD);
	}
	if (npus >= (jnpus + spec_threads) ||
	    bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks &&
	    bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = socket_or_node;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources). So there is no guarantee that each ldom will be
	 * populated with usable sockets. So add a simple check to ensure
	 * that we have at least as many sockets as ldoms before moving to
	 * ldoms granularity.
	 */
	if (nldoms >= jntasks &&
	    nsockets >= nldoms &&
	    bind_type & (CPU_BIND_TO_LDOMS | bind_mode_ldom)) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * If there are not enough objects to do the job, revert to no
	 * affinity mode.
	 */
	if (hwloc_compare_types(hwtype, HWLOC_OBJ_MACHINE) == 0) {
		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity", taskid, hwloc_obj_type_string(hwtype));
	} else if ((hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) &&
		   (nobj < jnpus)) {
		info("task/cgroup: task[%u] not enough %s objects (%d < %d), "
		     "disabling affinity",
		     taskid, hwloc_obj_type_string(hwtype), nobj, jnpus);
	} else if (bind_type & bind_mode) {
		/* Explicit binding mode specified by the user.
		 * Bind the taskid in accordance with the specified mode. */
		obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_MACHINE, 0);
		match = hwloc_bitmap_isequal(obj->complete_cpuset,
					     obj->allowed_cpuset);
		if ((job->job_core_spec == (uint16_t) NO_VAL) && !match) {
			info("task/cgroup: entire node must be allocated, "
			     "disabling affinity, task[%u]", taskid);
			fprintf(stderr, "Requested cpu_bind option requires "
				"entire node to be allocated; disabling "
				"affinity\n");
		} else {
			if (bind_verbose) {
				info("task/cgroup: task[%u] is requesting "
				     "explicit binding mode", taskid);
			}
			_get_sched_cpuset(topology, hwtype, req_hwtype, &ts,
					  job);
			tssize = sizeof(cpu_set_t);
			fstatus = SLURM_SUCCESS;
			if (job->job_core_spec != (uint16_t) NO_VAL)
				_validate_mask(taskid, obj, &ts);
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "mask 0x%s", taskid,
				      cpuset_to_str(&ts, mstr));
				error("sched_setaffinity rc = %d", rc);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] mask 0x%s",
				     taskid, cpuset_to_str(&ts, mstr));
			}
			_slurm_chkaffinity(&ts, job, rc);
		}
	} else {
		/* Bind the detected object to the taskid, respecting the
		 * granularity, using the designated or default distribution
		 * method (block or cyclic). */
		char *str;

		if (bind_verbose) {
			info("task/cgroup: task[%u] using %s granularity dist %u",
			     taskid, hwloc_obj_type_string(hwtype),
			     job->task_dist);
		}

		/* See the srun man page for detailed information on the
		 * --distribution option.
		 *
		 * You can see the equivalent code for the
		 * task/affinity plugin in
		 * src/plugins/task/affinity/dist_tasks.c, around line 368.
		 */
		switch (job->task_dist & SLURM_DIST_NODESOCKMASK) {
		case SLURM_DIST_BLOCK_BLOCK:
		case SLURM_DIST_CYCLIC_BLOCK:
		case SLURM_DIST_PLANE:
			/* tasks are distributed in blocks within a plane */
			_task_cgroup_cpuset_dist_block(topology,
						       hwtype, req_hwtype,
						       nobj, job, bind_verbose,
						       cpuset);
			break;
		case SLURM_DIST_ARBITRARY:
		case SLURM_DIST_BLOCK:
		case SLURM_DIST_CYCLIC:
		case SLURM_DIST_UNKNOWN:
			if (slurm_get_select_type_param() &
			    CR_CORE_DEFAULT_DIST_BLOCK) {
				_task_cgroup_cpuset_dist_block(topology,
							       hwtype,
							       req_hwtype,
							       nobj, job,
							       bind_verbose,
							       cpuset);
				break;
			}
			/* We want to fall through here if we aren't
			 * doing a default dist block. */
		default:
			_task_cgroup_cpuset_dist_cyclic(topology,
							hwtype, req_hwtype,
							job, bind_verbose,
							cpuset);
			break;
		}

		hwloc_bitmap_asprintf(&str, cpuset);

		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset,
							 &ts, tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'", taskid, str);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] set taskset '%s'",
				     taskid, str);
			}
			_slurm_chkaffinity(&ts, job, rc);
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'", taskid, str);
			fstatus = SLURM_ERROR;
		}
		free(str);
	}

	/* Destroy hwloc objects */
	hwloc_bitmap_free(cpuset);
	hwloc_topology_destroy(topology);

	return fstatus;
#endif
}
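/*
 * Worked example (illustrative): the CORE_SPEC_THREAD decoding used above.
 * CORE_SPEC_THREAD is a flag bit marking job_core_spec as a count of
 * specialized *threads* rather than cores, with the remaining bits holding
 * the count. The flag value 0x8000 matches slurm.h as far as I know, but
 * treat it as an assumption here.
 */
#include <stdint.h>
#include <stdio.h>

#define CORE_SPEC_THREAD 0x8000	/* assumed flag bit, as in slurm.h */

int main(void)
{
	uint16_t job_core_spec = CORE_SPEC_THREAD | 2;	/* 2 spec threads */
	uint16_t spec_threads = 0;

	/* flag set and count nonzero: extract the thread count */
	if ((job_core_spec & CORE_SPEC_THREAD) &&
	    (job_core_spec != CORE_SPEC_THREAD))
		spec_threads = job_core_spec & (~CORE_SPEC_THREAD);

	printf("specialized threads: %u\n", spec_threads);	/* prints 2 */
	return 0;
}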