int slurm_getaffinity(pid_t pid, size_t size, cpu_set_t *mask)
{
	int rval;
	char mstr[1 + CPU_SETSIZE / 4];

	CPU_ZERO(mask);
#ifdef SCHED_GETAFFINITY_THREE_ARGS
	rval = sched_getaffinity(pid, size, mask);
#else
	rval = sched_getaffinity(pid, mask);
#endif
	if (rval) {
		verbose("sched_getaffinity(%d,%zd,0x%s) failed with status %d",
			pid, size, cpuset_to_str(mask, mstr), rval);
	} else {
		debug3("sched_getaffinity(%d) = 0x%s",
		       pid, cpuset_to_str(mask, mstr));
	}
	return (rval);
}
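/*
 * A note on buffer sizing: the helpers in this section all declare
 * "char mstr[1 + CPU_SETSIZE / 4]" because the mask is rendered in hex,
 * one nibble (4 CPUs) per character, plus a terminating NUL.
 * cpuset_to_str() itself is defined elsewhere in this codebase; the
 * following is only an illustrative sketch of that contract, not the
 * shipped implementation (requires <sched.h> with _GNU_SOURCE):
 */
static char *example_cpuset_to_str(const cpu_set_t *mask, char *str)
{
	int base;
	char *ptr = str;
	char *ret = NULL;

	/* Walk the set four CPUs at a time, highest CPUs first. */
	for (base = CPU_SETSIZE - 4; base >= 0; base -= 4) {
		char val = 0;
		if (CPU_ISSET(base, mask))
			val |= 1;
		if (CPU_ISSET(base + 1, mask))
			val |= 2;
		if (CPU_ISSET(base + 2, mask))
			val |= 4;
		if (CPU_ISSET(base + 3, mask))
			val |= 8;
		/* Remember the first non-zero digit to skip leading zeros. */
		if (!ret && val)
			ret = ptr;
		*ptr++ = val < 10 ? '0' + val : 'a' + val - 10;
	}
	*ptr = '\0';
	/* Always emit at least one digit, even for an empty set. */
	return ret ? ret : ptr - 1;
}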
/*
 * init() is called when the plugin is loaded, before any other functions
 * are called.  Put global initialization here.
 */
extern int init (void)
{
	cpu_set_t cur_mask;
	char mstr[1 + CPU_SETSIZE / 4];

	slurm_getaffinity(0, sizeof(cur_mask), &cur_mask);
	cpuset_to_str(&cur_mask, mstr);
	verbose("%s loaded with CPU mask %s", plugin_name, mstr);
	return SLURM_SUCCESS;
}
int set_affinity(pid_t pid, cpu_set_t *mask)
{
	int ret;
	CPUSET_HEXSTRING(aff_hex);

	if ((ret = sched_setaffinity(pid, sizeof(cpu_set_t), mask)) == -1) {
		decode_error("could not set PID %d to affinity 0x%s",
			     pid, cpuset_to_str(mask, aff_hex));
		return (ret);
	}
	return (0);
}
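/*
 * CPUSET_HEXSTRING() comes from schedtool's headers.  Judging from its
 * uses here and the buffer sizing seen elsewhere in this section, it most
 * likely just declares a hex-string buffer large enough for a full
 * cpu_set_t.  A plausible one-line sketch (an assumption, not the
 * verbatim macro):
 */
#define CPUSET_HEXSTRING(name) char name[1 + CPU_SETSIZE / 4]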
int slurm_setaffinity(pid_t pid, size_t size, const cpu_set_t *mask)
{
	int rval;
	char mstr[1 + CPU_SETSIZE / 4];

#ifdef SCHED_GETAFFINITY_THREE_ARGS
	rval = sched_setaffinity(pid, size, mask);
#else
	rval = sched_setaffinity(pid, mask);
#endif
	if (rval) {
		verbose("sched_setaffinity(%d,%zd,0x%s) failed: %m",
			pid, size, cpuset_to_str(mask, mstr));
	}
	return (rval);
}
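/*
 * SCHED_GETAFFINITY_THREE_ARGS is determined at configure time: very old
 * glibc shipped two-argument sched_{get,set}affinity() wrappers, and the
 * same probe result is reused for both the get and set calls above.  A
 * minimal compile-time probe in the spirit of such an autoconf check
 * might look like this (a sketch, not the project's actual configure
 * test):
 */
#define _GNU_SOURCE
#include <sched.h>

int main(void)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	/* Compiles only against the three-argument prototype. */
	return sched_getaffinity(0, sizeof(mask), &mask);
}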
/* The job has specialized cores, synchronize user mask with available cores */
static void _validate_mask(launch_tasks_request_msg_t *req, char *avail_mask)
{
	char *new_mask = NULL, *save_ptr = NULL, *tok;
	cpu_set_t avail_cpus, task_cpus;
	bool superset = true;

	CPU_ZERO(&avail_cpus);
	(void) str_to_cpuset(&avail_cpus, avail_mask);
	tok = strtok_r(req->cpu_bind, ",", &save_ptr);
	while (tok) {
		int i, overlaps = 0;
		char mask_str[1 + CPU_SETSIZE / 4];

		CPU_ZERO(&task_cpus);
		(void) str_to_cpuset(&task_cpus, tok);
		for (i = 0; i < CPU_SETSIZE; i++) {
			if (!CPU_ISSET(i, &task_cpus))
				continue;
			if (CPU_ISSET(i, &avail_cpus)) {
				overlaps++;
			} else {
				CPU_CLR(i, &task_cpus);
				superset = false;
			}
		}
		if (overlaps == 0) {
			/* The task's CPU mask is completely invalid.
			 * Give it all allowed CPUs. */
			for (i = 0; i < CPU_SETSIZE; i++) {
				if (CPU_ISSET(i, &avail_cpus))
					CPU_SET(i, &task_cpus);
			}
		}
		cpuset_to_str(&task_cpus, mask_str);
		if (new_mask)
			xstrcat(new_mask, ",");
		xstrcat(new_mask, mask_str);
		tok = strtok_r(NULL, ",", &save_ptr);
	}

	if (!superset) {
		info("task/affinity: Ignoring user CPU binding outside of job "
		     "step allocation");
	}

	xfree(req->cpu_bind);
	req->cpu_bind = new_mask;
}
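/*
 * str_to_cpuset() above is the inverse of cpuset_to_str(): it parses the
 * hex mask string back into a cpu_set_t.  Both helpers live elsewhere in
 * this codebase; the following is a sketch of a possible shape, assuming
 * plain hex with an optional "0x" prefix (requires <string.h> and
 * <sched.h> with _GNU_SOURCE):
 */
static int example_str_to_cpuset(cpu_set_t *mask, const char *str)
{
	int len = strlen(str);
	const char *ptr = str + len - 1;
	int base = 0;

	/* Skip an optional "0x" prefix. */
	if (len > 1 && !memcmp(str, "0x", 2))
		str += 2;

	CPU_ZERO(mask);
	/* Consume hex digits from the least significant end. */
	while (ptr >= str) {
		char val = *ptr;

		if (base >= CPU_SETSIZE)
			return -1;	/* string describes too many CPUs */
		if (val >= '0' && val <= '9')
			val -= '0';
		else if (val >= 'a' && val <= 'f')
			val -= 'a' - 10;
		else if (val >= 'A' && val <= 'F')
			val -= 'A' - 10;
		else
			return -1;	/* not a hex digit */
		if (val & 1)
			CPU_SET(base, mask);
		if (val & 2)
			CPU_SET(base + 1, mask);
		if (val & 4)
			CPU_SET(base + 2, mask);
		if (val & 8)
			CPU_SET(base + 3, mask);
		base += 4;
		ptr--;
	}
	return 0;
}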
void slurm_chkaffinity(cpu_set_t *mask, slurmd_job_t *job, int statval)
{
	char *bind_type, *action, *status, *units;
	char mstr[1 + CPU_SETSIZE / 4];
	int task_gid = job->envtp->procid;
	int task_lid = job->envtp->localid;
	pid_t mypid = job->envtp->task_pid;

	if (!(job->cpu_bind_type & CPU_BIND_VERBOSE))
		return;

	if (statval)
		status = " FAILED";
	else
		status = "";

	if (job->cpu_bind_type & CPU_BIND_NONE) {
		action = "";
		units  = "";
		bind_type = "NONE";
	} else {
		action = " set";
		if (job->cpu_bind_type & CPU_BIND_TO_THREADS)
			units = "_threads";
		else if (job->cpu_bind_type & CPU_BIND_TO_CORES)
			units = "_cores";
		else if (job->cpu_bind_type & CPU_BIND_TO_SOCKETS)
			units = "_sockets";
		else if (job->cpu_bind_type & CPU_BIND_TO_LDOMS)
			units = "_ldoms";
		else
			units = "";
		if (job->cpu_bind_type & CPU_BIND_RANK) {
			bind_type = "RANK";
		} else if (job->cpu_bind_type & CPU_BIND_MAP) {
			bind_type = "MAP ";
		} else if (job->cpu_bind_type & CPU_BIND_MASK) {
			bind_type = "MASK";
		} else if (job->cpu_bind_type & CPU_BIND_LDRANK) {
			bind_type = "LDRANK";
		} else if (job->cpu_bind_type & CPU_BIND_LDMAP) {
			bind_type = "LDMAP ";
		} else if (job->cpu_bind_type & CPU_BIND_LDMASK) {
			bind_type = "LDMASK";
		} else if (job->cpu_bind_type & (~CPU_BIND_VERBOSE)) {
			bind_type = "UNK ";
		} else {
			action = "";
			bind_type = "NULL";
		}
	}

	fprintf(stderr, "cpu_bind%s=%s - "
			"%s, task %2u %2u [%u]: mask 0x%s%s%s\n",
		units, bind_type,
		conf->hostname,
		task_gid,
		task_lid,
		mypid,
		cpuset_to_str(mask, mstr),
		action,
		status);
}
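/*
 * With CPU_BIND_VERBOSE set, the function above emits one stderr line per
 * task.  Given the format string, a bind-to-cores MASK binding would look
 * roughly like this (hostname, PIDs, and mask are made-up values):
 *
 *   cpu_bind_cores=MASK - node0, task  0  0 [4321]: mask 0x3 set
 */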
int main(int argc, char *argv[])
{
	cpu_set_t new_mask, cur_mask;
	pid_t pid = 0;
	int opt, err;
	char mstr[1 + CPU_SETSIZE / 4];
	char cstr[7 * CPU_SETSIZE];
	int c_opt = 0;

	struct option longopts[] = {
		{ "pid",	0, NULL, 'p' },
		{ "cpu-list",	0, NULL, 'c' },
		{ "help",	0, NULL, 'h' },
		{ "version",	0, NULL, 'V' },
		{ NULL,		0, NULL, 0 }
	};

	while ((opt = getopt_long(argc, argv, "+pchV", longopts, NULL)) != -1) {
		int ret = 1;

		switch (opt) {
		case 'p':
			pid = atoi(argv[argc - 1]);
			break;
		case 'c':
			c_opt = 1;
			break;
		case 'V':
			printf("taskset version " VERSION "\n");
			return 0;
		case 'h':
			ret = 0;
			/* fall through: -h prints usage but exits 0 */
		default:
			show_usage(argv[0]);
			return ret;
		}
	}

	if ((!pid && argc - optind < 2) ||
	    (pid && (argc - optind < 1 || argc - optind > 2))) {
		show_usage(argv[0]);
		return 1;
	}

	if (pid) {
		if (sched_getaffinity(pid, sizeof(cur_mask), &cur_mask) < 0) {
			perror("sched_getaffinity");
			fprintf(stderr, "failed to get pid %d's affinity\n",
				pid);
			return 1;
		}
		if (c_opt)
			printf("pid %d's current affinity list: %s\n", pid,
			       cpuset_to_cstr(&cur_mask, cstr));
		else
			printf("pid %d's current affinity mask: %s\n", pid,
			       cpuset_to_str(&cur_mask, mstr));
		if (argc - optind == 1)
			return 0;
	}

	if (c_opt)
		err = cstr_to_cpuset(&new_mask, argv[optind]);
	else
		err = str_to_cpuset(&new_mask, argv[optind]);

	if (err) {
		if (c_opt)
			fprintf(stderr, "failed to parse CPU list %s\n",
				argv[optind]);
		else
			fprintf(stderr, "failed to parse CPU mask %s\n",
				argv[optind]);
		return 1;
	}

	if (sched_setaffinity(pid, sizeof(new_mask), &new_mask)) {
		perror("sched_setaffinity");
		fprintf(stderr, "failed to set pid %d's affinity.\n", pid);
		return 1;
	}

	if (sched_getaffinity(pid, sizeof(cur_mask), &cur_mask) < 0) {
		perror("sched_getaffinity");
		fprintf(stderr, "failed to get pid %d's affinity.\n", pid);
		return 1;
	}

	if (pid) {
		if (c_opt)
			printf("pid %d's new affinity list: %s\n", pid,
			       cpuset_to_cstr(&cur_mask, cstr));
		else
			printf("pid %d's new affinity mask: %s\n", pid,
			       cpuset_to_str(&cur_mask, mstr));
	} else {
		argv += optind + 1;
		execvp(argv[0], argv);
		perror("execvp");
		fprintf(stderr, "failed to execute %s\n", argv[0]);
		return 1;
	}
	return 0;
}
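/*
 * cpuset_to_cstr() renders the "-c" human-readable list form ("0-3,8"),
 * which is why cstr is sized at 7 * CPU_SETSIZE (generous worst case of a
 * multi-digit CPU number, a range dash, and a comma per entry).  A sketch
 * of such a formatter (an assumption; the shipped helper may collapse
 * ranges differently; requires <stdio.h> and <sched.h> with _GNU_SOURCE):
 */
static char *example_cpuset_to_cstr(const cpu_set_t *mask, char *str)
{
	int i = 0;
	char *ptr = str;

	while (i < CPU_SETSIZE) {
		if (CPU_ISSET(i, mask)) {
			int run = 0, j;

			/* Count the extra CPUs in this consecutive run. */
			for (j = i + 1;
			     j < CPU_SETSIZE && CPU_ISSET(j, mask); j++)
				run++;
			if (ptr != str)
				*ptr++ = ',';
			if (run == 0)
				ptr += sprintf(ptr, "%d", i);
			else
				ptr += sprintf(ptr, "%d-%d", i, i + run);
			i += run + 1;
		} else {
			i++;
		}
	}
	*ptr = '\0';
	return str;
}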
/* user to have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	char mstr[1 + CPU_SETSIZE / 4];
	cpu_bind_type_t bind_type;
	cpu_set_t ts;
	hwloc_obj_t obj;
	hwloc_obj_type_t socket_or_node;
	hwloc_topology_t topology;
	hwloc_bitmap_t cpuset;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;
	int bind_verbose = 0;
	int rc = SLURM_SUCCESS, match;
	pid_t pid = job->envtp->task_pid;
	size_t tssize;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus;
	int spec_threads = 0;

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);
	cpuset = hwloc_bitmap_alloc();

	if (job->batch) {
		jnpus = job->cpus;
		job->cpus_per_task = job->cpus;
	} else
		jnpus = jntasks * job->cpus_per_task;

	bind_type = job->cpu_bind_type;
	if ((conf->task_plugin_param & CPU_BIND_VERBOSE) ||
	    (bind_type & CPU_BIND_VERBOSE))
		bind_verbose = 1;

	if (hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
	    hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET)) {
		/* One socket contains multiple NUMA-nodes,
		 * as on the AMD Opteron 6000 series etc.
		 * In such cases, use the NUMA-node instead of the socket. */
		socket_or_node = HWLOC_OBJ_NODE;
	} else {
		socket_or_node = HWLOC_OBJ_SOCKET;
	}

	if (bind_type & CPU_BIND_NONE) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding", taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding", taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding", taskid);
		req_hwtype = socket_or_node;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding", taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else if (bind_type & CPU_BIND_TO_BOARDS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "board level binding", taskid);
		req_hwtype = HWLOC_OBJ_GROUP;
	} else if (bind_type & bind_mode_ldom) {
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (bind_verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default", taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/*
	 * Perform the topology detection.  It will only get allowed PUs.
	 * At the same time, detect the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available, since with hyperthread support the
	 * ntasks-per-core parameter can give each task access to more
	 * threads per core.
	 * Revert to machine granularity if no finer-grained granularity
	 * matching the request is found.  This will result in no affinity
	 * being applied.
	 * The detected granularity will be used to find where to best place
	 * the task, then the cpu_bind option will be used to relax the
	 * affinity constraint and use more PUs (i.e. use a core granularity
	 * to dispatch the tasks across the sockets, then give each task
	 * access to the cores of its socket).
	 */
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       socket_or_node);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE);
	//info("PU:%d CORE:%d SOCK:%d LDOM:%d", npus, ncores, nsockets, nldoms);

	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if ((job->job_core_spec != (uint16_t) NO_VAL) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)) {
		spec_threads = job->job_core_spec & (~CORE_SPEC_THREAD);
	}
	if (npus >= (jnpus + spec_threads) ||
	    bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks && bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = socket_or_node;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources).  So there is no guarantee that each ldom will be
	 * populated with usable sockets.  Add a simple check that at least
	 * ensures we have as many sockets as ldoms before moving to ldom
	 * granularity.
	 */
	if (nldoms >= jntasks &&
	    nsockets >= nldoms &&
	    bind_type & (CPU_BIND_TO_LDOMS | bind_mode_ldom)) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * If there are not enough objects to do the job, revert to
	 * no-affinity mode.
	 */
	if (hwloc_compare_types(hwtype, HWLOC_OBJ_MACHINE) == 0) {
		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity", taskid, hwloc_obj_type_string(hwtype));
	} else if ((hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) &&
		   (nobj < jnpus)) {
		info("task/cgroup: task[%u] not enough %s objects (%d < %d), "
		     "disabling affinity",
		     taskid, hwloc_obj_type_string(hwtype), nobj, jnpus);
	} else if (bind_type & bind_mode) {
		/* Explicit binding mode specified by the user.
		 * Bind the taskid in accordance with the specified mode. */
		obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_MACHINE, 0);
		match = hwloc_bitmap_isequal(obj->complete_cpuset,
					     obj->allowed_cpuset);
		if ((job->job_core_spec == (uint16_t) NO_VAL) && !match) {
			info("task/cgroup: entire node must be allocated, "
			     "disabling affinity, task[%u]", taskid);
			fprintf(stderr, "Requested cpu_bind option requires "
				"entire node to be allocated; disabling "
				"affinity\n");
		} else {
			if (bind_verbose) {
				info("task/cgroup: task[%u] is requesting "
				     "explicit binding mode", taskid);
			}
			_get_sched_cpuset(topology, hwtype, req_hwtype, &ts,
					  job);
			tssize = sizeof(cpu_set_t);
			fstatus = SLURM_SUCCESS;
			if (job->job_core_spec != (uint16_t) NO_VAL)
				_validate_mask(taskid, obj, &ts);
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "mask 0x%s", taskid,
				      cpuset_to_str(&ts, mstr));
				error("sched_setaffinity rc = %d", rc);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] mask 0x%s",
				     taskid, cpuset_to_str(&ts, mstr));
			}
			_slurm_chkaffinity(&ts, job, rc);
		}
	} else {
		/* Bind the detected object to the taskid, respecting the
		 * granularity, using the designated or default distribution
		 * method (block or cyclic). */
		char *str;

		if (bind_verbose) {
			info("task/cgroup: task[%u] using %s granularity dist %u",
			     taskid, hwloc_obj_type_string(hwtype),
			     job->task_dist);
		}

		/* See the srun man page for detailed information on the
		 * --distribution option.
		 *
		 * You can see the equivalent code for the task/affinity
		 * plugin in src/plugins/task/affinity/dist_tasks.c,
		 * around line 368. */
		switch (job->task_dist & SLURM_DIST_NODESOCKMASK) {
		case SLURM_DIST_BLOCK_BLOCK:
		case SLURM_DIST_CYCLIC_BLOCK:
		case SLURM_DIST_PLANE:
			/* tasks are distributed in blocks within a plane */
			_task_cgroup_cpuset_dist_block(topology,
						       hwtype, req_hwtype,
						       nobj, job, bind_verbose,
						       cpuset);
			break;
		case SLURM_DIST_ARBITRARY:
		case SLURM_DIST_BLOCK:
		case SLURM_DIST_CYCLIC:
		case SLURM_DIST_UNKNOWN:
			if (slurm_get_select_type_param() &
			    CR_CORE_DEFAULT_DIST_BLOCK) {
				_task_cgroup_cpuset_dist_block(topology,
							       hwtype,
							       req_hwtype,
							       nobj, job,
							       bind_verbose,
							       cpuset);
				break;
			}
			/* We want to fall through here if we aren't doing a
			   default dist block. */
		default:
			_task_cgroup_cpuset_dist_cyclic(topology,
							hwtype, req_hwtype,
							job, bind_verbose,
							cpuset);
			break;
		}

		hwloc_bitmap_asprintf(&str, cpuset);

		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset,
							 &ts, tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'", taskid, str);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] set taskset '%s'",
				     taskid, str);
			}
			_slurm_chkaffinity(&ts, job, rc);
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'", taskid, str);
			fstatus = SLURM_ERROR;
		}
		free(str);
	}

	/* Destroy hwloc objects */
	hwloc_bitmap_free(cpuset);
	hwloc_topology_destroy(topology);

	return fstatus;
#endif
}
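/*
 * The hwloc calls above follow a fixed pattern: load the topology, pick a
 * granularity, fill an hwloc_bitmap_t, then hand it to the glibc scheduler
 * via hwloc_cpuset_to_glibc_sched_affinity().  The stand-alone sketch
 * below shows that pattern in isolation, binding the calling process to
 * its first core; it is an illustration, not code from the plugin:
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <hwloc.h>
#include <hwloc/glibc-sched.h>

int main(void)
{
	hwloc_topology_t topology;
	hwloc_obj_t core;
	cpu_set_t ts;
	int rc = 1;

	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);

	/* First core of the machine (NULL means "first object"). */
	core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, NULL);
	if (core &&
	    hwloc_cpuset_to_glibc_sched_affinity(topology, core->cpuset,
						 &ts, sizeof(ts)) == 0) {
		rc = sched_setaffinity(0, sizeof(ts), &ts);
		if (rc)
			perror("sched_setaffinity");
	}

	hwloc_topology_destroy(topology);
	return rc ? 1 : 0;
}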
/* user to have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	char mstr[1 + CPU_SETSIZE / 4];
	cpu_bind_type_t bind_type;
	cpu_set_t ts;
	hwloc_obj_t obj;
	hwloc_obj_type_t socket_or_node;
	hwloc_topology_t topology;
	hwloc_bitmap_t cpuset;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;
	int bind_verbose = 0;
	int rc = SLURM_SUCCESS;
	pid_t pid = job->envtp->task_pid;
	size_t tssize;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus = jntasks * job->cpus_per_task;

	bind_type = job->cpu_bind_type;
	if (conf->task_plugin_param & CPU_BIND_VERBOSE ||
	    bind_type & CPU_BIND_VERBOSE)
		bind_verbose = 1;

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);
	cpuset = hwloc_bitmap_alloc();

	if (hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
	    hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET)) {
		/* One socket contains multiple NUMA-nodes,
		 * as on the AMD Opteron 6000 series etc.
		 * In such cases, use the NUMA-node instead of the socket. */
		socket_or_node = HWLOC_OBJ_NODE;
	} else {
		socket_or_node = HWLOC_OBJ_SOCKET;
	}

	if (bind_type & CPU_BIND_NONE) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding", taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding", taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding", taskid);
		req_hwtype = socket_or_node;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding", taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else if (bind_type & CPU_BIND_TO_BOARDS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "board level binding", taskid);
		req_hwtype = HWLOC_OBJ_GROUP;
	} else if (bind_type & bind_mode_ldom) {
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (bind_verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default", taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/*
	 * Perform the topology detection.  It will only get allowed PUs.
	 * At the same time, detect the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available, since with hyperthread support the
	 * ntasks-per-core parameter can give each task access to more
	 * threads per core.
	 * Revert to machine granularity if no finer-grained granularity
	 * matching the request is found.  This will result in no affinity
	 * being applied.
	 * The detected granularity will be used to find where to best place
	 * the task, then the cpu_bind option will be used to relax the
	 * affinity constraint and use more PUs (i.e. use a core granularity
	 * to dispatch the tasks across the sockets, then give each task
	 * access to the cores of its socket).
	 */
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       socket_or_node);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE);

	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks && bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = socket_or_node;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources).  So there is no guarantee that each ldom will be
	 * populated with usable sockets.  Add a simple check that at least
	 * ensures we have as many sockets as ldoms before moving to ldom
	 * granularity.
	 */
	if (nldoms >= jntasks &&
	    nsockets >= nldoms &&
	    bind_type & (CPU_BIND_TO_LDOMS | bind_mode_ldom)) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * If there are not enough objects to do the job, revert to
	 * no-affinity mode.
	 */
	if (hwloc_compare_types(hwtype, HWLOC_OBJ_MACHINE) == 0) {
		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity", taskid, hwloc_obj_type_string(hwtype));
	} else if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0 &&
		   jnpus > nobj) {
		info("task/cgroup: task[%u] not enough %s objects, disabling "
		     "affinity", taskid, hwloc_obj_type_string(hwtype));
	} else if (bind_type & bind_mode) {
		/* Explicit binding mode specified by the user.
		 * Bind the taskid in accordance with the specified mode. */
		obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_MACHINE, 0);
		if (!hwloc_bitmap_isequal(obj->complete_cpuset,
					  obj->allowed_cpuset)) {
			info("task/cgroup: entire node must be allocated, "
			     "disabling affinity, task[%u]", taskid);
			fprintf(stderr, "Requested cpu_bind option requires "
				"entire node to be allocated; disabling "
				"affinity\n");
		} else {
			if (bind_verbose)
				info("task/cgroup: task[%u] is requesting "
				     "explicit binding mode", taskid);
			_get_sched_cpuset(topology, hwtype, req_hwtype, &ts,
					  job);
			tssize = sizeof(cpu_set_t);
			fstatus = SLURM_SUCCESS;
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "mask 0x%s", taskid,
				      cpuset_to_str(&ts, mstr));
				fstatus = SLURM_ERROR;
			} else if (bind_verbose)
				info("task/cgroup: task[%u] mask 0x%s",
				     taskid, cpuset_to_str(&ts, mstr));
			slurm_chkaffinity(&ts, job, rc);
		}
	} else {
		/* Bind the detected object to the taskid, respecting the
		 * granularity, using the designated or default distribution
		 * method (block or cyclic). */
		char *str;

		if (bind_verbose) {
			info("task/cgroup: task[%u] using %s granularity",
			     taskid, hwloc_obj_type_string(hwtype));
		}

		/* There are two "distributions," controlled by the -m option
		 * of srun and friends.  The first is the distribution of
		 * tasks to nodes.  The second is the distribution of
		 * allocated cpus to tasks for binding.  This code handles
		 * the second distribution.  Here's how the values get set,
		 * based on the value of -m:
		 *
		 * SLURM_DIST_CYCLIC        = srun -m cyclic
		 * SLURM_DIST_BLOCK         = srun -m block
		 * SLURM_DIST_CYCLIC_CYCLIC = srun -m cyclic:cyclic
		 * SLURM_DIST_BLOCK_CYCLIC  = srun -m block:cyclic
		 *
		 * In the first two cases, the user only specified the first
		 * distribution, and the second distribution defaults to
		 * cyclic.  In the last two cases, the user explicitly
		 * requested a second distribution of cyclic.  So all four
		 * cases correspond to a second distribution of cyclic, and
		 * we want to call _task_cgroup_cpuset_dist_cyclic.
		 *
		 * If the user explicitly specifies a second distribution of
		 * block, or if CR_CORE_DEFAULT_DIST_BLOCK is configured and
		 * the user does not explicitly specify a second distribution
		 * of cyclic, the second distribution is block, and we need
		 * to call _task_cgroup_cpuset_dist_block.  In these cases,
		 * task_dist would be set to SLURM_DIST_CYCLIC_BLOCK or
		 * SLURM_DIST_BLOCK_BLOCK.
		 *
		 * You can see the equivalent code for the task/affinity
		 * plugin in src/plugins/task/affinity/dist_tasks.c,
		 * around line 384. */
		switch (job->task_dist) {
		case SLURM_DIST_CYCLIC:
		case SLURM_DIST_BLOCK:
		case SLURM_DIST_CYCLIC_CYCLIC:
		case SLURM_DIST_BLOCK_CYCLIC:
			_task_cgroup_cpuset_dist_cyclic(topology,
							hwtype, req_hwtype,
							job, bind_verbose,
							cpuset);
			break;
		default:
			_task_cgroup_cpuset_dist_block(topology,
						       hwtype, req_hwtype,
						       nobj, job, bind_verbose,
						       cpuset);
		}

		hwloc_bitmap_asprintf(&str, cpuset);

		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset,
							 &ts, tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'", taskid, str);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] taskset '%s' is "
				     "set", taskid, str);
			}
			slurm_chkaffinity(&ts, job, rc);
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'", taskid, str);
			fstatus = SLURM_ERROR;
		}
		free(str);
	}

	/* Destroy hwloc objects */
	hwloc_bitmap_free(cpuset);
	hwloc_topology_destroy(topology);

	return fstatus;
#endif
}
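/*
 * The block/cyclic distinction above reduces to how task ranks are mapped
 * onto the nobj bindable objects.  Ignoring the real helpers' handling of
 * cpus_per_task and of the object hierarchy, the core index arithmetic is
 * roughly the following (illustrative sketch only, not plugin code):
 */
static unsigned int example_dist_obj_index(unsigned int taskid,
					   unsigned int ntasks,
					   unsigned int nobj,
					   int block)
{
	if (block) {
		/* block: fill each object with consecutive task ranks */
		unsigned int per_obj = (ntasks + nobj - 1) / nobj;
		return taskid / per_obj;
	}
	/* cyclic: deal task ranks out across objects round-robin */
	return taskid % nobj;
}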
/* Be more careful with at least the affinity call; someone may use an
   affinity-compiled version on a non-affinity kernel.
   This is getting more and more fu-gly. */
void print_process(pid_t pid)
{
	int policy, nice;
	struct sched_param_ex p;
	cpu_set_t aff_mask;
	CPUSET_HEXSTRING(aff_mask_hex);

	CPU_ZERO(&aff_mask);

	/* strict error checking not needed - it works or not. */
	errno = 0;
	if (((policy = sched_getscheduler(pid)) < 0)
	    || (sched_getparam_ex(pid, sizeof(p), &p) < 0)
	    /* getpriority may successfully return negative values,
	       so errno needs to be checked */
	    || ((nice = getpriority(PRIO_PROCESS, pid)) && errno)) {
		decode_error("could not get scheduling-information for PID %d",
			     pid);
	} else {
		/* do custom output for unknown policy */
		if (!CHECK_RANGE_POLICY(policy)) {
			printf("PID %5d: PRIO %3d, POLICY %-5d <UNKNOWN>, NICE %3d",
			       pid, p.sched_priority, policy, nice);
		} else {
			printf("PID %5d: PRIO %3d, POLICY %-17s, NICE %3d",
			       pid, p.sched_priority, TAB[policy], nice);
			if (policy == SCHED_DEADLINE) {
				printf(", RUNTIME %Ldus DEADLINE %Ldus FLAGS 0x%04x"
				       ", CURR. RUNTIME %Ldus USED RUNTIME %Ldus",
				       tspec_to_us(&p.sched_runtime),
				       tspec_to_us(&p.sched_deadline),
				       p.sched_flags,
				       tspec_to_us(&p.curr_runtime),
				       tspec_to_us(&p.used_runtime));
			}
		}
		/* sched_getaffinity() seems to also return (int)4 on 2.6.8+
		   on x86 when successful; this goes against the
		   documentation */
		if (sched_getaffinity(pid, sizeof(aff_mask), &aff_mask) == -1) {
			/* error or -ENOSYS - simply ignore and reset errno! */
			errno = 0;
		} else {
			printf(", AFFINITY 0x%s",
			       cpuset_to_str(&aff_mask, aff_mask_hex));
		}
		printf("\n");
	}
}
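/*
 * tspec_to_us() above (and us_to_tspec() in engine() below) convert
 * between the SCHED_DEADLINE time parameters and plain microsecond
 * counts.  Assuming the sched_param_ex fields really are struct timespec
 * values, minimal versions could look like this (a sketch, not
 * schedtool's code; requires <time.h>):
 */
static long long example_tspec_to_us(const struct timespec *ts)
{
	return (long long) ts->tv_sec * 1000000LL + ts->tv_nsec / 1000;
}

static struct timespec example_us_to_tspec(long long us)
{
	struct timespec ts;

	ts.tv_sec  = us / 1000000LL;
	ts.tv_nsec = (us % 1000000LL) * 1000LL;
	return ts;
}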
int engine(struct engine_s *e)
{
	int ret = 0;
	int i;

#ifdef DEBUG
	do {
		CPUSET_HEXSTRING(tmpaff);

		printf("Dumping mode: 0x%x\n", e->mode);
		printf("Dumping affinity: 0x%s\n",
		       cpuset_to_str(&(e->aff_mask), tmpaff));
		printf("We have %d args to do\n", e->n);
		for (i = 0; i < e->n; i++) {
			printf("Dump arg %d: %s\n", i, e->args[i]);
		}
	} while (0);
#endif

	/* handle normal query/set operation: set/query all given PIDs */
	for (i = 0; i < e->n; i++) {
		int pid, tmpret = 0;
		cpu_set_t affi;

		CPU_ZERO(&affi);
		CPU_SET(0, &affi);

		/* if in MODE_EXEC skip check for PIDs */
		if (mode_set(e->mode, MODE_EXEC)) {
			pid = getpid();
			goto exec_mode_special;
		}

		if (!(isdigit(*(e->args[i])))) {
			decode_error("Ignoring arg %s: is not a PID",
				     e->args[i]);
			continue;
		}

		pid = atoi(e->args[i]);

	exec_mode_special:
		if (mode_set(e->mode, MODE_SETPOLICY)) {
			struct sched_param_ex p;

			p.sched_priority = e->prio;
			p.sched_runtime  = us_to_tspec(e->rtime);
			p.sched_deadline = us_to_tspec(e->dline);
			p.sched_period   = us_to_tspec(e->priod);
			p.sched_flags    = e->flags;

			/* accumulate possible errors - the return value of
			   main will indicate how many set-calls went wrong;
			   set_process returns -1 upon failure */
			tmpret = set_process(pid, e->policy, &p);
			ret += tmpret;

			/* don't proceed as something went wrong already */
			if (tmpret) {
				continue;
			}
		}

		if (mode_set(e->mode, MODE_NICE)) {
			tmpret = set_niceness(pid, e->nice);
			ret += tmpret;
			if (tmpret) {
				continue;
			}
		}

		if (mode_set(e->mode, MODE_AFFINITY)) {
			tmpret = set_affinity(pid, &(e->aff_mask));
			ret += tmpret;
			if (tmpret) {
				continue;
			}
		}

		/* and print process info when set, too */
		if (mode_set(e->mode, MODE_PRINT)) {
			print_process(pid);
		}

		/* EXECUTE: at the end */
		if (mode_set(e->mode, MODE_EXEC)) {
			char **new_argv = e->args;

			ret = execvp(*new_argv, new_argv);
			/* only reached on error */
			decode_error("schedtool: Could not exec %s",
				     *new_argv);
			return (ret);
		}
	}

	/* indicate how many errors we got; as ret is accumulated negative,
	   convert to positive */
	return (abs(ret));
}
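/*
 * mode_set() is schedtool's flag test.  Given how it is used throughout
 * engine(), it is presumably just a bitwise AND of the accumulated mode
 * word against one MODE_* flag; a plausible sketch (an assumption, not
 * the verbatim macro):
 */
#define mode_set(mode, flag) ((mode) & (flag))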