static int _parse_resv_core_cnt(resv_desc_msg_t *resv_msg_ptr, char *val,
				bool from_tres)
{
	char *endptr = NULL, *core_cnt, *tok, *ptrptr = NULL;
	char *type;
	int node_inx = 0;

	type = slurm_get_select_type();
	if (strcasestr(type, "cray")) {
		int param;
		param = slurm_get_select_type_param();
		if (! (param & CR_OTHER_CONS_RES)) {
			error("CoreCnt or CPUCnt is only "
			      "supported when "
			      "SelectTypeParameters "
			      "includes OTHER_CONS_RES");
			xfree(type);
			return SLURM_ERROR;
		}
	} else if (strcasestr(type, "cons_res") == NULL) {
		error("CoreCnt or CPUCnt is only "
		      "supported when "
		      "SelectType includes "
		      "select/cons_res");
		xfree(type);
		return SLURM_ERROR;
	}
	xfree(type);

	core_cnt = xstrdup(val);
	tok = strtok_r(core_cnt, ",", &ptrptr);
	while (tok) {
		xrealloc(resv_msg_ptr->core_cnt,
			 sizeof(uint32_t) * (node_inx + 2));
		resv_msg_ptr->core_cnt[node_inx] = strtol(tok, &endptr, 10);
		if ((endptr == NULL) || (endptr[0] != '\0') ||
		    (tok[0] == '\0')) {
			exit_code = 1;
			if (from_tres)
				error("Invalid TRES core count %s", val);
			else
				error("Invalid core count %s", val);
			xfree(core_cnt);
			return SLURM_ERROR;
		}
		node_inx++;
		tok = strtok_r(NULL, ",", &ptrptr);
	}
	xfree(core_cnt);

	return SLURM_SUCCESS;
}
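/*
 * A minimal standalone sketch (not part of Slurm; parse_counts and the
 * harness below are hypothetical) of the parsing loop above: split a
 * comma-separated list such as "4,8,16" with strtok_r(), convert each
 * token with strtol(), and reject empty tokens or tokens with trailing
 * garbage, exactly as the endptr checks above do.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_counts(const char *val, long *out, int max)
{
	char *copy = strdup(val), *tok, *saveptr = NULL, *endptr;
	int n = 0;

	for (tok = strtok_r(copy, ",", &saveptr); tok && (n < max);
	     tok = strtok_r(NULL, ",", &saveptr)) {
		out[n] = strtol(tok, &endptr, 10);
		if ((endptr[0] != '\0') || (tok[0] == '\0')) {
			free(copy);
			return -1;	/* e.g. "4,8x" or "4,," */
		}
		n++;
	}
	free(copy);
	return n;
}

int main(void)
{
	long cnt[8];
	int i, n = parse_counts("4,8,16", cnt, 8);

	for (i = 0; i < n; i++)
		printf("core_cnt[%d] = %ld\n", i, cnt[i]);
	return 0;
}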
/*
 * init() is called when the plugin is loaded, before any other functions
 * are called. Put global initialization here.
 */
extern int init(void)
{
	/* We must call the api here since we call this from places
	 * other than the slurmctld. */
	uint16_t select_type_param = slurm_get_select_type_param();

	if (select_type_param & CR_OTHER_CONS_RES)
		plugin_id = 108;
	debug_flags = slurm_get_debug_flags();

#ifdef HAVE_NATIVE_CRAY
	// Spawn the aeld thread, only in slurmctld.
	if (run_in_daemon("slurmctld")) {
		_spawn_cleanup_thread(NULL, _aeld_event_loop);
	}
#endif

	verbose("%s loaded", plugin_name);
	return SLURM_SUCCESS;
}
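/*
 * A minimal standalone sketch (not Slurm code; the CR_OTHER_CONS_RES
 * bit value below is a made-up placeholder) of the flag test in init()
 * above: a single bit in SelectTypeParameters decides whether the Cray
 * select plugin reports itself as the cons_res-backed variant (108).
 */
#include <stdio.h>
#include <stdint.h>

#define CR_OTHER_CONS_RES 0x0040	/* placeholder bit value */

int main(void)
{
	uint16_t select_type_param = CR_OTHER_CONS_RES; /* as if from slurm.conf */
	uint32_t plugin_id = 107;	/* assumed base id, for illustration */

	if (select_type_param & CR_OTHER_CONS_RES)
		plugin_id = 108;	/* matches the assignment in init() */
	printf("plugin_id = %u\n", plugin_id);
	return 0;
}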
/*
 * Initialize context for node selection plugin
 */
extern int slurm_select_init(bool only_default)
{
	int retval = SLURM_SUCCESS;
	char *type = NULL;
	int i, j, len;
	DIR *dirp;
	struct dirent *e;
	char *dir_array = NULL, *head = NULL;
	char *plugin_type = "select";

	if (init_run && select_context)
		return retval;

	slurm_mutex_lock(&select_context_lock);
	if (select_context)
		goto done;

	type = slurm_get_select_type();
	if (working_cluster_rec) {
		/* just ignore warnings here */
	} else {
#ifdef HAVE_XCPU
		if (strcasecmp(type, "select/linear")) {
			error("%s is incompatible with XCPU use", type);
			fatal("Use SelectType=select/linear");
		}
#endif
		if (!strcasecmp(type, "select/linear")) {
			uint16_t cr_type = slurm_get_select_type_param();
			if ((cr_type & CR_SOCKET) || (cr_type & CR_CORE) ||
			    (cr_type & CR_CPU))
				fatal("Invalid SelectTypeParameter "
				      "for select/linear");
		}
#ifdef HAVE_BG
		if (strcasecmp(type, "select/bluegene")) {
			error("%s is incompatible with BlueGene", type);
			fatal("Use SelectType=select/bluegene");
		}
#else
		if (!strcasecmp(type, "select/bluegene")) {
			fatal("Requested SelectType=select/bluegene "
			      "in slurm.conf, but not running on a BG[L|P|Q] "
			      "system. If looking to emulate a BG[L|P|Q] "
			      "system use --enable-bgl-emulation or "
			      "--enable-bgp-emulation respectively.");
		}
#endif
#ifdef HAVE_ALPS_CRAY
		if (strcasecmp(type, "select/alps")) {
			error("%s is incompatible with a Cray system "
			      "running ALPS", type);
			fatal("Use SelectType=select/alps");
		}
#else
		if (!strcasecmp(type, "select/alps")) {
			fatal("Requested SelectType=select/alps "
			      "in slurm.conf, but not running on an ALPS "
			      "Cray system. If looking to emulate an ALPS "
			      "Cray system use --enable-alps-cray-emulation.");
		}
#endif
	}

	select_context_cnt = 0;
	if (only_default) {
		ops = xmalloc(sizeof(slurm_select_ops_t));
		select_context = xmalloc(sizeof(plugin_context_t));
		if ((select_context[0] = plugin_context_create(
			     plugin_type, type, (void **)&ops[0],
			     node_select_syms, sizeof(node_select_syms)))) {
			select_context_default = 0;
			select_context_cnt++;
		}
		goto skip_load_all;
	}

	if (!(dir_array = slurm_get_plugin_dir())) {
		error("plugin_load_and_link: No plugin dir given");
		goto done;
	}

	head = dir_array;
	for (i = 0; ; i++) {
		bool got_colon = 0;
		if (dir_array[i] == ':') {
			dir_array[i] = '\0';
			got_colon = 1;
		} else if (dir_array[i] != '\0')
			continue;

		/* Open the directory. */
		if (!(dirp = opendir(head))) {
			error("cannot open plugin directory %s", head);
			goto done;
		}

		while (1) {
			char full_name[128];

			if (!(e = readdir(dirp)))
				break;
			/* Check only files with "select_" in them. */
			if (strncmp(e->d_name, "select_", 7))
				continue;

			len = strlen(e->d_name);
#if defined(__CYGWIN__)
			len -= 4;
#else
			len -= 3;
#endif
			/* Check only shared object files */
			if (strcmp(e->d_name + len,
#if defined(__CYGWIN__)
				   ".dll"
#else
				   ".so"
#endif
				   ))
				continue;
			/* add one for the / */
			len++;
			xassert(len < sizeof(full_name));
			snprintf(full_name, len, "select/%s", e->d_name + 7);
			for (j = 0; j < select_context_cnt; j++) {
				if (!strcmp(full_name,
					    select_context[j]->type))
					break;
			}
			if (j >= select_context_cnt) {
				xrealloc(ops,
					 (sizeof(slurm_select_ops_t) *
					  (select_context_cnt + 1)));
				xrealloc(select_context,
					 (sizeof(plugin_context_t) *
					  (select_context_cnt + 1)));
				select_context[select_context_cnt] =
					plugin_context_create(
						plugin_type, full_name,
						(void **)&ops[
							select_context_cnt],
						node_select_syms,
						sizeof(node_select_syms));
				if (select_context[select_context_cnt]) {
					/* set the default */
					if (!strcmp(full_name, type))
						select_context_default =
							select_context_cnt;
					select_context_cnt++;
				}
			}
		}
		closedir(dirp);

		if (got_colon) {
			head = dir_array + i + 1;
		} else
			break;
	}

skip_load_all:
	if (select_context_default == -1)
		fatal("Can't find plugin for %s", type);

	/* Ensure that plugin_id is valid and unique */
	for (i = 0; i < select_context_cnt; i++) {
		for (j = i + 1; j < select_context_cnt; j++) {
			if (*(ops[i].plugin_id) != *(ops[j].plugin_id))
				continue;
			fatal("SelectPlugins: Duplicate plugin_id %u for "
			      "%s and %s",
			      *(ops[i].plugin_id),
			      select_context[i]->type,
			      select_context[j]->type);
		}
		if (*(ops[i].plugin_id) < 100) {
			fatal("SelectPlugins: Invalid plugin_id %u (<100) %s",
			      *(ops[i].plugin_id),
			      select_context[i]->type);
		}
	}
	init_run = true;

done:
	slurm_mutex_unlock(&select_context_lock);
	xfree(type);
	xfree(dir_array);
	return retval;
}
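/*
 * A standalone sketch (hypothetical, outside Slurm) of the filename
 * trick used above. For "select_cons_res.so", len is set to
 * strlen - 3 (drop ".so") + 1 (room for the NUL), so snprintf()
 * truncates "select/cons_res.so" to exactly "select/cons_res".
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *d_name = "select_cons_res.so";
	char full_name[128];
	int len = strlen(d_name);	/* 18 */

	len -= 3;	/* 15: length of "select/cons_res" */
	len++;		/* 16: snprintf size includes the NUL */
	snprintf(full_name, len, "select/%s", d_name + 7);
	printf("%s\n", full_name);	/* prints "select/cons_res" */
	return 0;
}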
/*
 * lllp_distribution
 *
 * Note: lllp stands for Lowest Level of Logical Processors.
 *
 * When automatic binding is enabled:
 *      - no binding flags set >= CPU_BIND_NONE, and
 *      - an auto binding level selected CPU_BIND_TO_{SOCKETS,CORES,THREADS}
 * Otherwise limit job step to the allocated CPUs
 *
 * generate the appropriate cpu_bind type and string which results in
 * the specified lllp distribution.
 *
 * IN/OUT- job launch request (cpu_bind_type and cpu_bind updated)
 * IN- global task id array
 */
void lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id)
{
	int rc = SLURM_SUCCESS;
	bitstr_t **masks = NULL;
	char buf_type[100];
	int maxtasks = req->tasks_to_launch[(int)node_id];
	int whole_nodes, whole_sockets, whole_cores, whole_threads;
	int part_sockets, part_cores;
	const uint32_t *gtid = req->global_task_ids[(int)node_id];
	static uint16_t bind_entity = CPU_BIND_TO_THREADS | CPU_BIND_TO_CORES |
				      CPU_BIND_TO_SOCKETS | CPU_BIND_TO_LDOMS;
	static uint16_t bind_mode = CPU_BIND_NONE | CPU_BIND_MASK |
				    CPU_BIND_RANK | CPU_BIND_MAP |
				    CPU_BIND_LDMASK | CPU_BIND_LDRANK |
				    CPU_BIND_LDMAP;
	static int only_one_thread_per_core = -1;

	if (only_one_thread_per_core == -1) {
		if (conf->cpus == (conf->sockets * conf->cores))
			only_one_thread_per_core = 1;
		else
			only_one_thread_per_core = 0;
	}

	/* If we are telling the system we only want to use 1 thread
	 * per core with the CPUs node option this is the easiest way
	 * to portray that to the affinity plugin. */
	if (only_one_thread_per_core)
		req->cpu_bind_type |= CPU_BIND_ONE_THREAD_PER_CORE;

	if (req->cpu_bind_type & bind_mode) {
		/* Explicit step binding specified by user */
		char *avail_mask = _alloc_mask(req,
					       &whole_nodes, &whole_sockets,
					       &whole_cores, &whole_threads,
					       &part_sockets, &part_cores);

		if ((whole_nodes == 0) && avail_mask &&
		    (req->job_core_spec == (uint16_t) NO_VAL)) {
			info("task/affinity: entire node must be allocated, "
			     "disabling affinity");
			xfree(req->cpu_bind);
			req->cpu_bind = avail_mask;
			req->cpu_bind_type &= (~bind_mode);
			req->cpu_bind_type |= CPU_BIND_MASK;
		} else {
			if (req->job_core_spec == (uint16_t) NO_VAL) {
				if (req->cpu_bind_type & CPU_BIND_MASK)
					_validate_mask(req, avail_mask);
				else if (req->cpu_bind_type & CPU_BIND_MAP)
					_validate_map(req, avail_mask);
			}
			xfree(avail_mask);
		}
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] manual binding: %s",
		     req->job_id, buf_type);
		return;
	}

	if (!(req->cpu_bind_type & bind_entity)) {
		/* No bind unit (sockets, cores) specified by user,
		 * pick something reasonable */
		uint32_t task_plugin_param = slurm_get_task_plugin_param();
		bool auto_def_set = false;
		int spec_thread_cnt = 0;
		int max_tasks = req->tasks_to_launch[(int)node_id] *
				req->cpus_per_task;
		char *avail_mask = _alloc_mask(req,
					       &whole_nodes, &whole_sockets,
					       &whole_cores, &whole_threads,
					       &part_sockets, &part_cores);

		debug("binding tasks:%d to "
		      "nodes:%d sockets:%d:%d cores:%d:%d threads:%d",
		      max_tasks, whole_nodes, whole_sockets, part_sockets,
		      whole_cores, part_cores, whole_threads);

		if ((req->job_core_spec != (uint16_t) NO_VAL) &&
		    (req->job_core_spec &  CORE_SPEC_THREAD) &&
		    (req->job_core_spec != CORE_SPEC_THREAD)) {
			spec_thread_cnt = req->job_core_spec &
					  (~CORE_SPEC_THREAD);
		}
		if (((max_tasks == whole_sockets) && (part_sockets == 0)) ||
		    (spec_thread_cnt &&
		     (max_tasks == (whole_sockets + part_sockets)))) {
			req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
			goto make_auto;
		}
		if (((max_tasks == whole_cores) && (part_cores == 0)) ||
		    (spec_thread_cnt &&
		     (max_tasks == (whole_cores + part_cores)))) {
			req->cpu_bind_type |= CPU_BIND_TO_CORES;
			goto make_auto;
		}
		if (max_tasks == whole_threads) {
			req->cpu_bind_type |= CPU_BIND_TO_THREADS;
			goto make_auto;
		}

		if (task_plugin_param & CPU_AUTO_BIND_TO_THREADS) {
			auto_def_set = true;
			req->cpu_bind_type |= CPU_BIND_TO_THREADS;
			goto make_auto;
		} else if (task_plugin_param & CPU_AUTO_BIND_TO_CORES) {
			auto_def_set = true;
			req->cpu_bind_type |= CPU_BIND_TO_CORES;
			goto make_auto;
		} else if (task_plugin_param & CPU_AUTO_BIND_TO_SOCKETS) {
			auto_def_set = true;
			req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
			goto make_auto;
		}

		if (avail_mask) {
			xfree(req->cpu_bind);
			req->cpu_bind = avail_mask;
			req->cpu_bind_type |= CPU_BIND_MASK;
		}
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] auto binding off: %s",
		     req->job_id, buf_type);
		return;

make_auto:
		xfree(avail_mask);
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] %s auto binding: "
		     "%s, dist %d",
		     req->job_id, (auto_def_set) ? "default" : "implicit",
		     buf_type, req->task_dist);
	} else {
		/* Explicit bind unit (sockets, cores) specified by user */
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] binding: %s, dist %d",
		     req->job_id, buf_type, req->task_dist);
	}

	switch (req->task_dist & SLURM_DIST_STATE_BASE) {
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_PLANE:
		/* tasks are distributed in blocks within a plane */
		rc = _task_layout_lllp_block(req, node_id, &masks);
		break;
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_UNKNOWN:
		if (slurm_get_select_type_param() &
		    CR_CORE_DEFAULT_DIST_BLOCK) {
			rc = _task_layout_lllp_block(req, node_id, &masks);
			break;
		}
		/* We want to fall through here if we aren't doing a
		   default dist block. */
	default:
		rc = _task_layout_lllp_cyclic(req, node_id, &masks);
		break;
	}

	/* FIXME: I'm worried about core_bitmap with CPU_BIND_TO_SOCKETS &
	 * max_cores - does select/cons_res plugin allocate whole
	 * socket??? Maybe not. Check srun man page. */

	if (rc == SLURM_SUCCESS) {
		_task_layout_display_masks(req, gtid, maxtasks, masks);
		/* translate abstract masks to actual hardware layout */
		_lllp_map_abstract_masks(maxtasks, masks);
		_task_layout_display_masks(req, gtid, maxtasks, masks);
#ifdef HAVE_NUMA
		if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
			_match_masks_to_ldom(maxtasks, masks);
			_task_layout_display_masks(req, gtid, maxtasks,
						   masks);
		}
#endif
		/* convert masks into cpu_bind mask string */
		_lllp_generate_cpu_bind(req, maxtasks, masks);
	} else {
		char *avail_mask = _alloc_mask(req,
					       &whole_nodes, &whole_sockets,
					       &whole_cores, &whole_threads,
					       &part_sockets, &part_cores);
		if (avail_mask) {
			xfree(req->cpu_bind);
			req->cpu_bind = avail_mask;
			req->cpu_bind_type &= (~bind_mode);
			req->cpu_bind_type |= CPU_BIND_MASK;
		}
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		error("lllp_distribution jobid [%u] overriding binding: %s",
		      req->job_id, buf_type);
		error("Verify socket/core/thread counts in configuration");
	}
	if (masks)
		_lllp_free_masks(maxtasks, masks);
}
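/*
 * A minimal standalone sketch (hypothetical harness, not Slurm API) of
 * the auto-binding decision above: when the task count exactly matches
 * the number of allocated sockets, cores, or threads (with no partially
 * allocated units), bind at that level; otherwise leave auto binding off.
 */
#include <stdio.h>

static const char *pick_bind_level(int max_tasks, int whole_sockets,
				   int part_sockets, int whole_cores,
				   int part_cores, int whole_threads)
{
	if ((max_tasks == whole_sockets) && (part_sockets == 0))
		return "sockets";
	if ((max_tasks == whole_cores) && (part_cores == 0))
		return "cores";
	if (max_tasks == whole_threads)
		return "threads";
	return "none";
}

int main(void)
{
	/* e.g. 2 sockets x 8 cores x 2 threads, fully allocated */
	printf("2 tasks  -> bind to %s\n",
	       pick_bind_level(2, 2, 0, 16, 0, 32));	/* sockets */
	printf("16 tasks -> bind to %s\n",
	       pick_bind_level(16, 2, 0, 16, 0, 32));	/* cores */
	printf("32 tasks -> bind to %s\n",
	       pick_bind_level(32, 2, 0, 16, 0, 32));	/* threads */
	return 0;
}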
/* user to have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC
	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;
#else
	char mstr[1 + CPU_SETSIZE / 4];
	cpu_bind_type_t bind_type;
	cpu_set_t ts;
	hwloc_obj_t obj;
	hwloc_obj_type_t socket_or_node;
	hwloc_topology_t topology;
	hwloc_bitmap_t cpuset;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;
	int bind_verbose = 0;
	int rc = SLURM_SUCCESS, match;
	pid_t pid = job->envtp->task_pid;
	size_t tssize;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus;
	int spec_threads = 0;

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);
	cpuset = hwloc_bitmap_alloc();

	if (job->batch) {
		jnpus = job->cpus;
		job->cpus_per_task = job->cpus;
	} else
		jnpus = jntasks * job->cpus_per_task;

	bind_type = job->cpu_bind_type;
	if ((conf->task_plugin_param & CPU_BIND_VERBOSE) ||
	    (bind_type & CPU_BIND_VERBOSE))
		bind_verbose = 1;

	if (hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
	    hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET)) {
		/* One socket contains multiple NUMA-nodes
		 * like AMD Opteron 6000 series etc.
		 * In such case, use NUMA-node instead of socket. */
		socket_or_node = HWLOC_OBJ_NODE;
	} else {
		socket_or_node = HWLOC_OBJ_SOCKET;
	}

	if (bind_type & CPU_BIND_NONE) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting no "
			     "affinity", taskid);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding", taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding", taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding", taskid);
		req_hwtype = socket_or_node;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding", taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else if (bind_type & CPU_BIND_TO_BOARDS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "board level binding", taskid);
		req_hwtype = HWLOC_OBJ_GROUP;
	} else if (bind_type & bind_mode_ldom) {
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (bind_verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default", taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/*
	 * Perform the topology detection. It will only get allowed PUs.
	 * Detect at the same time the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available, as with hyperthread support the
	 * ntasks-per-core param can let us have access to more threads per
	 * core for each task.
	 * Revert back to machine granularity if no finer-grained granularity
	 * matching the request is found. This will result in no affinity
	 * applied.
	 * The detected granularity will be used to find where to best place
	 * the task, then the cpu_bind option will be used to relax the
	 * affinity constraint and use more PUs. (i.e. use a core granularity
	 * to dispatch the tasks across the sockets and then provide access
	 * to each task to the cores of its socket.)
	 */
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       socket_or_node);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_NODE);
	//info("PU:%d CORE:%d SOCK:%d LDOM:%d", npus, ncores, nsockets, nldoms);

	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if ((job->job_core_spec != (uint16_t) NO_VAL) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD) &&
	    (job->job_core_spec != CORE_SPEC_THREAD)) {
		spec_threads = job->job_core_spec & (~CORE_SPEC_THREAD);
	}
	if (npus >= (jnpus + spec_threads) ||
	    bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks && bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = socket_or_node;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources). So there is no guarantee that each ldom will be
	 * populated with usable sockets. So add a simple check to at least
	 * ensure that we have as many sockets as ldoms before moving to
	 * ldom granularity.
	 */
	if (nldoms >= jntasks &&
	    nsockets >= nldoms &&
	    bind_type & (CPU_BIND_TO_LDOMS | bind_mode_ldom)) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * If not enough objects to do the job, revert to no affinity mode
	 */
	if (hwloc_compare_types(hwtype, HWLOC_OBJ_MACHINE) == 0) {
		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity", taskid, hwloc_obj_type_string(hwtype));
	} else if ((hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) &&
		   (nobj < jnpus)) {
		info("task/cgroup: task[%u] not enough %s objects (%d < %d), "
		     "disabling affinity",
		     taskid, hwloc_obj_type_string(hwtype), nobj, jnpus);
	} else if (bind_type & bind_mode) {
		/* Explicit binding mode specified by the user
		 * Bind the taskid in accordance with the specified mode */
		obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_MACHINE, 0);
		match = hwloc_bitmap_isequal(obj->complete_cpuset,
					     obj->allowed_cpuset);
		if ((job->job_core_spec == (uint16_t) NO_VAL) && !match) {
			info("task/cgroup: entire node must be allocated, "
			     "disabling affinity, task[%u]", taskid);
			fprintf(stderr, "Requested cpu_bind option requires "
				"entire node to be allocated; disabling "
				"affinity\n");
		} else {
			if (bind_verbose) {
				info("task/cgroup: task[%u] is requesting "
				     "explicit binding mode", taskid);
			}
			_get_sched_cpuset(topology, hwtype, req_hwtype, &ts,
					  job);
			tssize = sizeof(cpu_set_t);
			fstatus = SLURM_SUCCESS;
			if (job->job_core_spec != (uint16_t) NO_VAL)
				_validate_mask(taskid, obj, &ts);
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "mask 0x%s",
				      taskid, cpuset_to_str(&ts, mstr));
				error("sched_setaffinity rc = %d", rc);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] mask 0x%s",
				     taskid, cpuset_to_str(&ts, mstr));
			}
			_slurm_chkaffinity(&ts, job, rc);
		}
	} else {
		/* Bind the detected object to the taskid, respecting the
		 * granularity, using the designated or default distribution
		 * method (block or cyclic). */
		char *str;

		if (bind_verbose) {
			info("task/cgroup: task[%u] using %s granularity "
			     "dist %u",
			     taskid, hwloc_obj_type_string(hwtype),
			     job->task_dist);
		}

		/* See srun man page for detailed information on
		 * --distribution option.
		 *
		 * You can see the equivalent code for the task/affinity
		 * plugin in src/plugins/task/affinity/dist_tasks.c,
		 * around line 368. */
		switch (job->task_dist & SLURM_DIST_NODESOCKMASK) {
		case SLURM_DIST_BLOCK_BLOCK:
		case SLURM_DIST_CYCLIC_BLOCK:
		case SLURM_DIST_PLANE:
			/* tasks are distributed in blocks within a plane */
			_task_cgroup_cpuset_dist_block(topology, hwtype,
						       req_hwtype, nobj, job,
						       bind_verbose, cpuset);
			break;
		case SLURM_DIST_ARBITRARY:
		case SLURM_DIST_BLOCK:
		case SLURM_DIST_CYCLIC:
		case SLURM_DIST_UNKNOWN:
			if (slurm_get_select_type_param() &
			    CR_CORE_DEFAULT_DIST_BLOCK) {
				_task_cgroup_cpuset_dist_block(
					topology, hwtype, req_hwtype, nobj,
					job, bind_verbose, cpuset);
				break;
			}
			/* We want to fall through here if we aren't doing a
			   default dist block. */
		default:
			_task_cgroup_cpuset_dist_cyclic(topology, hwtype,
							req_hwtype, job,
							bind_verbose, cpuset);
			break;
		}

		hwloc_bitmap_asprintf(&str, cpuset);
		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset,
							 &ts, tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'", taskid, str);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] set taskset '%s'",
				     taskid, str);
			}
			_slurm_chkaffinity(&ts, job, rc);
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'", taskid, str);
			fstatus = SLURM_ERROR;
		}
		free(str);
	}

	/* Destroy hwloc objects */
	hwloc_bitmap_free(cpuset);
	hwloc_topology_destroy(topology);

	return fstatus;
#endif
}
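/*
 * A self-contained sketch of the topology counting used by the
 * granularity detection above. It assumes hwloc v1.x object names
 * (HWLOC_OBJ_SOCKET / HWLOC_OBJ_NODE, as in the function above; hwloc
 * 2.x renamed these to PACKAGE / NUMANODE). Build with:
 * gcc demo.c -lhwloc
 */
#include <stdio.h>
#include <hwloc.h>

int main(void)
{
	hwloc_topology_t topology;

	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);

	printf("PUs:     %d\n",
	       hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU));
	printf("cores:   %d\n",
	       hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_CORE));
	printf("sockets: %d\n",
	       hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_SOCKET));
	printf("ldoms:   %d\n",
	       hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_NODE));

	hwloc_topology_destroy(topology);
	return 0;
}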
/* Get this plugin's sequence number in Slurm's internal tables */
extern int select_get_plugin_id_pos(uint32_t plugin_id)
{
	int i;
	static bool cray_other_cons_res = false;

	if (slurm_select_init(0) < 0)
		return SLURM_ERROR;

again:
	for (i = 0; i < select_context_cnt; i++) {
		if (*(ops[i].plugin_id) == plugin_id)
			break;
	}
	if (i >= select_context_cnt) {
		/*
		 * Put on the extra Cray select plugins that do not get
		 * generated automatically.
		 */
		if (!cray_other_cons_res &&
		    ((plugin_id == SELECT_PLUGIN_CRAY_CONS_RES)  ||
		     (plugin_id == SELECT_PLUGIN_CRAY_CONS_TRES) ||
		     (plugin_id == SELECT_PLUGIN_CRAY_LINEAR))) {
			char *type = "select", *name = "select/cray";
			uint16_t save_params = slurm_get_select_type_param();
			uint16_t params[2];
			int cray_plugin_id[2], cray_offset;

			cray_other_cons_res = true;

			if (plugin_id == SELECT_PLUGIN_CRAY_LINEAR) {
				params[0] = save_params & ~CR_OTHER_CONS_RES;
				cray_plugin_id[0] =
					SELECT_PLUGIN_CRAY_CONS_RES;
				params[1] = save_params & ~CR_OTHER_CONS_TRES;
				cray_plugin_id[1] =
					SELECT_PLUGIN_CRAY_CONS_TRES;
			} else if (plugin_id == SELECT_PLUGIN_CRAY_CONS_RES) {
				params[0] = save_params | CR_OTHER_CONS_RES;
				cray_plugin_id[0] = SELECT_PLUGIN_CRAY_LINEAR;
				params[1] = save_params & ~CR_OTHER_CONS_RES;
				cray_plugin_id[1] =
					SELECT_PLUGIN_CRAY_CONS_TRES;
			} else {	/* SELECT_PLUGIN_CRAY_CONS_TRES */
				params[0] = save_params | CR_OTHER_CONS_TRES;
				cray_plugin_id[0] = SELECT_PLUGIN_CRAY_LINEAR;
				params[1] = save_params & ~CR_OTHER_CONS_RES;
				cray_plugin_id[1] =
					SELECT_PLUGIN_CRAY_CONS_RES;
			}

			for (cray_offset = 0; cray_offset < 2;
			     cray_offset++) {
				for (i = 0; i < select_context_cnt; i++) {
					if (*(ops[i].plugin_id) ==
					    cray_plugin_id[cray_offset])
						break;
				}
				if (i < select_context_cnt)
					break;	/* Found match */
			}
			if (i >= select_context_cnt)
				goto end_it;	/* No match */

			slurm_mutex_lock(&select_context_lock);
			slurm_set_select_type_param(params[cray_offset]);
			plugin_context_destroy(select_context[i]);
			select_context[i] =
				plugin_context_create(type, name,
						      (void **)&ops[i],
						      node_select_syms,
						      sizeof(node_select_syms));
			slurm_set_select_type_param(save_params);
			slurm_mutex_unlock(&select_context_lock);
			goto again;
		}
end_it:
		return SLURM_ERROR;
	}
	return i;
}
/*
 * Initialize context for node selection plugin
 */
extern int slurm_select_init(bool only_default)
{
	int retval = SLURM_SUCCESS;
	char *select_type = NULL;
	int i, j, plugin_cnt;
	char *plugin_type = "select";
	List plugin_names = NULL;
	_plugin_args_t plugin_args = {0};

	if (init_run && select_context)
		return retval;

	slurm_mutex_lock(&select_context_lock);
	if (select_context)
		goto done;

	select_type = slurm_get_select_type();
	if (working_cluster_rec) {
		/* just ignore warnings here */
	} else {
#ifdef HAVE_NATIVE_CRAY
		if (xstrcasecmp(select_type, "select/cray")) {
			error("%s is incompatible with a native Cray system.",
			      select_type);
			fatal("Use SelectType=select/cray");
		}
#else
		/* if (!xstrcasecmp(select_type, "select/cray")) { */
		/*	fatal("Requested SelectType=select/cray " */
		/*	      "in slurm.conf, but not running on a native " */
		/*	      "Cray system. If looking to run on a Cray " */
		/*	      "system natively use --enable-native-cray."); */
		/* } */
#endif
	}

	select_context_cnt = 0;

	plugin_args.plugin_type    = plugin_type;
	plugin_args.default_plugin = select_type;

	if (only_default) {
		plugin_names = list_create(slurm_destroy_char);
		list_append(plugin_names, xstrdup(select_type));
	} else {
		plugin_names = plugin_get_plugins_of_type(plugin_type);
	}
	if (plugin_names && (plugin_cnt = list_count(plugin_names))) {
		ops = xcalloc(plugin_cnt, sizeof(slurm_select_ops_t));
		select_context = xcalloc(plugin_cnt,
					 sizeof(plugin_context_t *));
		list_for_each(plugin_names, _load_plugins, &plugin_args);
	}

	if (select_context_default == -1)
		fatal("Can't find plugin for %s", select_type);

	/* Ensure that plugin_id is valid and unique */
	for (i = 0; i < select_context_cnt; i++) {
		for (j = i + 1; j < select_context_cnt; j++) {
			if (*(ops[i].plugin_id) != *(ops[j].plugin_id))
				continue;
			fatal("SelectPlugins: Duplicate plugin_id %u for "
			      "%s and %s",
			      *(ops[i].plugin_id),
			      select_context[i]->type,
			      select_context[j]->type);
		}
		if (*(ops[i].plugin_id) < 100) {
			fatal("SelectPlugins: Invalid plugin_id %u (<100) %s",
			      *(ops[i].plugin_id),
			      select_context[i]->type);
		}
	}
	init_run = true;

done:
	slurm_mutex_unlock(&select_context_lock);

	if (!working_cluster_rec) {
		if (select_running_linear_based()) {
			uint16_t cr_type = slurm_get_select_type_param();
			if (cr_type & (CR_CPU | CR_CORE | CR_SOCKET)) {
				fatal("Invalid SelectTypeParameters for "
				      "%s: %s (%u), it can't contain "
				      "CR_(CPU|CORE|SOCKET).",
				      select_type,
				      select_type_param_string(cr_type),
				      cr_type);
			}
		}
	}

	xfree(select_type);
	FREE_NULL_LIST(plugin_names);

	return retval;
}
static int _task_layout_block(slurm_step_layout_t *step_layout,
			      uint16_t *cpus)
{
	static uint16_t select_params = NO_VAL16;
	int i, j, task_id = 0;
	bool pack_nodes;

	if (select_params == NO_VAL16)
		select_params = slurm_get_select_type_param();

	if (step_layout->task_dist & SLURM_DIST_PACK_NODES)
		pack_nodes = true;
	else if (step_layout->task_dist & SLURM_DIST_NO_PACK_NODES)
		pack_nodes = false;
	else if (select_params & CR_PACK_NODES)
		pack_nodes = true;
	else
		pack_nodes = false;

	if (pack_nodes) {
		/* Pass 1: Put one task on each node */
		for (i = 0;
		     ((i < step_layout->node_cnt) &&
		      (task_id < step_layout->task_cnt)); i++) {
			/* cpus has already been altered for cpus_per_task */
			if (step_layout->tasks[i] < cpus[i]) {
				step_layout->tasks[i]++;
				task_id++;
			}
		}
		/* Pass 2: Fill remaining CPUs on a node-by-node basis */
		for (i = 0;
		     ((i < step_layout->node_cnt) &&
		      (task_id < step_layout->task_cnt)); i++) {
			/* cpus has already been altered for cpus_per_task */
			while ((step_layout->tasks[i] < cpus[i]) &&
			       (task_id < step_layout->task_cnt)) {
				step_layout->tasks[i]++;
				task_id++;
			}
		}
		/* Pass 3: Spread remaining tasks across all nodes */
		while (task_id < step_layout->task_cnt) {
			for (i = 0;
			     ((i < step_layout->node_cnt) &&
			      (task_id < step_layout->task_cnt)); i++) {
				step_layout->tasks[i]++;
				task_id++;
			}
		}
	} else {
		/* To effectively deal with heterogeneous nodes, we fake a
		 * cyclic distribution to determine how many tasks go on each
		 * node and then make those assignments in a block fashion. */
		bool over_subscribe = false;
		for (j = 0; task_id < step_layout->task_cnt; j++) {
			bool space_remaining = false;
			for (i = 0;
			     ((i < step_layout->node_cnt) &&
			      (task_id < step_layout->task_cnt)); i++) {
				if ((j < cpus[i]) || over_subscribe) {
					step_layout->tasks[i]++;
					task_id++;
					if ((j + 1) < cpus[i])
						space_remaining = true;
				}
			}
			if (!space_remaining)
				over_subscribe = true;
		}
	}

	/* Now distribute the tasks */
	task_id = 0;
	for (i = 0; i < step_layout->node_cnt; i++) {
		step_layout->tids[i] = xmalloc(sizeof(uint32_t) *
					       step_layout->tasks[i]);
		for (j = 0; j < step_layout->tasks[i]; j++) {
			step_layout->tids[i][j] = task_id;
			task_id++;
		}
	}
	return SLURM_SUCCESS;
}
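/*
 * A standalone sketch (hypothetical harness, not Slurm API) of the
 * per-node task counting above, for 6 tasks on three heterogeneous
 * nodes with 4, 2 and 8 CPUs. Packing fills nodes left to right; the
 * non-packing path fakes cyclic passes to balance the per-node counts.
 */
#include <stdio.h>
#include <stdbool.h>

static void layout(int node_cnt, const int *cpus, int task_cnt,
		   bool pack_nodes, int *tasks)
{
	int i, j, task_id = 0;

	for (i = 0; i < node_cnt; i++)
		tasks[i] = 0;
	if (pack_nodes) {
		/* one task per node, then fill node by node, then spread */
		for (i = 0; (i < node_cnt) && (task_id < task_cnt); i++)
			if (tasks[i] < cpus[i]) { tasks[i]++; task_id++; }
		for (i = 0; (i < node_cnt) && (task_id < task_cnt); i++)
			while ((tasks[i] < cpus[i]) &&
			       (task_id < task_cnt)) {
				tasks[i]++; task_id++;
			}
		while (task_id < task_cnt)
			for (i = 0; (i < node_cnt) && (task_id < task_cnt);
			     i++) { tasks[i]++; task_id++; }
	} else {
		/* fake cyclic passes to size each node's block */
		bool over_subscribe = false;
		for (j = 0; task_id < task_cnt; j++) {
			bool space_remaining = false;
			for (i = 0; (i < node_cnt) && (task_id < task_cnt);
			     i++) {
				if ((j < cpus[i]) || over_subscribe) {
					tasks[i]++; task_id++;
					if ((j + 1) < cpus[i])
						space_remaining = true;
				}
			}
			if (!space_remaining)
				over_subscribe = true;
		}
	}
}

int main(void)
{
	int cpus[3] = { 4, 2, 8 }, tasks[3];

	layout(3, cpus, 6, true, tasks);
	printf("pack:    %d %d %d\n", tasks[0], tasks[1], tasks[2]); /* 4 1 1 */
	layout(3, cpus, 6, false, tasks);
	printf("no pack: %d %d %d\n", tasks[0], tasks[1], tasks[2]); /* 2 2 2 */
	return 0;
}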
/*
 * scontrol_parse_res_options   parse options for creating or updating a
 *	reservation
 * IN argc - count of arguments
 * IN argv - list of arguments
 * IN msg  - a string to append to any error message
 * OUT resv_msg_ptr - struct holding reservation parameters
 * OUT free_user_str - bool indicating that resv_msg_ptr->users should be
 *	freed
 * OUT free_acct_str - bool indicating that resv_msg_ptr->accounts should
 *	be freed
 * RET 0 on success, -1 on error and prints message
 */
extern int scontrol_parse_res_options(int argc, char *argv[],
				      const char *msg,
				      resv_desc_msg_t *resv_msg_ptr,
				      int *free_user_str,
				      int *free_acct_str)
{
	int i;
	int duration = -3;	/* -1 == INFINITE, -2 == error, -3 == not set */

	*free_user_str = 0;
	*free_acct_str = 0;

	for (i = 0; i < argc; i++) {
		char *tag = argv[i];
		int taglen = 0;
		char plus_minus = '\0';
		char *val = strchr(argv[i], '=');

		taglen = val - argv[i];
		if (!val && strncasecmp(argv[i], "res", 3) == 0) {
			continue;
		} else if (!val || taglen == 0) {
			exit_code = 1;
			error("Unknown parameter %s. %s", argv[i], msg);
			return -1;
		}
		if (val[-1] == '+' || val[-1] == '-') {
			plus_minus = val[-1];
			taglen--;
		}
		val++;

		if (strncasecmp(tag, "ReservationName", MAX(taglen, 1))
		    == 0) {
			resv_msg_ptr->name = val;
		} else if (strncasecmp(tag, "Accounts", MAX(taglen, 1))
			   == 0) {
			if (plus_minus) {
				resv_msg_ptr->accounts =
					_process_plus_minus(plus_minus, val);
				*free_acct_str = 1;
			} else {
				resv_msg_ptr->accounts = val;
			}
		} else if (strncasecmp(tag, "BurstBuffer", MAX(taglen, 2))
			   == 0) {
			resv_msg_ptr->burst_buffer = val;
		} else if (strncasecmp(tag, "StartTime", MAX(taglen, 1))
			   == 0) {
			time_t t = parse_time(val, 0);
			if (errno == ESLURM_INVALID_TIME_VALUE) {
				exit_code = 1;
				error("Invalid start time %s. %s",
				      argv[i], msg);
				return -1;
			}
			resv_msg_ptr->start_time = t;
		} else if (strncasecmp(tag, "EndTime", MAX(taglen, 1)) == 0) {
			time_t t = parse_time(val, 0);
			if (errno == ESLURM_INVALID_TIME_VALUE) {
				exit_code = 1;
				error("Invalid end time %s. %s",
				      argv[i], msg);
				return -1;
			}
			resv_msg_ptr->end_time = t;
		} else if (strncasecmp(tag, "Duration", MAX(taglen, 1))
			   == 0) {
			/* -1 == INFINITE, -2 == error, -3 == not set */
			duration = time_str2mins(val);
			if (duration < 0 && duration != INFINITE) {
				exit_code = 1;
				error("Invalid duration %s. %s",
				      argv[i], msg);
				return -1;
			}
			resv_msg_ptr->duration = (uint32_t)duration;
		} else if (strncasecmp(tag, "Flags", MAX(taglen, 2)) == 0) {
			uint32_t f;
			if (plus_minus) {
				char *tmp =
					_process_plus_minus(plus_minus, val);
				f = parse_resv_flags(tmp, msg);
				xfree(tmp);
			} else {
				f = parse_resv_flags(val, msg);
			}
			if (f == 0xffffffff)
				return -1;
			else
				resv_msg_ptr->flags = f;
		} else if (strncasecmp(tag, "NodeCnt", MAX(taglen, 5)) == 0 ||
			   strncasecmp(tag, "NodeCount", MAX(taglen, 5))
			   == 0) {
			char *endptr = NULL, *node_cnt, *tok, *ptrptr = NULL;
			int node_inx = 0;

			node_cnt = xstrdup(val);
			tok = strtok_r(node_cnt, ",", &ptrptr);
			while (tok) {
				xrealloc(resv_msg_ptr->node_cnt,
					 sizeof(uint32_t) * (node_inx + 2));
				resv_msg_ptr->node_cnt[node_inx] =
					strtol(tok, &endptr, 10);
				if ((endptr != NULL) &&
				    ((endptr[0] == 'k') ||
				     (endptr[0] == 'K'))) {
					resv_msg_ptr->node_cnt[node_inx] *=
						1024;
				} else if ((endptr != NULL) &&
					   ((endptr[0] == 'm') ||
					    (endptr[0] == 'M'))) {
					resv_msg_ptr->node_cnt[node_inx] *=
						1024 * 1024;
				} else if ((endptr == NULL) ||
					   (endptr[0] != '\0') ||
					   (tok[0] == '\0')) {
					exit_code = 1;
					error("Invalid node count %s. %s",
					      argv[i], msg);
					xfree(node_cnt);
					return -1;
				}
				node_inx++;
				tok = strtok_r(NULL, ",", &ptrptr);
			}
			xfree(node_cnt);
		} else if (strncasecmp(tag, "CoreCnt",   MAX(taglen, 5)) == 0 ||
			   strncasecmp(tag, "CoreCount", MAX(taglen, 5)) == 0 ||
			   strncasecmp(tag, "CPUCnt",    MAX(taglen, 5)) == 0 ||
			   strncasecmp(tag, "CPUCount",  MAX(taglen, 5)) == 0) {
			char *endptr = NULL, *core_cnt, *tok, *ptrptr = NULL;
			char *type;
			int node_inx = 0;

			type = slurm_get_select_type();
			if (strcasestr(type, "cray")) {
				int param;
				param = slurm_get_select_type_param();
				if (! (param & CR_OTHER_CONS_RES)) {
					error("CoreCnt or CPUCnt is only "
					      "supported when "
					      "SelectTypeParameters "
					      "includes OTHER_CONS_RES");
					xfree(type);
					return -1;
				}
			} else if (strcasestr(type, "cons_res") == NULL) {
				error("CoreCnt or CPUCnt is only "
				      "supported when "
				      "SelectType includes "
				      "select/cons_res");
				xfree(type);
				return -1;
			}
			xfree(type);

			core_cnt = xstrdup(val);
			tok = strtok_r(core_cnt, ",", &ptrptr);
			while (tok) {
				xrealloc(resv_msg_ptr->core_cnt,
					 sizeof(uint32_t) * (node_inx + 2));
				resv_msg_ptr->core_cnt[node_inx] =
					strtol(tok, &endptr, 10);
				if ((endptr == NULL) ||
				    (endptr[0] != '\0') ||
				    (tok[0] == '\0')) {
					exit_code = 1;
					error("Invalid core count %s. %s",
					      argv[i], msg);
					xfree(core_cnt);
					return -1;
				}
				node_inx++;
				tok = strtok_r(NULL, ",", &ptrptr);
			}
			xfree(core_cnt);
		} else if (strncasecmp(tag, "Nodes", MAX(taglen, 5)) == 0) {
			resv_msg_ptr->node_list = val;
		} else if (strncasecmp(tag, "Features", MAX(taglen, 2))
			   == 0) {
			resv_msg_ptr->features = val;
		} else if (strncasecmp(tag, "Licenses", MAX(taglen, 2))
			   == 0) {
			resv_msg_ptr->licenses = val;
		} else if (strncasecmp(tag, "PartitionName", MAX(taglen, 1))
			   == 0) {
			resv_msg_ptr->partition = val;
		} else if (strncasecmp(tag, "Users", MAX(taglen, 1)) == 0) {
			if (plus_minus) {
				resv_msg_ptr->users =
					_process_plus_minus(plus_minus, val);
				*free_user_str = 1;
			} else {
				resv_msg_ptr->users = val;
			}
		} else if (strncasecmp(tag, "Watts", MAX(taglen, 1)) == 0) {
			if (parse_uint32(val, &(resv_msg_ptr->resv_watts))) {
				error("Invalid Watts value: %s", val);
				return -1;
			}
		} else if (strncasecmp(tag, "res", 3) == 0) {
			continue;
		} else {
			exit_code = 1;
			error("Unknown parameter %s. %s", argv[i], msg);
			return -1;
		}
	}
	return 0;
}
/*
 * lllp_distribution
 *
 * Note: lllp stands for Lowest Level of Logical Processors.
 *
 * When automatic binding is enabled:
 *      - no binding flags set >= CPU_BIND_NONE, and
 *      - an auto binding level selected CPU_BIND_TO_{SOCKETS,CORES,THREADS}
 * Otherwise limit job step to the allocated CPUs
 *
 * generate the appropriate cpu_bind type and string which results in
 * the specified lllp distribution.
 *
 * IN/OUT- job launch request (cpu_bind_type and cpu_bind updated)
 * IN- global task id array
 */
void lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id)
{
	int rc = SLURM_SUCCESS;
	bitstr_t **masks = NULL;
	char buf_type[100];
	int maxtasks = req->tasks_to_launch[(int)node_id];
	int whole_nodes, whole_sockets, whole_cores, whole_threads;
	int part_sockets, part_cores;
	const uint32_t *gtid = req->global_task_ids[(int)node_id];
	static uint16_t bind_entity = CPU_BIND_TO_THREADS | CPU_BIND_TO_CORES |
				      CPU_BIND_TO_SOCKETS | CPU_BIND_TO_LDOMS;
	static uint16_t bind_mode = CPU_BIND_NONE | CPU_BIND_MASK |
				    CPU_BIND_RANK | CPU_BIND_MAP |
				    CPU_BIND_LDMASK | CPU_BIND_LDRANK |
				    CPU_BIND_LDMAP;

	if (req->cpu_bind_type & bind_mode) {
		/* Explicit step binding specified by user */
		char *avail_mask = _alloc_mask(req,
					       &whole_nodes, &whole_sockets,
					       &whole_cores, &whole_threads,
					       &part_sockets, &part_cores);

		if ((whole_nodes == 0) && avail_mask &&
		    (req->job_core_spec == 0)) {
			info("task/affinity: entire node must be allocated, "
			     "disabling affinity");
			xfree(req->cpu_bind);
			req->cpu_bind = avail_mask;
			req->cpu_bind_type &= (~bind_mode);
			req->cpu_bind_type |= CPU_BIND_MASK;
		} else {
			if (req->job_core_spec) {
				if (req->cpu_bind_type & CPU_BIND_MASK)
					_validate_mask(req, avail_mask);
				else if (req->cpu_bind_type & CPU_BIND_MAP)
					_validate_map(req, avail_mask);
			}
			xfree(avail_mask);
		}
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] manual binding: %s",
		     req->job_id, buf_type);
		return;
	}

	if (!(req->cpu_bind_type & bind_entity)) {
		/* No bind unit (sockets, cores) specified by user,
		 * pick something reasonable */
		int max_tasks = req->tasks_to_launch[(int)node_id];
		char *avail_mask = _alloc_mask(req,
					       &whole_nodes, &whole_sockets,
					       &whole_cores, &whole_threads,
					       &part_sockets, &part_cores);

		debug("binding tasks:%d to "
		      "nodes:%d sockets:%d:%d cores:%d:%d threads:%d",
		      max_tasks, whole_nodes, whole_sockets, part_sockets,
		      whole_cores, part_cores, whole_threads);

		if ((max_tasks == whole_sockets) && (part_sockets == 0)) {
			req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
			goto make_auto;
		}
		if ((max_tasks == whole_cores) && (part_cores == 0)) {
			req->cpu_bind_type |= CPU_BIND_TO_CORES;
			goto make_auto;
		}
		if (max_tasks == whole_threads) {
			req->cpu_bind_type |= CPU_BIND_TO_THREADS;
			goto make_auto;
		}
		if (avail_mask) {
			xfree(req->cpu_bind);
			req->cpu_bind = avail_mask;
			req->cpu_bind_type |= CPU_BIND_MASK;
		}
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] auto binding off: %s",
		     req->job_id, buf_type);
		return;

make_auto:
		xfree(avail_mask);
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] implicit auto binding: "
		     "%s, dist %d", req->job_id, buf_type, req->task_dist);
	} else {
		/* Explicit bind unit (sockets, cores) specified by user */
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] binding: %s, dist %d",
		     req->job_id, buf_type, req->task_dist);
	}

	switch (req->task_dist) {
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_PLANE:
		/* tasks are distributed in blocks within a plane */
		rc = _task_layout_lllp_block(req, node_id, &masks);
		break;
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_UNKNOWN:
		if (slurm_get_select_type_param() &
		    CR_CORE_DEFAULT_DIST_BLOCK) {
			rc = _task_layout_lllp_block(req, node_id, &masks);
			break;
		}
		/* We want to fall through here if we aren't doing a
		   default dist block. */
	default:
		rc = _task_layout_lllp_cyclic(req, node_id, &masks);
		break;
	}

	/* FIXME: I'm worried about core_bitmap with CPU_BIND_TO_SOCKETS &
	 * max_cores - does select/cons_res plugin allocate whole
	 * socket??? Maybe not. Check srun man page. */

	if (rc == SLURM_SUCCESS) {
		_task_layout_display_masks(req, gtid, maxtasks, masks);
		/* translate abstract masks to actual hardware layout */
		_lllp_map_abstract_masks(maxtasks, masks);
		_task_layout_display_masks(req, gtid, maxtasks, masks);
#ifdef HAVE_NUMA
		if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
			_match_masks_to_ldom(maxtasks, masks);
			_task_layout_display_masks(req, gtid, maxtasks,
						   masks);
		}
#endif
		/* convert masks into cpu_bind mask string */
		_lllp_generate_cpu_bind(req, maxtasks, masks);
	} else {
		char *avail_mask = _alloc_mask(req,
					       &whole_nodes, &whole_sockets,
					       &whole_cores, &whole_threads,
					       &part_sockets, &part_cores);
		if (avail_mask) {
			xfree(req->cpu_bind);
			req->cpu_bind = avail_mask;
			req->cpu_bind_type &= (~bind_mode);
			req->cpu_bind_type |= CPU_BIND_MASK;
		}
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		error("lllp_distribution jobid [%u] overriding binding: %s",
		      req->job_id, buf_type);
		error("Verify socket/core/thread counts in configuration");
	}
	if (masks)
		_lllp_free_masks(maxtasks, masks);
}