/* Determine which CPUs a job step can use.
 * OUT whole_<entity>_count - returns count of whole <entities> in this
 *                            allocation for this node
 * OUT part_<entity>_count  - returns count of partial <entities> in this
 *                            allocation for this node
 * RET - a string representation of the available mask or NULL on error
 * NOTE: Caller must xfree() the return value. */
static char *_alloc_mask(launch_tasks_request_msg_t *req,
			 int *whole_node_cnt,  int *whole_socket_cnt,
			 int *whole_core_cnt,  int *whole_thread_cnt,
			 int *part_socket_cnt, int *part_core_cnt)
{
	uint16_t sockets, cores, threads;
	int c, s, t, i;
	int c_miss, s_miss, t_miss, c_hit, t_hit;
	bitstr_t *alloc_bitmap;
	char *str_mask;
	bitstr_t *alloc_mask;

	*whole_node_cnt   = 0;
	*whole_socket_cnt = 0;
	*whole_core_cnt   = 0;
	*whole_thread_cnt = 0;
	*part_socket_cnt  = 0;
	*part_core_cnt    = 0;

	alloc_bitmap = _get_avail_map(req, &sockets, &cores, &threads);
	if (!alloc_bitmap)
		return NULL;

	alloc_mask = bit_alloc(bit_size(alloc_bitmap));

	i = 0;
	for (s = 0, s_miss = false; s < sockets; s++) {
		for (c = 0, c_hit = c_miss = false; c < cores; c++) {
			for (t = 0, t_hit = t_miss = false; t < threads; t++) {
				/* If we are pretending we have a larger
				 * system than we really have, this is needed
				 * to make sure we don't bust the bank. */
				if (i >= bit_size(alloc_bitmap))
					i = 0;
				if (bit_test(alloc_bitmap, i)) {
					bit_set(alloc_mask, i);
					(*whole_thread_cnt)++;
					t_hit = true;
					c_hit = true;
				} else
					t_miss = true;
				i++;
			}
			if (!t_miss)
				(*whole_core_cnt)++;
			else {
				if (t_hit)
					(*part_core_cnt)++;
				c_miss = true;
			}
		}
		if (!c_miss)
			(*whole_socket_cnt)++;
		else {
			if (c_hit)
				(*part_socket_cnt)++;
			s_miss = true;
		}
	}
	if (!s_miss)
		(*whole_node_cnt)++;
	FREE_NULL_BITMAP(alloc_bitmap);

	if ((req->job_core_spec != (uint16_t) NO_VAL) &&
	    (req->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (req->job_core_spec != CORE_SPEC_THREAD)) {
		int spec_thread_cnt;
		spec_thread_cnt = req->job_core_spec & (~CORE_SPEC_THREAD);
		for (t = threads - 1;
		     ((t > 0) && (spec_thread_cnt > 0)); t--) {
			for (c = cores - 1;
			     ((c > 0) && (spec_thread_cnt > 0)); c--) {
				for (s = sockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0));
				     s--) {
					i = s * cores + c;
					i = (i * threads) + t;
					bit_clear(alloc_mask, i);
					spec_thread_cnt--;
				}
			}
		}
	}

	/* translate abstract masks to actual hardware layout */
	_lllp_map_abstract_masks(1, &alloc_mask);

#ifdef HAVE_NUMA
	if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
		_match_masks_to_ldom(1, &alloc_mask);
	}
#endif

	str_mask = bit_fmt_hexmask(alloc_mask);
	FREE_NULL_BITMAP(alloc_mask);
	return str_mask;
}
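/* A minimal sketch (not part of Slurm) of the abstract index math used by
 * _alloc_mask() above; the helper name and topology numbers are illustrative
 * assumptions. Threads vary fastest, then cores, then sockets, which is why
 * the core-spec stripping loop computes i = (s * cores + c) * threads + t. */
static inline int _abstract_bit_index(int s, int c, int t,
				      uint16_t cores, uint16_t threads)
{
	/* e.g. sockets=2, cores=4, threads=2: CPU (s=1, c=2, t=1) lands at
	 * bit (1*4 + 2)*2 + 1 = 13; the highest-numbered thread,
	 * (s=1, c=3, t=1), lands at bit 15 and would be cleared first when
	 * stripping specialized threads. */
	return (s * cores + c) * threads + t;
}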
/*
 * batch_bind - Set the batch request message so as to bind the shell to the
 *	proper resources
 */
void batch_bind(batch_job_launch_msg_t *req)
{
	bitstr_t *req_map, *hw_map;
	slurm_cred_arg_t arg;
	uint16_t sockets = 0, cores = 0, num_cpus;
	int start, task_cnt = 0;

	if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) {
		error("task/affinity: job lacks a credential");
		return;
	}
	start = _get_local_node_info(&arg, 0, &sockets, &cores);
	if (start != 0) {
		error("task/affinity: missing node 0 in job credential");
		slurm_cred_free_args(&arg);
		return;
	}
	if ((sockets * cores) == 0) {
		error("task/affinity: socket and core count both zero");
		slurm_cred_free_args(&arg);
		return;
	}

	num_cpus = MIN((sockets * cores),
		       (conf->sockets * conf->cores));
	req_map = (bitstr_t *) bit_alloc(num_cpus);
	hw_map  = (bitstr_t *) bit_alloc(conf->block_map_size);

#ifdef HAVE_FRONT_END
{
	/* Since the front-end nodes are a shared resource, we limit each job
	 * to one CPU based upon monotonically increasing sequence number */
	static int last_id = 0;
	bit_set(hw_map, ((last_id++) % conf->block_map_size));
	task_cnt = 1;
}
#else
{
	char *str;
	int t, p;

	/* Transfer core_bitmap data to local req_map.
	 * The MOD function handles the case where fewer processes
	 * physically exist than are configured (slurmd is out of
	 * sync with the slurmctld daemon). */
	for (p = 0; p < (sockets * cores); p++) {
		if (bit_test(arg.job_core_bitmap, p))
			bit_set(req_map, (p % num_cpus));
	}

	str = (char *)bit_fmt_hexmask(req_map);
	debug3("task/affinity: job %u core mask from slurmctld: %s",
	       req->job_id, str);
	xfree(str);

	for (p = 0; p < num_cpus; p++) {
		if (bit_test(req_map, p) == 0)
			continue;
		/* core_bitmap does not include threads, so we
		 * add them here but limit them to what the job
		 * requested */
		for (t = 0; t < conf->threads; t++) {
			uint16_t pos = p * conf->threads + t;
			if (pos >= conf->block_map_size) {
				info("more resources configured than exist");
				p = num_cpus;
				break;
			}
			bit_set(hw_map, pos);
			task_cnt++;
		}
	}
}
#endif
	if (task_cnt) {
		req->cpu_bind_type = CPU_BIND_MASK;
		if (conf->task_plugin_param & CPU_BIND_VERBOSE)
			req->cpu_bind_type |= CPU_BIND_VERBOSE;
		xfree(req->cpu_bind);
		req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
		info("task/affinity: job %u CPU input mask for node: %s",
		     req->job_id, req->cpu_bind);
		/* translate abstract masks to actual hardware layout */
		_lllp_map_abstract_masks(1, &hw_map);
#ifdef HAVE_NUMA
		if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
			_match_masks_to_ldom(1, &hw_map);
		}
#endif
		xfree(req->cpu_bind);
		req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
		info("task/affinity: job %u CPU final HW mask for node: %s",
		     req->job_id, req->cpu_bind);
	} else {
		error("task/affinity: job %u allocated no CPUs",
		      req->job_id);
	}
	FREE_NULL_BITMAP(hw_map);
	FREE_NULL_BITMAP(req_map);
	slurm_cred_free_args(&arg);
}
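/* A minimal sketch (illustrative helper, not part of Slurm) of the
 * core -> thread mask expansion performed in batch_bind() above, under an
 * assumed toy layout of 4 cores with 2 threads per core. With cores 0 and 2
 * allocated, req_map = {0,2} expands to hw_map bits {0,1,4,5}, which
 * bit_fmt_hexmask() renders as "0x33". */
static void _expand_cores_example(void)
{
	bitstr_t *req_map = bit_alloc(4);	/* one bit per core   */
	bitstr_t *hw_map  = bit_alloc(8);	/* one bit per thread */
	int threads = 2, p, t;
	char *str;

	bit_set(req_map, 0);
	bit_set(req_map, 2);
	for (p = 0; p < 4; p++) {
		if (!bit_test(req_map, p))
			continue;
		for (t = 0; t < threads; t++)
			bit_set(hw_map, p * threads + t);
	}
	str = bit_fmt_hexmask(hw_map);	/* "0x33" for this layout */
	debug3("example thread-level mask: %s", str);
	xfree(str);
	FREE_NULL_BITMAP(req_map);
	FREE_NULL_BITMAP(hw_map);
}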
/*
 * lllp_distribution
 *
 * Note: lllp stands for Lowest Level of Logical Processors.
 *
 * When automatic binding is enabled:
 *	- no binding flags set >= CPU_BIND_NONE, and
 *	- an auto binding level selected CPU_BIND_TO_{SOCKETS,CORES,THREADS}
 * Otherwise limit the job step to the allocated CPUs.
 *
 * Generate the appropriate cpu_bind type and string which results in
 * the specified lllp distribution.
 *
 * IN/OUT- job launch request (cpu_bind_type and cpu_bind updated)
 * IN- global task id array
 */
void lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id)
{
	int rc = SLURM_SUCCESS;
	bitstr_t **masks = NULL;
	char buf_type[100];
	int maxtasks = req->tasks_to_launch[(int)node_id];
	int whole_nodes, whole_sockets, whole_cores, whole_threads;
	int part_sockets, part_cores;
	const uint32_t *gtid = req->global_task_ids[(int)node_id];
	static uint16_t bind_entity = CPU_BIND_TO_THREADS | CPU_BIND_TO_CORES |
				      CPU_BIND_TO_SOCKETS | CPU_BIND_TO_LDOMS;
	static uint16_t bind_mode = CPU_BIND_NONE   | CPU_BIND_MASK   |
				    CPU_BIND_RANK   | CPU_BIND_MAP    |
				    CPU_BIND_LDMASK | CPU_BIND_LDRANK |
				    CPU_BIND_LDMAP;
	static int only_one_thread_per_core = -1;

	if (only_one_thread_per_core == -1) {
		if (conf->cpus == (conf->sockets * conf->cores))
			only_one_thread_per_core = 1;
		else
			only_one_thread_per_core = 0;
	}

	/* If we are telling the system we only want to use 1 thread
	 * per core with the CPUs node option this is the easiest way
	 * to portray that to the affinity plugin. */
	if (only_one_thread_per_core)
		req->cpu_bind_type |= CPU_BIND_ONE_THREAD_PER_CORE;

	if (req->cpu_bind_type & bind_mode) {
		/* Explicit step binding specified by user */
		char *avail_mask = _alloc_mask(req,
					       &whole_nodes,  &whole_sockets,
					       &whole_cores,  &whole_threads,
					       &part_sockets, &part_cores);

		if ((whole_nodes == 0) && avail_mask &&
		    (req->job_core_spec == (uint16_t) NO_VAL)) {
			info("task/affinity: entire node must be allocated, "
			     "disabling affinity");
			xfree(req->cpu_bind);
			req->cpu_bind = avail_mask;
			req->cpu_bind_type &= (~bind_mode);
			req->cpu_bind_type |= CPU_BIND_MASK;
		} else {
			if (req->job_core_spec == (uint16_t) NO_VAL) {
				if (req->cpu_bind_type & CPU_BIND_MASK)
					_validate_mask(req, avail_mask);
				else if (req->cpu_bind_type & CPU_BIND_MAP)
					_validate_map(req, avail_mask);
			}
			xfree(avail_mask);
		}
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] manual binding: %s",
		     req->job_id, buf_type);
		return;
	}

	if (!(req->cpu_bind_type & bind_entity)) {
		/* No bind unit (sockets, cores) specified by user,
		 * pick something reasonable */
		uint32_t task_plugin_param = slurm_get_task_plugin_param();
		bool auto_def_set = false;
		int spec_thread_cnt = 0;
		int max_tasks = req->tasks_to_launch[(int)node_id] *
				req->cpus_per_task;
		char *avail_mask = _alloc_mask(req,
					       &whole_nodes,  &whole_sockets,
					       &whole_cores,  &whole_threads,
					       &part_sockets, &part_cores);

		debug("binding tasks:%d to "
		      "nodes:%d sockets:%d:%d cores:%d:%d threads:%d",
		      max_tasks, whole_nodes, whole_sockets, part_sockets,
		      whole_cores, part_cores, whole_threads);

		if ((req->job_core_spec != (uint16_t) NO_VAL) &&
		    (req->job_core_spec &  CORE_SPEC_THREAD)  &&
		    (req->job_core_spec != CORE_SPEC_THREAD)) {
			spec_thread_cnt = req->job_core_spec &
					  (~CORE_SPEC_THREAD);
		}
		if (((max_tasks == whole_sockets) && (part_sockets == 0)) ||
		    (spec_thread_cnt &&
		     (max_tasks == (whole_sockets + part_sockets)))) {
			req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
			goto make_auto;
		}
		if (((max_tasks == whole_cores) && (part_cores == 0)) ||
		    (spec_thread_cnt &&
		     (max_tasks == (whole_cores + part_cores)))) {
			req->cpu_bind_type |= CPU_BIND_TO_CORES;
			goto make_auto;
		}
		if (max_tasks == whole_threads) {
			req->cpu_bind_type |= CPU_BIND_TO_THREADS;
			goto make_auto;
		}

		if (task_plugin_param & CPU_AUTO_BIND_TO_THREADS) {
			auto_def_set = true;
			req->cpu_bind_type |= CPU_BIND_TO_THREADS;
			goto make_auto;
		} else if (task_plugin_param & CPU_AUTO_BIND_TO_CORES) {
			auto_def_set = true;
			req->cpu_bind_type |= CPU_BIND_TO_CORES;
			goto make_auto;
		} else if (task_plugin_param & CPU_AUTO_BIND_TO_SOCKETS) {
			auto_def_set = true;
			req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
			goto make_auto;
		}

		if (avail_mask) {
			xfree(req->cpu_bind);
			req->cpu_bind = avail_mask;
			req->cpu_bind_type |= CPU_BIND_MASK;
		}
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] auto binding off: %s",
		     req->job_id, buf_type);
		return;

make_auto:	xfree(avail_mask);
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] %s auto binding: "
		     "%s, dist %d", req->job_id,
		     (auto_def_set) ? "default" : "implicit",
		     buf_type, req->task_dist);
	} else {
		/* Explicit bind unit (sockets, cores) specified by user */
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		info("lllp_distribution jobid [%u] binding: %s, dist %d",
		     req->job_id, buf_type, req->task_dist);
	}

	switch (req->task_dist & SLURM_DIST_STATE_BASE) {
	case SLURM_DIST_BLOCK_BLOCK:
	case SLURM_DIST_CYCLIC_BLOCK:
	case SLURM_DIST_PLANE:
		/* tasks are distributed in blocks within a plane */
		rc = _task_layout_lllp_block(req, node_id, &masks);
		break;
	case SLURM_DIST_ARBITRARY:
	case SLURM_DIST_BLOCK:
	case SLURM_DIST_CYCLIC:
	case SLURM_DIST_UNKNOWN:
		if (slurm_get_select_type_param() &
		    CR_CORE_DEFAULT_DIST_BLOCK) {
			rc = _task_layout_lllp_block(req, node_id, &masks);
			break;
		}
		/* We want to fall through here if we aren't doing a
		 * default dist block. */
	default:
		rc = _task_layout_lllp_cyclic(req, node_id, &masks);
		break;
	}

	/* FIXME: I'm worried about core_bitmap with CPU_BIND_TO_SOCKETS &
	 * max_cores - does select/cons_res plugin allocate whole
	 * socket??? Maybe not. Check srun man page. */

	if (rc == SLURM_SUCCESS) {
		_task_layout_display_masks(req, gtid, maxtasks, masks);
		/* translate abstract masks to actual hardware layout */
		_lllp_map_abstract_masks(maxtasks, masks);
		_task_layout_display_masks(req, gtid, maxtasks, masks);
#ifdef HAVE_NUMA
		if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
			_match_masks_to_ldom(maxtasks, masks);
			_task_layout_display_masks(req, gtid, maxtasks,
						   masks);
		}
#endif
		/* convert masks into cpu_bind mask string */
		_lllp_generate_cpu_bind(req, maxtasks, masks);
	} else {
		char *avail_mask = _alloc_mask(req,
					       &whole_nodes,  &whole_sockets,
					       &whole_cores,  &whole_threads,
					       &part_sockets, &part_cores);
		if (avail_mask) {
			xfree(req->cpu_bind);
			req->cpu_bind = avail_mask;
			req->cpu_bind_type &= (~bind_mode);
			req->cpu_bind_type |= CPU_BIND_MASK;
		}
		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
		error("lllp_distribution jobid [%u] overriding binding: %s",
		      req->job_id, buf_type);
		error("Verify socket/core/thread counts in configuration");
	}
	if (masks)
		_lllp_free_masks(maxtasks, masks);
}
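/* Worked example of the auto bind-level selection in lllp_distribution()
 * above (illustrative numbers, not from the source): on a fully allocated
 * node with 2 whole sockets, 8 whole cores, and 16 whole threads (no partial
 * sockets or cores), max_tasks = tasks_to_launch * cpus_per_task picks the
 * level it matches exactly: 2 -> CPU_BIND_TO_SOCKETS, 8 -> CPU_BIND_TO_CORES,
 * 16 -> CPU_BIND_TO_THREADS. Any other count (say 5) matches no level, so
 * binding stays off unless a CPU_AUTO_BIND_TO_* TaskPluginParam supplies a
 * default level. */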