/*
 * _task_layout_lllp_cyclic
 *
 * _task_layout_lllp_cyclic creates a cyclic distribution at the
 * lowest level of logical processor, which is either socket, core or
 * thread depending on the system architecture. The Cyclic algorithm
 * is the same as the Cyclic distribution performed in srun.
 *
 * Distribution at the lllp:
 * -m hostfile|block|cyclic:block|cyclic
 *
 * The first distribution "hostfile|block|cyclic" is computed
 * in srun. The second distribution "block|cyclic" is computed
 * locally by each slurmd.
 *
 * The input to the lllp distribution algorithms is the gids (task
 * ids) generated for the local node.
 *
 * The output is a mapping of the gids onto logical processors
 * (thread/core/socket), which is expressed as cpu_bind masks.
 *
 * If a task asks for more than one CPU per task, put the tasks as
 * close as possible (fill a core rather than going to the next socket
 * for the extra task).
 */
static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
				    uint32_t node_id, bitstr_t ***masks_p)
{
	int last_taskcount = -1, taskcount = 0;
	uint16_t i, s, hw_sockets = 0, hw_cores = 0, hw_threads = 0;
	uint16_t offset = 0, p = 0;
	int size, max_tasks = req->tasks_to_launch[(int)node_id];
	int max_cpus = max_tasks * req->cpus_per_task;
	bitstr_t *avail_map;
	bitstr_t **masks = NULL;
	int *socket_last_pu = NULL;
	int core_inx, pu_per_core, *core_tasks = NULL;

	info("_task_layout_lllp_cyclic ");

	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
	if (!avail_map)
		return SLURM_ERROR;

	size = bit_set_count(avail_map);
	if (size < max_tasks) {
		error("task/affinity: only %d bits in avail_map for %d tasks!",
		      size, max_tasks);
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}
	if (size < max_cpus) {
		/* Possible result of overcommit */
		i = size / max_tasks;
		info("task/affinity: reset cpus_per_task from %d to %d",
		     req->cpus_per_task, i);
		req->cpus_per_task = i;
	}

	pu_per_core = hw_threads;
	core_tasks = xmalloc(sizeof(int) * hw_sockets * hw_cores);
	socket_last_pu = xmalloc(hw_sockets * sizeof(int));

	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t *));
	masks = *masks_p;

	size = bit_size(avail_map);

	offset = hw_cores * hw_threads;
	s = 0;
	while (taskcount < max_tasks) {
		if (taskcount == last_taskcount)
			fatal("_task_layout_lllp_cyclic failure");
		last_taskcount = taskcount;
		for (i = 0; i < size; i++) {
			bool already_switched = false;
			uint16_t bit;
			uint16_t orig_s = s;

			while (socket_last_pu[s] >= offset) {
				/* Switch to the next socket; we have
				 * run out here. */
				/* This only happens if the slurmctld
				 * gave us an allocation that made a
				 * task split sockets.  Or if the
				 * entire allocation is on one socket. */
				s = (s + 1) % hw_sockets;
				if (orig_s == s) {
					/* This should rarely happen,
					 * but is here for sanity's sake. */
					debug("allocation is full, "
					      "oversubscribing");
					memset(core_tasks, 0,
					       (sizeof(int) *
						hw_sockets * hw_cores));
					memset(socket_last_pu, 0,
					       (hw_sockets * sizeof(int)));
				}
			}

			bit = socket_last_pu[s] + (s * offset);

			/* In case hardware and config differ */
			bit %= size;

			/* set up for the next one */
			socket_last_pu[s]++;
			/* skip unrequested threads */
			if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
				socket_last_pu[s] += hw_threads - 1;

			if (!bit_test(avail_map, bit))
				continue;

			core_inx = bit / pu_per_core;
			if ((req->ntasks_per_core != 0) &&
			    (core_tasks[core_inx] >= req->ntasks_per_core))
				continue;

			if (!masks[taskcount])
				masks[taskcount] =
					bit_alloc(conf->block_map_size);
			//info("setting %d %d", taskcount, bit);
			bit_set(masks[taskcount], bit);

			if (!already_switched &&
			    (((req->task_dist & SLURM_DIST_STATE_BASE) ==
			      SLURM_DIST_CYCLIC_CFULL) ||
			     ((req->task_dist & SLURM_DIST_STATE_BASE) ==
			      SLURM_DIST_BLOCK_CFULL))) {
				/* This means we are laying out cpus
				 * within a task cyclically as well. */
				s = (s + 1) % hw_sockets;
				already_switched = true;
			}

			if (++p < req->cpus_per_task)
				continue;

			core_tasks[core_inx]++;

			/* Binding to cores; skip the remaining threads */
			if (!(req->cpu_bind_type &
			      CPU_BIND_ONE_THREAD_PER_CORE) &&
			    ((req->cpu_bind_type & CPU_BIND_TO_CORES) ||
			     (req->ntasks_per_core == 1))) {
				int threads_not_used;
				if (req->cpus_per_task < hw_threads)
					threads_not_used =
						hw_threads -
						req->cpus_per_task;
				else
					threads_not_used =
						req->cpus_per_task %
						hw_threads;
				socket_last_pu[s] += threads_not_used;
			}
			p = 0;

			if (!already_switched) {
				/* Now that we have finished a task, switch to
				 * the next socket. */
				s = (s + 1) % hw_sockets;
			}

			if (++taskcount >= max_tasks)
				break;
		}
	}

	/* last step: expand the masks to bind each task
	 * to the requested resource */
	_expand_masks(req->cpu_bind_type, max_tasks, masks,
		      hw_sockets, hw_cores, hw_threads, avail_map);
	FREE_NULL_BITMAP(avail_map);
	xfree(core_tasks);
	xfree(socket_last_pu);

	return SLURM_SUCCESS;
}
/* Determine which CPUs a job step can use.
 * OUT whole_<entity>_count - returns count of whole <entities> in this
 *                            allocation for this node
 * OUT part_<entity>_count  - returns count of partial <entities> in this
 *                            allocation for this node
 * RET - a string representation of the available mask or NULL on error
 * NOTE: Caller must xfree() the return value. */
static char *_alloc_mask(launch_tasks_request_msg_t *req,
			 int *whole_node_cnt, int *whole_socket_cnt,
			 int *whole_core_cnt, int *whole_thread_cnt,
			 int *part_socket_cnt, int *part_core_cnt)
{
	uint16_t sockets, cores, threads;
	int c, s, t, i;
	int c_miss, s_miss, t_miss, c_hit, t_hit;
	bitstr_t *alloc_bitmap;
	char *str_mask;
	bitstr_t *alloc_mask;

	*whole_node_cnt   = 0;
	*whole_socket_cnt = 0;
	*whole_core_cnt   = 0;
	*whole_thread_cnt = 0;
	*part_socket_cnt  = 0;
	*part_core_cnt    = 0;

	alloc_bitmap = _get_avail_map(req, &sockets, &cores, &threads);
	if (!alloc_bitmap)
		return NULL;

	alloc_mask = bit_alloc(bit_size(alloc_bitmap));

	i = 0;
	for (s = 0, s_miss = false; s < sockets; s++) {
		for (c = 0, c_hit = c_miss = false; c < cores; c++) {
			for (t = 0, t_hit = t_miss = false; t < threads; t++) {
				/* If we are pretending we have a
				 * larger system than we really have,
				 * this is needed to make sure we
				 * don't bust the bank. */
				if (i >= bit_size(alloc_bitmap))
					i = 0;
				if (bit_test(alloc_bitmap, i)) {
					bit_set(alloc_mask, i);
					(*whole_thread_cnt)++;
					t_hit = true;
					c_hit = true;
				} else
					t_miss = true;
				i++;
			}
			if (!t_miss)
				(*whole_core_cnt)++;
			else {
				if (t_hit)
					(*part_core_cnt)++;
				c_miss = true;
			}
		}
		if (!c_miss)
			(*whole_socket_cnt)++;
		else {
			if (c_hit)
				(*part_socket_cnt)++;
			s_miss = true;
		}
	}
	if (!s_miss)
		(*whole_node_cnt)++;
	FREE_NULL_BITMAP(alloc_bitmap);

	if ((req->job_core_spec != (uint16_t) NO_VAL) &&
	    (req->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (req->job_core_spec != CORE_SPEC_THREAD)) {
		int spec_thread_cnt;
		spec_thread_cnt = req->job_core_spec & (~CORE_SPEC_THREAD);
		for (t = threads - 1;
		     ((t > 0) && (spec_thread_cnt > 0)); t--) {
			for (c = cores - 1;
			     ((c > 0) && (spec_thread_cnt > 0)); c--) {
				for (s = sockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
					i = s * cores + c;
					i = (i * threads) + t;
					bit_clear(alloc_mask, i);
					spec_thread_cnt--;
				}
			}
		}
	}

	/* translate abstract masks to actual hardware layout */
	_lllp_map_abstract_masks(1, &alloc_mask);

#ifdef HAVE_NUMA
	if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
		_match_masks_to_ldom(1, &alloc_mask);
	}
#endif

	str_mask = bit_fmt_hexmask(alloc_mask);
	FREE_NULL_BITMAP(alloc_mask);
	return str_mask;
}
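/*
 * Illustrative sketch, not part of the plugin: the whole/partial
 * counting above, reduced to one socket's worth of cores.  A core is
 * "whole" when every one of its threads is allocated and "partial"
 * when only some are.  The bitmap is assumed to be thread-granular
 * and laid out core by core; all names are hypothetical.
 */
static void _sketch_core_counts(const int *thread_alloc, int cores,
				int threads, int *whole_cores,
				int *part_cores)
{
	int c, t;

	*whole_cores = 0;
	*part_cores  = 0;
	for (c = 0; c < cores; c++) {
		int hit = 0, miss = 0;
		for (t = 0; t < threads; t++) {
			if (thread_alloc[c * threads + t])
				hit = 1;	/* at least one thread used */
			else
				miss = 1;	/* at least one thread free */
		}
		if (!miss)
			(*whole_cores)++;
		else if (hit)
			(*part_cores)++;
	}
}
/* e.g. thread_alloc = {1,1, 1,0, 0,0} with 3 cores x 2 threads
 * gives whole_cores = 1 and part_cores = 1 */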
/*
 * _task_layout_lllp_block
 *
 * _task_layout_lllp_block will create a block distribution at the
 * lowest level of logical processor, which is either socket, core or
 * thread depending on the system architecture. The Block algorithm
 * is the same as the Block distribution performed in srun.
 *
 * Distribution at the lllp:
 * -m hostfile|plane|block|cyclic:block|cyclic
 *
 * The first distribution "hostfile|plane|block|cyclic" is computed
 * in srun. The second distribution "plane|block|cyclic" is computed
 * locally by each slurmd.
 *
 * The input to the lllp distribution algorithms is the gids (task
 * ids) generated for the local node.
 *
 * The output is a mapping of the gids onto logical processors
 * (thread/core/socket), which is expressed as cpu_bind masks.
 */
static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
				   uint32_t node_id, bitstr_t ***masks_p)
{
	int c, i, size, last_taskcount = -1, taskcount = 0;
	uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
	int max_tasks = req->tasks_to_launch[(int)node_id];
	int max_cpus = max_tasks * req->cpus_per_task;
	bitstr_t *avail_map;
	bitstr_t **masks = NULL;
	int core_inx, pu_per_core, *core_tasks = NULL;
	int sock_inx, pu_per_socket, *socket_tasks = NULL;

	info("_task_layout_lllp_block ");

	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
	if (!avail_map) {
		return SLURM_ERROR;
	}

	size = bit_set_count(avail_map);
	if (size < max_tasks) {
		error("task/affinity: only %d bits in avail_map for %d tasks!",
		      size, max_tasks);
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}
	if (size < max_cpus) {
		/* Possible result of overcommit */
		i = size / max_tasks;
		info("task/affinity: reset cpus_per_task from %d to %d",
		     req->cpus_per_task, i);
		req->cpus_per_task = i;
	}
	size = bit_size(avail_map);

	if ((req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) &&
	    (max_cpus > (hw_sockets * hw_cores))) {
		/* More CPUs requested than available cores,
		 * disable core-level binding */
		req->cpu_bind_type &= (~CPU_BIND_ONE_THREAD_PER_CORE);
	}

	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t *));
	masks = *masks_p;

	pu_per_core = hw_threads;
	core_tasks = xmalloc(sizeof(int) * hw_sockets * hw_cores);
	pu_per_socket = hw_cores * hw_threads;
	socket_tasks = xmalloc(sizeof(int) * hw_sockets);

	/* block distribution with oversubscription */
	c = 0;
	while (taskcount < max_tasks) {
		if (taskcount == last_taskcount)
			fatal("_task_layout_lllp_block infinite loop");
		if (taskcount > 0) {
			/* Clear counters to over-subscribe, if necessary */
			memset(core_tasks, 0,
			       (sizeof(int) * hw_sockets * hw_cores));
			memset(socket_tasks, 0,
			       (sizeof(int) * hw_sockets));
		}
		last_taskcount = taskcount;
		/* the abstract map is already laid out in block order,
		 * so just iterate over it */
		for (i = 0; i < size; i++) {
			/* skip unavailable resources */
			if (bit_test(avail_map, i) == 0)
				continue;

			core_inx = i / pu_per_core;
			if ((req->ntasks_per_core != 0) &&
			    (core_tasks[core_inx] >= req->ntasks_per_core))
				continue;
			sock_inx = i / pu_per_socket;
			if ((req->ntasks_per_socket != 0) &&
			    (socket_tasks[sock_inx] >= req->ntasks_per_socket))
				continue;

			socket_tasks[sock_inx]++;
			if (!masks[taskcount])
				masks[taskcount] =
					bit_alloc(conf->block_map_size);
			//info("setting %d %d", taskcount, i);
			bit_set(masks[taskcount], i);

			/* skip unrequested threads */
			if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
				i += hw_threads - 1;

			if (++c < req->cpus_per_task)
				continue;

			core_tasks[core_inx]++;

			/* Binding to cores; skip the remaining threads */
			if (!(req->cpu_bind_type &
			      CPU_BIND_ONE_THREAD_PER_CORE) &&
			    ((req->cpu_bind_type & CPU_BIND_TO_CORES) ||
			     (req->ntasks_per_core == 1))) {
				int threads_not_used;
				if (req->cpus_per_task < hw_threads)
					threads_not_used =
						hw_threads -
						req->cpus_per_task;
				else
					threads_not_used =
						req->cpus_per_task %
						hw_threads;
				i += threads_not_used;
			}
			c = 0;
			if (++taskcount >= max_tasks)
				break;
		}
	}

	xfree(core_tasks);
	xfree(socket_tasks);

	/* last step: expand the masks to bind each task
	 * to the requested resource */
	_expand_masks(req->cpu_bind_type, max_tasks, masks,
		      hw_sockets, hw_cores, hw_threads, avail_map);
	FREE_NULL_BITMAP(avail_map);

	return SLURM_SUCCESS;
}
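/*
 * Illustrative sketch, not part of the plugin: the block pass above
 * fills PUs in abstract order while honoring --ntasks-per-core.  On
 * an assumed 2-socket, 2-core, 2-thread node with single-CPU tasks
 * and ntasks_per_core = 1, four tasks land on bits 0, 2, 4, 6: one
 * task per core, exhausting socket 0 before socket 1.  All names
 * below are hypothetical.
 */
static void _sketch_block_bits(int ntasks, int ntasks_per_core,
			       int *bits_out)
{
	const int sockets = 2, cores = 2, threads = 2;
	const int size = sockets * cores * threads;
	int core_tasks[4] = { 0 };	/* one counter per core */
	int i, n = 0;

	for (i = 0; i < size && n < ntasks; i++) {
		int core = i / threads;
		/* same guard as above: honor --ntasks-per-core */
		if (ntasks_per_core && core_tasks[core] >= ntasks_per_core)
			continue;
		bits_out[n++] = i;	/* next available PU, block order */
		core_tasks[core]++;
	}
	/* ntasks = 4, ntasks_per_core = 1 yields bits 0, 2, 4, 6 */
}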
/*
 * _task_layout_lllp_block
 *
 * _task_layout_lllp_block will create a block distribution at the
 * lowest level of logical processor, which is either socket, core or
 * thread depending on the system architecture. The Block algorithm
 * is the same as the Block distribution performed in srun.
 *
 * Distribution at the lllp:
 * -m hostfile|plane|block|cyclic:block|cyclic
 *
 * The first distribution "hostfile|plane|block|cyclic" is computed
 * in srun. The second distribution "plane|block|cyclic" is computed
 * locally by each slurmd.
 *
 * The input to the lllp distribution algorithms is the gids (task
 * ids) generated for the local node.
 *
 * The output is a mapping of the gids onto logical processors
 * (thread/core/socket), which is expressed as cpu_bind masks.
 */
static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
				   uint32_t node_id, bitstr_t ***masks_p)
{
	int c, i, j, t, size, last_taskcount = -1, taskcount = 0;
	uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
	int max_tasks = req->tasks_to_launch[(int)node_id];
	int max_cpus = max_tasks * req->cpus_per_task;
	int *task_array;
	bitstr_t *avail_map;
	bitstr_t **masks = NULL;

	info("_task_layout_lllp_block ");

	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
	if (!avail_map) {
		return SLURM_ERROR;
	}

	size = bit_set_count(avail_map);
	if (size < max_tasks) {
		error("task/affinity: only %d bits in avail_map for %d tasks!",
		      size, max_tasks);
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}
	if (size < max_cpus) {
		/* Possible result of overcommit */
		i = size / max_tasks;
		info("task/affinity: reset cpus_per_task from %d to %d",
		     req->cpus_per_task, i);
		req->cpus_per_task = i;
	}
	size = bit_size(avail_map);

	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t *));
	masks = *masks_p;

	task_array = xmalloc(size * sizeof(int));
	if (!task_array) {
		error("In lllp_block: task_array memory error");
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}

	/* block distribution with oversubscription */
	c = 0;
	while (taskcount < max_tasks) {
		if (taskcount == last_taskcount) {
			fatal("_task_layout_lllp_block infinite loop");
		}
		last_taskcount = taskcount;
		/* the abstract map is already laid out in block order,
		 * so just iterate over it */
		for (i = 0; i < size; i++) {
			/* skip unrequested threads */
			if (i % hw_threads >= hw_threads)
				continue;
			/* skip unavailable resources */
			if (bit_test(avail_map, i) == 0)
				continue;
			/* if multiple CPUs per task, only
			 * count the task on the first CPU */
			if (c == 0)
				task_array[i] += 1;
			if (++c < req->cpus_per_task)
				continue;
			c = 0;
			if (++taskcount >= max_tasks)
				break;
		}
	}

	/* Distribute the tasks and create per-task masks that only
	 * contain the first CPU.  Note that unused resources
	 * (task_array[i] == 0) will get skipped */
	taskcount = 0;
	for (i = 0; i < size; i++) {
		for (t = 0; t < task_array[i]; t++) {
			if (masks[taskcount] == NULL)
				masks[taskcount] = (bitstr_t *)
					bit_alloc(conf->block_map_size);
			bit_set(masks[taskcount++], i);
		}
	}

	/* now set additional CPUs for cpus_per_task > 1 */
	for (t = 0; t < max_tasks && req->cpus_per_task > 1; t++) {
		if (!masks[t])
			continue;
		c = 0;
		for (i = 0; i < size && c < req->cpus_per_task; i++) {
			if (bit_test(masks[t], i) == 0)
				continue;
			for (j = i + 1, c = 1;
			     j < size && c < req->cpus_per_task; j++) {
				if (bit_test(avail_map, j) == 0)
					continue;
				bit_set(masks[t], j);
				c++;
			}
			if (c < req->cpus_per_task) {
				/* we haven't found all of the CPUs for this
				 * task, so we'll wrap the search to cover the
				 * whole node */
				for (j = 0; j < i && c < req->cpus_per_task;
				     j++) {
					if (bit_test(avail_map, j) == 0)
						continue;
					bit_set(masks[t], j);
					c++;
				}
			}
		}
	}

	xfree(task_array);

	/* last step: expand the masks to bind each task
	 * to the requested resource */
	_expand_masks(req->cpu_bind_type, max_tasks, masks,
		      hw_sockets, hw_cores, hw_threads, avail_map);
	FREE_NULL_BITMAP(avail_map);

	return SLURM_SUCCESS;
}
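/*
 * Illustrative sketch, not part of the plugin: the two-pass scheme
 * above, reduced to an assumed 8-PU node with every PU available.
 * Pass 1 records each task's first CPU in task_array; pass 2 then
 * grows each task forward until it holds cpus_per_task CPUs.  All
 * names below are hypothetical.
 */
static void _sketch_two_pass_block(int ntasks, int cpus_per_task,
				   int first_cpu[], int last_cpu[])
{
	const int size = 8;
	int task_array[8] = { 0 };
	int i, c = 0, n = 0;

	/* pass 1: count a task only on its first CPU */
	for (i = 0; i < size && n < ntasks; i++) {
		if (c == 0)
			task_array[i] += 1;
		if (++c < cpus_per_task)
			continue;
		c = 0;
		n++;
	}

	/* pass 2: each task starts where it was counted and, with all
	 * PUs available, takes the next cpus_per_task - 1 PUs */
	n = 0;
	for (i = 0; i < size; i++) {
		int t;
		for (t = 0; t < task_array[i]; t++) {
			first_cpu[n] = i;
			last_cpu[n]  = i + cpus_per_task - 1;
			n++;
		}
	}
	/* ntasks = 3, cpus_per_task = 2 yields CPU ranges 0-1, 2-3, 4-5 */
}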
/*
 * _task_layout_lllp_cyclic
 *
 * _task_layout_lllp_cyclic creates a cyclic distribution at the
 * lowest level of logical processor, which is either socket, core or
 * thread depending on the system architecture. The Cyclic algorithm
 * is the same as the Cyclic distribution performed in srun.
 *
 * Distribution at the lllp:
 * -m hostfile|plane|block|cyclic:block|cyclic
 *
 * The first distribution "hostfile|plane|block|cyclic" is computed
 * in srun. The second distribution "plane|block|cyclic" is computed
 * locally by each slurmd.
 *
 * The input to the lllp distribution algorithms is the gids (task
 * ids) generated for the local node.
 *
 * The output is a mapping of the gids onto logical processors
 * (thread/core/socket), which is expressed as cpu_bind masks.
 */
static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
				    uint32_t node_id, bitstr_t ***masks_p)
{
	int last_taskcount = -1, taskcount = 0;
	uint16_t c, i, s, t, hw_sockets = 0, hw_cores = 0, hw_threads = 0;
	int size, max_tasks = req->tasks_to_launch[(int)node_id];
	int max_cpus = max_tasks * req->cpus_per_task;
	int avail_size;
	bitstr_t *avail_map;
	bitstr_t **masks = NULL;

	info("_task_layout_lllp_cyclic ");

	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
	if (!avail_map)
		return SLURM_ERROR;
	avail_size = bit_size(avail_map);

	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t *));
	masks = *masks_p;

	size = bit_set_count(avail_map);
	if (size < max_tasks) {
		error("task/affinity: only %d bits in avail_map for %d tasks!",
		      size, max_tasks);
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}
	if (size < max_cpus) {
		/* Possible result of overcommit */
		i = size / max_tasks;
		info("task/affinity: reset cpus_per_task from %d to %d",
		     req->cpus_per_task, i);
		req->cpus_per_task = i;
	}

	i = 0;
	while (taskcount < max_tasks) {
		if (taskcount == last_taskcount)
			fatal("_task_layout_lllp_cyclic failure");
		last_taskcount = taskcount;
		for (t = 0; t < hw_threads; t++) {
			for (c = 0; c < hw_cores; c++) {
				for (s = 0; s < hw_sockets; s++) {
					uint16_t bit =
						s * (hw_cores * hw_threads) +
						c * hw_threads + t;

					/* In case hardware and config
					 * differ */
					bit %= avail_size;
					if (bit_test(avail_map, bit) == 0)
						continue;
					if (masks[taskcount] == NULL) {
						masks[taskcount] = (bitstr_t *)
							bit_alloc(
							conf->block_map_size);
					}
					bit_set(masks[taskcount], bit);
					if (++i < req->cpus_per_task)
						continue;
					i = 0;
					if (++taskcount >= max_tasks)
						break;
				}
				if (taskcount >= max_tasks)
					break;
			}
			if (taskcount >= max_tasks)
				break;
		}
	}

	/* last step: expand the masks to bind each task
	 * to the requested resource */
	_expand_masks(req->cpu_bind_type, max_tasks, masks,
		      hw_sockets, hw_cores, hw_threads, avail_map);
	FREE_NULL_BITMAP(avail_map);

	return SLURM_SUCCESS;
}
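/*
 * Illustrative sketch, not part of the plugin: the socket-innermost
 * triple loop above visits abstract bits with the socket index
 * varying fastest, then cores, then threads.  On an assumed 2-socket,
 * 2-core, 2-thread node the visiting order is 0, 4, 2, 6, 1, 5, 3, 7,
 * so four single-CPU tasks land on one core each, alternating
 * sockets.  All names below are hypothetical.
 */
static void _sketch_cyclic_order(int *order_out)
{
	const int sockets = 2, cores = 2, threads = 2;
	int s, c, t, n = 0;

	for (t = 0; t < threads; t++)
		for (c = 0; c < cores; c++)
			for (s = 0; s < sockets; s++)
				/* same formula as above */
				order_out[n++] = s * (cores * threads) +
						 c * threads + t;
}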
/*
 * _task_layout_lllp_block
 *
 * _task_layout_lllp_block will create a block distribution at the
 * lowest level of logical processor, which is either socket, core or
 * thread depending on the system architecture. The Block algorithm
 * is the same as the Block distribution performed in srun.
 *
 * Distribution at the lllp:
 * -m hostfile|plane|block|cyclic:block|cyclic
 *
 * The first distribution "hostfile|plane|block|cyclic" is computed
 * in srun. The second distribution "plane|block|cyclic" is computed
 * locally by each slurmd.
 *
 * The input to the lllp distribution algorithms is the gids (task
 * ids) generated for the local node.
 *
 * The output is a mapping of the gids onto logical processors
 * (thread/core/socket), which is expressed as cpu_bind masks.
 */
static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
				   uint32_t node_id, bitstr_t ***masks_p)
{
	int c, i, size, last_taskcount = -1, taskcount = 0;
	uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
	int max_tasks = req->tasks_to_launch[(int)node_id];
	int max_cpus = max_tasks * req->cpus_per_task;
	bitstr_t *avail_map;
	bitstr_t **masks = NULL;

	info("_task_layout_lllp_block ");

	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
	if (!avail_map) {
		return SLURM_ERROR;
	}

	size = bit_set_count(avail_map);
	if (size < max_tasks) {
		error("task/affinity: only %d bits in avail_map for %d tasks!",
		      size, max_tasks);
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}
	if (size < max_cpus) {
		/* Possible result of overcommit */
		i = size / max_tasks;
		info("task/affinity: reset cpus_per_task from %d to %d",
		     req->cpus_per_task, i);
		req->cpus_per_task = i;
	}
	size = bit_size(avail_map);

	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t *));
	masks = *masks_p;

	/* block distribution with oversubscription */
	c = 0;
	while (taskcount < max_tasks) {
		if (taskcount == last_taskcount) {
			fatal("_task_layout_lllp_block infinite loop");
		}
		last_taskcount = taskcount;
		/* the abstract map is already laid out in block order,
		 * so just iterate over it */
		for (i = 0; i < size; i++) {
			/* skip unavailable resources */
			if (bit_test(avail_map, i) == 0)
				continue;
			if (!masks[taskcount])
				masks[taskcount] =
					bit_alloc(conf->block_map_size);
			//info("setting %d %d", taskcount, i);
			bit_set(masks[taskcount], i);
			/* skip unrequested threads */
			if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
				i += hw_threads - 1;
			if (++c < req->cpus_per_task)
				continue;
			c = 0;
			if (++taskcount >= max_tasks)
				break;
		}
	}

	/* last step: expand the masks to bind each task
	 * to the requested resource */
	_expand_masks(req->cpu_bind_type, max_tasks, masks,
		      hw_sockets, hw_cores, hw_threads, avail_map);
	FREE_NULL_BITMAP(avail_map);

	return SLURM_SUCCESS;
}
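/*
 * Illustrative sketch, not part of the plugin: the effect of
 * CPU_BIND_ONE_THREAD_PER_CORE in the loop above, assuming 2 threads
 * per core, 8 PUs, all available, and one CPU per task.  After each
 * task the index skips the sibling thread(s), so tasks land on bits
 * 0, 2, 4, 6.  All names below are hypothetical.
 */
static void _sketch_one_thread_per_core(int ntasks, int *bits_out)
{
	const int size = 8, hw_threads = 2;
	int i, n = 0;

	for (i = 0; i < size && n < ntasks; i++) {
		bits_out[n++] = i;
		/* skip the remaining threads of this core */
		i += hw_threads - 1;
	}
}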