/* cr_job_test - does most of the real work for select_p_job_test(), which * includes contiguous selection, load-leveling and max_share logic * * PROCEDURE: * * Step 1: compare nodes in "avail" bitmap with current node state data * to find available nodes that match the job request * * Step 2: check resources in "avail" bitmap with allocated resources from * higher priority partitions (busy resources are UNavailable) * * Step 3: select resource usage on remaining resources in "avail" bitmap * for this job, with the placement influenced by existing * allocations */ extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, int mode, uint16_t cr_type, enum node_cr_state job_node_req, uint32_t cr_node_cnt, struct part_res_record *cr_part_ptr, struct node_use_record *node_usage) { static int gang_mode = -1; int error_code = SLURM_SUCCESS; bitstr_t *orig_map, *avail_cores, *free_cores; bool test_only; uint32_t c, i, j, k, csize; uint64_t save_mem = 0; int n; job_resources_t *job_res; struct job_details *details_ptr; struct part_res_record *p_ptr, *jp_ptr; uint16_t *cpu_count; if (gang_mode == -1) { if (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) gang_mode = 1; else gang_mode = 0; } details_ptr = job_ptr->details; free_job_resources(&job_ptr->job_resrcs); if (mode == SELECT_MODE_TEST_ONLY) test_only = true; else /* SELECT_MODE_RUN_NOW || SELECT_MODE_WILL_RUN */ test_only = false; /* check node_state and update the node bitmap as necessary */ if (!test_only) { error_code = _verify_node_state(cr_part_ptr, job_ptr, bitmap, cr_type, node_usage, job_node_req); if (error_code != SLURM_SUCCESS) return error_code; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: evaluating %pJ on %u nodes", job_ptr, bit_set_count(bitmap)); } orig_map = bit_copy(bitmap); avail_cores = _make_core_bitmap(bitmap); /* test to make sure that this job can succeed with all avail_cores * if 'no' then return FAIL * if 'yes' then we will seek the optimal placement for this job * within avail_cores */ free_cores = bit_copy(avail_cores); cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (cpu_count == NULL) { /* job cannot fit */ FREE_NULL_BITMAP(orig_map); FREE_NULL_BITMAP(free_cores); FREE_NULL_BITMAP(avail_cores); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 0 fail: " "insufficient resources"); } return SLURM_ERROR; } else if (test_only) { FREE_NULL_BITMAP(orig_map); FREE_NULL_BITMAP(free_cores); FREE_NULL_BITMAP(avail_cores); xfree(cpu_count); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) info("select/serial: cr_job_test: test 0 pass: "******"test_only"); return SLURM_SUCCESS; } if (cr_type == CR_MEMORY) { /* CR_MEMORY does not care about existing CPU allocations, * so we can jump right to job allocation from here */ goto alloc_job; } xfree(cpu_count); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 0 pass - " "job fits on given resources"); } /* now that we know that this job can run with the given resources, * let's factor in the existing allocations and seek the optimal set * of resources for this job. Here is the procedure: * * Step 1: Seek idle CPUs across all partitions. If successful then * place job and exit. If not successful, then continue. Two * related items to note: * 1. Jobs that don't share CPUs finish with step 1. * 2. The remaining steps assume sharing or preemption. * * Step 2: Remove resources that are in use by higher-priority * partitions, and test that job can still succeed. If not * then exit. * * Step 3: Seek idle nodes among the partitions with the same * priority as the job's partition. If successful then * goto Step 6. If not then continue: * * Step 4: Seek placement within the job's partition. Search * row-by-row. If no placement if found, then exit. If a row * is found, then continue: * * Step 5: Place job and exit. FIXME! Here is where we need a * placement algorithm that recognizes existing job * boundaries and tries to "overlap jobs" as efficiently * as possible. * * Step 6: Place job and exit. FIXME! here is we use a placement * algorithm similar to Step 5 on jobs from lower-priority * partitions. */ /*** Step 1 ***/ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); /* remove all existing allocations from free_cores */ for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { if (!p_ptr->row) continue; for (i = 0; i < p_ptr->num_rows; i++) { if (!p_ptr->row[i].row_bitmap) continue; bit_and_not(free_cores, p_ptr->row[i].row_bitmap); } } cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (cpu_count) { /* job fits! We're done. */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 1 pass - " "idle resources found"); } goto alloc_job; } if ((gang_mode == 0) && (job_node_req == NODE_CR_ONE_ROW)) { /* This job CANNOT share CPUs regardless of priority, * so we fail here. Note that Shared=EXCLUSIVE was already * addressed in _verify_node_state() and job preemption * removes jobs from simulated resource allocation map * before this point. */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 1 fail - " "no idle resources available"); } goto alloc_job; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 1 fail - " "not enough idle resources"); } /*** Step 2 ***/ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); for (jp_ptr = cr_part_ptr; jp_ptr; jp_ptr = jp_ptr->next) { if (jp_ptr->part_ptr == job_ptr->part_ptr) break; } if (!jp_ptr) { fatal("select/serial: could not find partition for %pJ", job_ptr); return SLURM_ERROR; /* Fix CLANG false positive */ } /* remove existing allocations (jobs) from higher-priority partitions * from avail_cores */ for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { if ((p_ptr->part_ptr->priority_tier <= jp_ptr->part_ptr->priority_tier) && (p_ptr->part_ptr->preempt_mode != PREEMPT_MODE_OFF)) continue; if (!p_ptr->row) continue; for (i = 0; i < p_ptr->num_rows; i++) { if (!p_ptr->row[i].row_bitmap) continue; bit_and_not(free_cores, p_ptr->row[i].row_bitmap); } } /* make these changes permanent */ bit_copybits(avail_cores, free_cores); cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (!cpu_count) { /* job needs resources that are currently in use by * higher-priority jobs, so fail for now */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 2 fail - " "resources busy with higher priority jobs"); } goto alloc_job; } xfree(cpu_count); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 2 pass - " "available resources for this priority"); } /*** Step 3 ***/ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); /* remove existing allocations (jobs) from same-priority partitions * from avail_cores */ for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) { if (p_ptr->part_ptr->priority_tier != jp_ptr->part_ptr->priority_tier) continue; if (!p_ptr->row) continue; for (i = 0; i < p_ptr->num_rows; i++) { if (!p_ptr->row[i].row_bitmap) continue; bit_and_not(free_cores, p_ptr->row[i].row_bitmap); } } cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (cpu_count) { /* jobs from low-priority partitions are the only thing left * in our way. for now we'll ignore them, but FIXME: we need * a good placement algorithm here that optimizes "job overlap" * between this job (in these idle nodes) and the low-priority * jobs */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 3 pass - " "found resources"); } goto alloc_job; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 3 fail - " "not enough idle resources in same priority"); } /*** Step 4 ***/ /* try to fit the job into an existing row * * free_cores = core_bitmap to be built * avail_cores = static core_bitmap of all available cores */ if (!jp_ptr || !jp_ptr->row) { /* there's no existing jobs in this partition, so place * the job in avail_cores. FIXME: still need a good * placement algorithm here that optimizes "job overlap" * between this job (in these idle nodes) and existing * jobs in the other partitions with <= priority to * this partition */ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 4 pass - " "first row found"); } goto alloc_job; } cr_sort_part_rows(jp_ptr); c = jp_ptr->num_rows; if (job_node_req != NODE_CR_AVAILABLE) c = 1; for (i = 0; i < c; i++) { if (!jp_ptr->row[i].row_bitmap) break; bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); bit_and_not(free_cores, jp_ptr->row[i].row_bitmap); cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); if (cpu_count) { if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: " "test 4 pass - row %i", i); } break; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: " "test 4 fail - row %i", i); } } if ((i < c) && !jp_ptr->row[i].row_bitmap) { /* we've found an empty row, so use it */ bit_copybits(bitmap, orig_map); bit_copybits(free_cores, avail_cores); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: " "test 4 trying empty row %i",i); } cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores, node_usage, cr_type, test_only); } if (!cpu_count) { /* job can't fit into any row, so exit */ if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: test 4 fail - " "busy partition"); } goto alloc_job; } /*** CONSTRUCTION ZONE FOR STEPs 5 AND 6 *** * Note that while the job may have fit into a row, it should * still be run through a good placement algorithm here that * optimizes "job overlap" between this job (in these idle nodes) * and existing jobs in the other partitions with <= priority to * this partition */ alloc_job: /* at this point we've found a good set of * bits to allocate to this job: * - bitmap is the set of nodes to allocate * - free_cores is the set of allocated cores * - cpu_count is the number of cpus per allocated node * * Next steps are to cleanup the worker variables, * create the job_resources struct, * distribute the job on the bits, and exit */ FREE_NULL_BITMAP(orig_map); FREE_NULL_BITMAP(avail_cores); if (!cpu_count) { /* we were sent here to cleanup and exit */ FREE_NULL_BITMAP(free_cores); if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: exiting cr_job_test with no " "allocation"); } return SLURM_ERROR; } /* At this point we have: * - a bitmap of selected nodes * - a free_cores bitmap of usable cores on each selected node * - a per-alloc-node cpu_count array */ if ((mode != SELECT_MODE_WILL_RUN) && (job_ptr->part_ptr == NULL)) error_code = EINVAL; if ((error_code == SLURM_SUCCESS) && (mode == SELECT_MODE_WILL_RUN)) job_ptr->total_cpus = 1; if ((error_code != SLURM_SUCCESS) || (mode != SELECT_MODE_RUN_NOW)) { FREE_NULL_BITMAP(free_cores); xfree(cpu_count); return error_code; } n = bit_ffs(bitmap); if (n < 0) { FREE_NULL_BITMAP(free_cores); xfree(cpu_count); return error_code; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: distributing %pJ", job_ptr); } /** create the struct_job_res **/ job_res = create_job_resources(); job_res->node_bitmap = bit_copy(bitmap); job_res->nodes = bitmap2node_name(bitmap); job_res->nhosts = bit_set_count(bitmap); job_res->ncpus = job_res->nhosts; if (job_ptr->details->ntasks_per_node) job_res->ncpus *= details_ptr->ntasks_per_node; job_res->ncpus = MAX(job_res->ncpus, details_ptr->min_cpus); job_res->ncpus = MAX(job_res->ncpus, details_ptr->pn_min_cpus); job_res->node_req = job_node_req; job_res->cpus = cpu_count; job_res->cpus_used = xmalloc(job_res->nhosts * sizeof(uint16_t)); job_res->memory_allocated = xmalloc(job_res->nhosts * sizeof(uint64_t)); job_res->memory_used = xmalloc(job_res->nhosts * sizeof(uint64_t)); /* store the hardware data for the selected nodes */ error_code = build_job_resources(job_res, node_record_table_ptr, select_fast_schedule); if (error_code != SLURM_SUCCESS) { free_job_resources(&job_res); FREE_NULL_BITMAP(free_cores); return error_code; } c = 0; csize = bit_size(job_res->core_bitmap); j = cr_get_coremap_offset(n); k = cr_get_coremap_offset(n + 1); for (; j < k; j++, c++) { if (!bit_test(free_cores, j)) continue; if (c >= csize) { error("select/serial: cr_job_test " "core_bitmap index error on node %s", select_node_record[n].node_ptr->name); drain_nodes(select_node_record[n].node_ptr->name, "Bad core count", getuid()); free_job_resources(&job_res); FREE_NULL_BITMAP(free_cores); return SLURM_ERROR; } bit_set(job_res->core_bitmap, c); break; } if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) { info("select/serial: cr_job_test: %pJ ncpus %u cbits %u/%d nbits %u", job_ptr, job_res->ncpus, bit_set_count(free_cores), 1, job_res->nhosts); } FREE_NULL_BITMAP(free_cores); /* distribute the tasks and clear any unused cores */ job_ptr->job_resrcs = job_res; error_code = cr_dist(job_ptr, cr_type); if (error_code != SLURM_SUCCESS) { free_job_resources(&job_ptr->job_resrcs); return error_code; } /* translate job_res->cpus array into format with rep count */ job_ptr->total_cpus = build_job_resources_cpu_array(job_res); if (!(cr_type & CR_MEMORY)) return error_code; /* load memory allocated array */ save_mem = details_ptr->pn_min_memory; if (save_mem & MEM_PER_CPU) { /* memory is per-cpu */ save_mem &= (~MEM_PER_CPU); job_res->memory_allocated[0] = job_res->cpus[0] * save_mem; } else { /* memory is per-node */ job_res->memory_allocated[0] = save_mem; } return error_code; }
/* * route_p_split_hostlist - logic to split an input hostlist into * a set of hostlists to forward to. * * IN: hl - hostlist_t - list of every node to send message to * will be empty on return; * OUT: sp_hl - hostlist_t** - the array of hostlists that will be malloced * OUT: count - int* - the count of created hostlists * RET: SLURM_SUCCESS - int * * Note: created hostlist will have to be freed independently using * hostlist_destroy by the caller. * Note: the hostlist_t array will have to be xfree. */ extern int route_p_split_hostlist(hostlist_t hl, hostlist_t** sp_hl, int* count, uint16_t tree_width) { int i, j, k, hl_ndx, msg_count, sw_count, lst_count; char *buf; bitstr_t *nodes_bitmap = NULL; /* nodes in message list */ bitstr_t *fwd_bitmap = NULL; /* nodes in forward list */ msg_count = hostlist_count(hl); slurm_mutex_lock(&route_lock); if (switch_record_cnt == 0) { /* configs have not already been processed */ slurm_conf_init(NULL); if (init_node_conf()) { fatal("ROUTE: Failed to init slurm config"); } if (build_all_nodeline_info(false, 0)) { fatal("ROUTE: Failed to build node config"); } rehash_node(); if (slurm_topo_build_config() != SLURM_SUCCESS) { fatal("ROUTE: Failed to build topology config"); } } slurm_mutex_unlock(&route_lock); *sp_hl = (hostlist_t*) xmalloc(switch_record_cnt * sizeof(hostlist_t)); /* create bitmap of nodes to send message too */ if (hostlist2bitmap (hl, false, &nodes_bitmap) != SLURM_SUCCESS) { buf = hostlist_ranged_string_xmalloc(hl); fatal("ROUTE: Failed to make bitmap from hostlist=%s.", buf); } /* Find lowest level switch containing all the nodes in the list */ j = 0; for (i = 0; i <= switch_levels; i++) { for (j=0; j<switch_record_cnt; j++) { if (switch_record_table[j].level == i) { if (bit_super_set(nodes_bitmap, switch_record_table[j]. node_bitmap)) { /* All nodes in message list are in * this switch */ break; } } } if (j < switch_record_cnt) { /* Got here via break after bit_super_set */ break; // 'j' is our switch } /* else, no switches at this level reach all nodes */ } if (i > switch_levels) { /* This can only happen if trying to schedule multiple physical * clusters as a single logical cluster under the control of a * single slurmctld daemon, and sending something like a * node_registation request to all nodes. * Revert to default behavior*/ if (debug_flags & DEBUG_FLAG_ROUTE) { buf = hostlist_ranged_string_xmalloc(hl); debug("ROUTE: didn't find switch containing nodes=%s", buf); xfree(buf); } FREE_NULL_BITMAP(nodes_bitmap); xfree(*sp_hl); return route_split_hostlist_treewidth( hl, sp_hl, count, tree_width); } if (switch_record_table[j].level == 0) { /* This is a leaf switch. Construct list based on TreeWidth */ FREE_NULL_BITMAP(nodes_bitmap); xfree(*sp_hl); return route_split_hostlist_treewidth( hl, sp_hl, count, tree_width); } /* loop through children, construction a hostlist for each child switch * with nodes in the message list */ hl_ndx = 0; lst_count = 0; for (i=0; i < switch_record_table[j].num_switches; i++) { k = switch_record_table[j].switch_index[i]; fwd_bitmap = bit_copy(switch_record_table[k].node_bitmap); bit_and(fwd_bitmap, nodes_bitmap); sw_count = bit_set_count(fwd_bitmap); if (sw_count == 0) { continue; /* no nodes on this switch in message list */ } (*sp_hl)[hl_ndx] = bitmap2hostlist(fwd_bitmap); /* Now remove nodes from this switch from message list */ bit_and_not(nodes_bitmap, fwd_bitmap); FREE_NULL_BITMAP(fwd_bitmap); if (debug_flags & DEBUG_FLAG_ROUTE) { buf = hostlist_ranged_string_xmalloc((*sp_hl)[hl_ndx]); debug("ROUTE: ... sublist[%d] switch=%s :: %s", i, switch_record_table[i].name, buf); xfree(buf); } hl_ndx++; lst_count += sw_count; if (lst_count == msg_count) break; /* all nodes in message are in a child list */ } FREE_NULL_BITMAP(nodes_bitmap); *count = hl_ndx; return SLURM_SUCCESS; }