/*
 * _build_select_struct - build a job_resources struct describing the
 *	resources allocated to a job on a BlueGene system.
 * IN job_ptr  - job being allocated; any pre-existing job_ptr->job_resrcs
 *	is logged as an error, released, and rebuilt
 * IN bitmap   - bitmap of nodes allocated to the job
 * IN node_cnt - node count used to size the per-node cpus arrays
 * NOTE: nhosts is derived from bit_set_count(bitmap), which is assumed to
 *	match node_cnt (the commented-out assignment below preserved the
 *	older behavior) -- TODO confirm callers always pass them consistent.
 */
static void _build_select_struct(struct job_record *job_ptr, bitstr_t *bitmap,
				 uint32_t node_cnt)
{
	uint32_t i;	/* unsigned to match node_cnt (was signed int) */
	uint32_t total_cpus = 0;
	job_resources_t *job_resrcs_ptr;

	xassert(job_ptr);

	if (job_ptr->job_resrcs) {
		error("select_p_job_test: already have select_job");
		free_job_resources(&job_ptr->job_resrcs);
	}

	job_ptr->job_resrcs = job_resrcs_ptr = create_job_resources();
	job_resrcs_ptr->cpu_array_reps  = xmalloc(sizeof(uint32_t));
	job_resrcs_ptr->cpu_array_value = xmalloc(sizeof(uint16_t));
	job_resrcs_ptr->cpus      = xmalloc(sizeof(uint16_t) * node_cnt);
	job_resrcs_ptr->cpus_used = xmalloc(sizeof(uint16_t) * node_cnt);
	/* job_resrcs_ptr->nhosts = node_cnt; */
	job_resrcs_ptr->nhosts = bit_set_count(bitmap);
	job_resrcs_ptr->ncpus = job_ptr->details->min_cpus;
	job_resrcs_ptr->node_bitmap = bit_copy(bitmap);
	/* BUG FIX: validate the copy immediately, before any further use
	 * (the check previously came after bitmap2node_name()) */
	if (job_resrcs_ptr->node_bitmap == NULL)
		fatal("bit_copy malloc failure");
	job_resrcs_ptr->nodes = bitmap2node_name(bitmap);

	/* Single cpu_array entry: every node gets the same CPU count,
	 * capped at the per-midplane CPU count */
	job_resrcs_ptr->cpu_array_cnt = 1;
	if (job_ptr->details->min_cpus < bg_conf->cpus_per_mp)
		job_resrcs_ptr->cpu_array_value[0] =
			job_ptr->details->min_cpus;
	else
		job_resrcs_ptr->cpu_array_value[0] = bg_conf->cpus_per_mp;
	job_resrcs_ptr->cpu_array_reps[0] = node_cnt;
	total_cpus = bg_conf->cpu_ratio * node_cnt;

	for (i = 0; i < node_cnt; i++)
		job_resrcs_ptr->cpus[i] = bg_conf->cpu_ratio;

	/* Sanity check: requested CPU count should match the allocation */
	if (job_resrcs_ptr->ncpus != total_cpus) {
		error("select_p_job_test: ncpus mismatch %u != %u",
		      job_resrcs_ptr->ncpus, total_cpus);
	}
}
/*
 * unpack_job_resources - unpack a job_resources struct from a buffer.
 * OUT job_resrcs_pptr - set to the newly allocated structure, or NULL if
 *	the packed value was NO_VAL (no resources recorded for the job);
 *	on success the caller owns it and must release it with
 *	free_job_resources()
 * IN/OUT buffer - source buffer; offset is advanced past the record
 * IN protocol_version - communication protocol version of the packer;
 *	field layout must exactly mirror the matching pack function
 * RET SLURM_SUCCESS, or SLURM_ERROR on buffer underflow, unsupported
 *	protocol version, or inconsistent array lengths
 */
extern int unpack_job_resources(job_resources_t **job_resrcs_pptr,
				Buf buffer, uint16_t protocol_version)
{
	char *bit_fmt = NULL;
	uint32_t empty, tmp32;
	/* BUG FIX: initialize to NULL. The safe_unpack* macros jump to
	 * unpack_error on failure; if the very first unpack fails,
	 * free_job_resources() would otherwise be handed an uninitialized
	 * pointer (undefined behavior). */
	job_resources_t *job_resrcs = NULL;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	xassert(job_resrcs_pptr);
	if (protocol_version >= SLURM_2_5_PROTOCOL_VERSION) {
		safe_unpack32(&empty, buffer);
		if (empty == NO_VAL) {
			/* No job_resources packed for this job */
			*job_resrcs_pptr = NULL;
			return SLURM_SUCCESS;
		}

		job_resrcs = xmalloc(sizeof(struct job_resources));
		job_resrcs->nhosts = empty;
		safe_unpack32(&job_resrcs->ncpus, buffer);
		safe_unpack32(&job_resrcs->node_req, buffer);
		safe_unpackstr_xmalloc(&job_resrcs->nodes, &tmp32, buffer);

		safe_unpack32_array(&job_resrcs->cpu_array_reps,
				    &tmp32, buffer);
		if (tmp32 == 0)
			xfree(job_resrcs->cpu_array_reps);
		job_resrcs->cpu_array_cnt = tmp32;

		safe_unpack16_array(&job_resrcs->cpu_array_value,
				    &tmp32, buffer);
		if (tmp32 == 0)
			xfree(job_resrcs->cpu_array_value);
		/* reps and value arrays must be the same length */
		if (tmp32 != job_resrcs->cpu_array_cnt)
			goto unpack_error;

		safe_unpack16_array(&job_resrcs->cpus, &tmp32, buffer);
		if (tmp32 == 0)
			xfree(job_resrcs->cpus);
		/* one cpus entry per allocated host */
		if (tmp32 != job_resrcs->nhosts)
			goto unpack_error;
		safe_unpack16_array(&job_resrcs->cpus_used, &tmp32, buffer);
		if (tmp32 == 0)
			xfree(job_resrcs->cpus_used);

		safe_unpack32_array(&job_resrcs->memory_allocated,
				    &tmp32, buffer);
		if (tmp32 == 0)
			xfree(job_resrcs->memory_allocated);
		safe_unpack32_array(&job_resrcs->memory_used, &tmp32, buffer);
		if (tmp32 == 0)
			xfree(job_resrcs->memory_used);

		if (!(cluster_flags & CLUSTER_FLAG_BG)) {
			/* Socket/core layout is not packed on BlueGene */
			safe_unpack16_array(&job_resrcs->sockets_per_node,
					    &tmp32, buffer);
			if (tmp32 == 0)
				xfree(job_resrcs->sockets_per_node);
			safe_unpack16_array(&job_resrcs->cores_per_socket,
					    &tmp32, buffer);
			if (tmp32 == 0)
				xfree(job_resrcs->cores_per_socket);
			safe_unpack32_array(&job_resrcs->sock_core_rep_count,
					    &tmp32, buffer);
			if (tmp32 == 0)
				xfree(job_resrcs->sock_core_rep_count);

			unpack_bit_str(&job_resrcs->core_bitmap, buffer);
			unpack_bit_str(&job_resrcs->core_bitmap_used,
				       buffer);
		}
	} else if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
		uint8_t tmp_8;
		safe_unpack32(&empty, buffer);
		if (empty == NO_VAL) {
			/* No job_resources packed for this job */
			*job_resrcs_pptr = NULL;
			return SLURM_SUCCESS;
		}

		job_resrcs = xmalloc(sizeof(struct job_resources));
		job_resrcs->nhosts = empty;
		safe_unpack32(&job_resrcs->ncpus, buffer);
		safe_unpack8(&tmp_8, buffer);
		if (tmp_8 < 100)	/* Not NODE_CR_RESERVED */
			job_resrcs->node_req = tmp_8;	/* 32-bit in v2.5 */
		else
			job_resrcs->node_req = NODE_CR_RESERVED;
		safe_unpackstr_xmalloc(&job_resrcs->nodes, &tmp32, buffer);

		safe_unpack32_array(&job_resrcs->cpu_array_reps,
				    &tmp32, buffer);
		if (tmp32 == 0)
			xfree(job_resrcs->cpu_array_reps);
		job_resrcs->cpu_array_cnt = tmp32;

		safe_unpack16_array(&job_resrcs->cpu_array_value,
				    &tmp32, buffer);
		if (tmp32 == 0)
			xfree(job_resrcs->cpu_array_value);
		/* reps and value arrays must be the same length */
		if (tmp32 != job_resrcs->cpu_array_cnt)
			goto unpack_error;

		safe_unpack16_array(&job_resrcs->cpus, &tmp32, buffer);
		if (tmp32 == 0)
			xfree(job_resrcs->cpus);
		/* one cpus entry per allocated host */
		if (tmp32 != job_resrcs->nhosts)
			goto unpack_error;
		safe_unpack16_array(&job_resrcs->cpus_used, &tmp32, buffer);
		if (tmp32 == 0)
			xfree(job_resrcs->cpus_used);

		safe_unpack32_array(&job_resrcs->memory_allocated,
				    &tmp32, buffer);
		if (tmp32 == 0)
			xfree(job_resrcs->memory_allocated);
		safe_unpack32_array(&job_resrcs->memory_used, &tmp32, buffer);
		if (tmp32 == 0)
			xfree(job_resrcs->memory_used);

		if (!(cluster_flags & CLUSTER_FLAG_BG)) {
			/* Socket/core layout is not packed on BlueGene */
			safe_unpack16_array(&job_resrcs->sockets_per_node,
					    &tmp32, buffer);
			if (tmp32 == 0)
				xfree(job_resrcs->sockets_per_node);
			safe_unpack16_array(&job_resrcs->cores_per_socket,
					    &tmp32, buffer);
			if (tmp32 == 0)
				xfree(job_resrcs->cores_per_socket);
			safe_unpack32_array(&job_resrcs->sock_core_rep_count,
					    &tmp32, buffer);
			if (tmp32 == 0)
				xfree(job_resrcs->sock_core_rep_count);

			unpack_bit_str(&job_resrcs->core_bitmap, buffer);
			unpack_bit_str(&job_resrcs->core_bitmap_used,
				       buffer);
		}
	} else {
		error("unpack_job_resources: protocol_version %hu not "
		      "supported", protocol_version);
		goto unpack_error;
	}
	*job_resrcs_pptr = job_resrcs;
	return SLURM_SUCCESS;

unpack_error:
	error("unpack_job_resources: unpack error");
	free_job_resources(&job_resrcs);	/* safe: NULL or allocated */
	xfree(bit_fmt);
	*job_resrcs_pptr = NULL;
	return SLURM_ERROR;
}
/* cr_job_test - does most of the real work for select_p_job_test(), which
 *	includes contiguous selection, load-leveling and max_share logic
 *
 * PROCEDURE:
 *
 * Step 1: compare nodes in "avail" bitmap with current node state data
 *         to find available nodes that match the job request
 *
 * Step 2: check resources in "avail" bitmap with allocated resources from
 *         higher priority partitions (busy resources are UNavailable)
 *
 * Step 3: select resource usage on remaining resources in "avail" bitmap
 *         for this job, with the placement influenced by existing
 *         allocations
 */
extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, int mode,
		       uint16_t cr_type, enum node_cr_state job_node_req,
		       uint32_t cr_node_cnt,
		       struct part_res_record *cr_part_ptr,
		       struct node_use_record *node_usage)
{
	static int gang_mode = -1;
	int error_code = SLURM_SUCCESS;
	int first_node;		/* signed result of bit_ffs(); may be -1 */
	bitstr_t *orig_map, *avail_cores, *free_cores;
	bitstr_t *tmpcore = NULL;
	bool test_only;
	uint32_t c, i, j, k, n, csize, save_mem = 0;
	job_resources_t *job_res;
	struct job_details *details_ptr;
	struct part_res_record *p_ptr, *jp_ptr;
	uint16_t *cpu_count;

	/* Determine once whether gang scheduling (time-slicing) is active */
	if (gang_mode == -1) {
		if (slurm_get_preempt_mode() & PREEMPT_MODE_GANG)
			gang_mode = 1;
		else
			gang_mode = 0;
	}

	details_ptr = job_ptr->details;

	free_job_resources(&job_ptr->job_resrcs);

	if (mode == SELECT_MODE_TEST_ONLY)
		test_only = true;
	else	/* SELECT_MODE_RUN_NOW || SELECT_MODE_WILL_RUN  */
		test_only = false;

	/* check node_state and update the node bitmap as necessary */
	if (!test_only) {
		error_code = _verify_node_state(cr_part_ptr, job_ptr,
						bitmap, cr_type, node_usage,
						job_node_req);
		if (error_code != SLURM_SUCCESS)
			return error_code;
	}

	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: evaluating job %u on %u nodes",
		     job_ptr->job_id, bit_set_count(bitmap));
	}

	orig_map = bit_copy(bitmap);
	avail_cores = _make_core_bitmap(bitmap);

	/* test to make sure that this job can succeed with all avail_cores
	 * if 'no' then return FAIL
	 * if 'yes' then we will seek the optimal placement for this job
	 *          within avail_cores
	 */
	free_cores = bit_copy(avail_cores);
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count == NULL) {
		/* job cannot fit */
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 0 fail: "
			     "insufficient resources");
		}
		return SLURM_ERROR;
	} else if (test_only) {
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		xfree(cpu_count);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("select/serial: cr_job_test: test 0 pass: "
			     "test_only");
		return SLURM_SUCCESS;
	}
	if (cr_type == CR_MEMORY) {
		/* CR_MEMORY does not care about existing CPU allocations,
		 * so we can jump right to job allocation from here */
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 0 pass - "
		     "job fits on given resources");
	}

	/* now that we know that this job can run with the given resources,
	 * let's factor in the existing allocations and seek the optimal set
	 * of resources for this job. Here is the procedure:
	 *
	 * Step 1: Seek idle CPUs across all partitions. If successful then
	 *         place job and exit. If not successful, then continue. Two
	 *         related items to note:
	 *          1. Jobs that don't share CPUs finish with step 1.
	 *          2. The remaining steps assume sharing or preemption.
	 *
	 * Step 2: Remove resources that are in use by higher-priority
	 *         partitions, and test that job can still succeed. If not
	 *         then exit.
	 *
	 * Step 3: Seek idle nodes among the partitions with the same
	 *         priority as the job's partition. If successful then
	 *         goto Step 6. If not then continue:
	 *
	 * Step 4: Seek placement within the job's partition. Search
	 *         row-by-row. If no placement if found, then exit. If a row
	 *         is found, then continue:
	 *
	 * Step 5: Place job and exit. FIXME! Here is where we need a
	 *         placement algorithm that recognizes existing job
	 *         boundaries and tries to "overlap jobs" as efficiently
	 *         as possible.
	 *
	 * Step 6: Place job and exit. FIXME! here is we use a placement
	 *         algorithm similar to Step 5 on jobs from lower-priority
	 *         partitions.
	 */

	/*** Step 1 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	/* remove all existing allocations from free_cores */
	tmpcore = bit_copy(free_cores);
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count) {
		/* job fits! We're done. */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 1 pass - "
			     "idle resources found");
		}
		goto alloc_job;
	}

	if ((gang_mode == 0) && (job_node_req == NODE_CR_ONE_ROW)) {
		/* This job CANNOT share CPUs regardless of priority,
		 * so we fail here. Note that Shared=EXCLUSIVE was already
		 * addressed in _verify_node_state() and job preemption
		 * removes jobs from simulated resource allocation map
		 * before this point. */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 1 fail - "
			     "no idle resources available");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 1 fail - "
		     "not enough idle resources");
	}

	/*** Step 2 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);
	/* locate the partition record for this job's partition */
	for (jp_ptr = cr_part_ptr; jp_ptr; jp_ptr = jp_ptr->next) {
		if (jp_ptr->part_ptr == job_ptr->part_ptr)
			break;
	}
	if (!jp_ptr) {
		fatal("select/serial: could not find partition for job %u",
		      job_ptr->job_id);
		return SLURM_ERROR;	/* Fix CLANG false positive */
	}

	/* remove existing allocations (jobs) from higher-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if ((p_ptr->part_ptr->priority <=
		     jp_ptr->part_ptr->priority) &&
		    (p_ptr->part_ptr->preempt_mode != PREEMPT_MODE_OFF))
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	/* make these changes permanent */
	bit_copybits(avail_cores, free_cores);
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (!cpu_count) {
		/* job needs resources that are currently in use by
		 * higher-priority jobs, so fail for now */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 2 fail - "
			     "resources busy with higher priority jobs");
		}
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 2 pass - "
		     "available resources for this priority");
	}

	/*** Step 3 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	/* remove existing allocations (jobs) from same-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (p_ptr->part_ptr->priority != jp_ptr->part_ptr->priority)
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count) {
		/* jobs from low-priority partitions are the only thing left
		 * in our way. for now we'll ignore them, but FIXME: we need
		 * a good placement algorithm here that optimizes "job
		 * overlap" between this job (in these idle nodes) and the
		 * low-priority jobs */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 3 pass - "
			     "found resources");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 3 fail - "
		     "not enough idle resources in same priority");
	}

	/*** Step 4 ***/
	/* try to fit the job into an existing row
	 *
	 * tmpcore = worker core_bitmap
	 * free_cores = core_bitmap to be built
	 * avail_cores = static core_bitmap of all available cores
	 */
	if (!jp_ptr || !jp_ptr->row) {
		/* there's no existing jobs in this partition, so place
		 * the job in avail_cores. FIXME: still need a good
		 * placement algorithm here that optimizes "job overlap"
		 * between this job (in these idle nodes) and existing
		 * jobs in the other partitions with <= priority to
		 * this partition */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 4 pass - "
			     "first row found");
		}
		goto alloc_job;
	}

	cr_sort_part_rows(jp_ptr);
	c = jp_ptr->num_rows;
	if (job_node_req != NODE_CR_AVAILABLE)
		c = 1;	/* non-sharing jobs are only tried in row 0 */
	for (i = 0; i < c; i++) {
		if (!jp_ptr->row[i].row_bitmap)
			break;
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		bit_copybits(tmpcore, jp_ptr->row[i].row_bitmap);
		bit_not(tmpcore);
		bit_and(free_cores, tmpcore);
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (cpu_count) {
			if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
				info("select/serial: cr_job_test: "
				     "test 4 pass - row %i", i);
			}
			break;
		}
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: "
			     "test 4 fail - row %i", i);
		}
	}

	if ((i < c) && !jp_ptr->row[i].row_bitmap) {
		/* we've found an empty row, so use it */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: "
			     "test 4 trying empty row %i",i);
		}
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
	}

	if (!cpu_count) {
		/* job can't fit into any row, so exit */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 4 fail - "
			     "busy partition");
		}
		goto alloc_job;
	}

	/*** CONSTRUCTION ZONE FOR STEPs 5 AND 6 ***
	 * Note that while the job may have fit into a row, it should
	 * still be run through a good placement algorithm here that
	 * optimizes "job overlap" between this job (in these idle nodes)
	 * and existing jobs in the other partitions with <= priority to
	 * this partition */

alloc_job:
	/* at this point we've found a good set of
	 * bits to allocate to this job:
	 * - bitmap is the set of nodes to allocate
	 * - free_cores is the set of allocated cores
	 * - cpu_count is the number of cpus per allocated node
	 *
	 * Next steps are to cleanup the worker variables,
	 * create the job_resources struct,
	 * distribute the job on the bits, and exit
	 */
	FREE_NULL_BITMAP(orig_map);
	FREE_NULL_BITMAP(avail_cores);
	FREE_NULL_BITMAP(tmpcore);
	if (!cpu_count) {
		/* we were sent here to cleanup and exit */
		FREE_NULL_BITMAP(free_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: exiting cr_job_test with no "
			     "allocation");
		}
		return SLURM_ERROR;
	}

	/* At this point we have:
	 * - a bitmap of selected nodes
	 * - a free_cores bitmap of usable cores on each selected node
	 * - a per-alloc-node cpu_count array
	 */
	if ((mode != SELECT_MODE_WILL_RUN) && (job_ptr->part_ptr == NULL))
		error_code = EINVAL;
	if ((error_code == SLURM_SUCCESS) && (mode == SELECT_MODE_WILL_RUN))
		job_ptr->total_cpus = 1;
	if ((error_code != SLURM_SUCCESS) || (mode != SELECT_MODE_RUN_NOW)) {
		FREE_NULL_BITMAP(free_cores);
		xfree(cpu_count);
		return error_code;
	}

	/* BUG FIX: bit_ffs() returns a signed value (-1 when the bitmap is
	 * empty), but "n" is uint32_t, so the original (n < 0) test was
	 * always false and -1 could flow into cr_get_coremap_offset().
	 * Use a signed temporary for the check. */
	first_node = bit_ffs(bitmap);
	if (first_node < 0) {
		FREE_NULL_BITMAP(free_cores);
		xfree(cpu_count);
		return error_code;
	}
	n = (uint32_t) first_node;
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: distributing job %u",
		     job_ptr->job_id);
	}

	/** create the struct_job_res  **/
	job_res                   = create_job_resources();
	job_res->node_bitmap      = bit_copy(bitmap);
	job_res->nodes            = bitmap2node_name(bitmap);
	job_res->nhosts           = bit_set_count(bitmap);
	job_res->ncpus            = job_res->nhosts;
	if (details_ptr->ntasks_per_node)
		job_res->ncpus   *= details_ptr->ntasks_per_node;
	job_res->ncpus            = MAX(job_res->ncpus,
					details_ptr->min_cpus);
	job_res->ncpus            = MAX(job_res->ncpus,
					details_ptr->pn_min_cpus);
	job_res->node_req         = job_node_req;
	job_res->cpus             = cpu_count;	/* ownership transferred */
	job_res->cpus_used        = xmalloc(job_res->nhosts *
					    sizeof(uint16_t));
	job_res->memory_allocated = xmalloc(job_res->nhosts *
					    sizeof(uint32_t));
	job_res->memory_used      = xmalloc(job_res->nhosts *
					    sizeof(uint32_t));

	/* store the hardware data for the selected nodes */
	error_code = build_job_resources(job_res, node_record_table_ptr,
					 select_fast_schedule);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_res);
		FREE_NULL_BITMAP(free_cores);
		return error_code;
	}

	/* set the single allocated core in job_res->core_bitmap
	 * (select/serial: one core on one node) */
	c = 0;
	csize = bit_size(job_res->core_bitmap);
	j = cr_get_coremap_offset(n);
	k = cr_get_coremap_offset(n + 1);
	for (; j < k; j++, c++) {
		if (!bit_test(free_cores, j))
			continue;
		if (c >= csize) {
			error("select/serial: cr_job_test "
			      "core_bitmap index error on node %s",
			      select_node_record[n].node_ptr->name);
			drain_nodes(select_node_record[n].node_ptr->name,
				    "Bad core count", getuid());
			free_job_resources(&job_res);
			FREE_NULL_BITMAP(free_cores);
			return SLURM_ERROR;
		}
		bit_set(job_res->core_bitmap, c);
		break;
	}

	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: job %u ncpus %u cbits %u/%d "
		     "nbits %u", job_ptr->job_id, job_res->ncpus,
		     bit_set_count(free_cores), 1, job_res->nhosts);
	}
	FREE_NULL_BITMAP(free_cores);

	/* distribute the tasks and clear any unused cores */
	job_ptr->job_resrcs = job_res;
	error_code = cr_dist(job_ptr, cr_type);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_ptr->job_resrcs);
		return error_code;
	}

	/* translate job_res->cpus array into format with rep count */
	job_ptr->total_cpus = build_job_resources_cpu_array(job_res);

	if (!(cr_type & CR_MEMORY))
		return error_code;

	/* load memory allocated array */
	save_mem = details_ptr->pn_min_memory;
	if (save_mem & MEM_PER_CPU) {
		/* memory is per-cpu */
		save_mem &= (~MEM_PER_CPU);
		job_res->memory_allocated[0] = job_res->cpus[0] * save_mem;
	} else {
		/* memory is per-node */
		job_res->memory_allocated[0] = save_mem;
	}
	return error_code;
}