static int _attempt_backfill(void) { DEF_TIMERS; bool filter_root = false; List job_queue; job_queue_rec_t *job_queue_rec; slurmdb_qos_rec_t *qos_ptr = NULL; int i, j, node_space_recs; struct job_record *job_ptr; struct part_record *part_ptr; uint32_t end_time, end_reserve; uint32_t time_limit, comp_time_limit, orig_time_limit; uint32_t min_nodes, max_nodes, req_nodes; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; bitstr_t *exc_core_bitmap = NULL; time_t now, sched_start, later_start, start_res, resv_end; node_space_map_t *node_space; struct timeval bf_time1, bf_time2; int sched_timeout = 2, yield_sleep = 1; int rc = 0; int job_test_count = 0; uint32_t *uid = NULL, nuser = 0; uint16_t *njobs = NULL; bool already_counted; uint32_t reject_array_job_id = 0; #ifdef HAVE_CRAY /* * Run a Basil Inventory immediately before setting up the schedule * plan, to avoid race conditions caused by ALPS node state change. * Needs to be done with the node-state lock taken. */ START_TIMER; if (select_g_reconfigure()) { debug4("backfill: not scheduling due to ALPS"); return SLURM_SUCCESS; } END_TIMER; if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: ALPS inventory completed, %s", TIME_STR); /* The Basil inventory can take a long time to complete. Process * pending RPCs before starting the backfill scheduling logic */ _yield_locks(1); #endif START_TIMER; if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: beginning"); sched_start = now = time(NULL); if (slurm_get_root_filter()) filter_root = true; job_queue = build_job_queue(true); if (list_count(job_queue) == 0) { debug("backfill: no jobs to backfill"); list_destroy(job_queue); return 0; } gettimeofday(&bf_time1, NULL); slurmctld_diag_stats.bf_queue_len = list_count(job_queue); slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats. bf_queue_len; slurmctld_diag_stats.bf_last_depth = 0; slurmctld_diag_stats.bf_last_depth_try = 0; slurmctld_diag_stats.bf_when_last_cycle = now; bf_last_yields = 0; slurmctld_diag_stats.bf_active = 1; node_space = xmalloc(sizeof(node_space_map_t) * (max_backfill_job_cnt + 3)); node_space[0].begin_time = sched_start; node_space[0].end_time = sched_start + backfill_window; node_space[0].avail_bitmap = bit_copy(avail_node_bitmap); node_space[0].next = 0; node_space_recs = 1; if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_node_space_table(node_space); if (max_backfill_job_per_user) { uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t)); njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t)); } while ((job_queue_rec = (job_queue_rec_t *) list_pop_bottom(job_queue, sort_job_queue2))) { job_ptr = job_queue_rec->job_ptr; orig_time_limit = job_ptr->time_limit; if ((time(NULL) - sched_start) >= sched_timeout) { uint32_t save_time_limit = job_ptr->time_limit; job_ptr->time_limit = orig_time_limit; if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed yielding locks " "after testing %d jobs, %s", job_test_count, TIME_STR); } if (_yield_locks(yield_sleep) && !backfill_continue) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: system state changed, " "breaking out after testing %d " "jobs", job_test_count); } rc = 1; break; } job_ptr->time_limit = save_time_limit; /* Reset backfill scheduling timers, resume testing */ sched_start = time(NULL); job_test_count = 0; START_TIMER; } part_ptr = job_queue_rec->part_ptr; job_test_count++; xfree(job_queue_rec); if (!IS_JOB_PENDING(job_ptr)) continue; /* started in other partition */ if (!avail_front_end(job_ptr)) continue; /* No available frontend for this job */ if (job_ptr->array_task_id != (uint16_t) NO_VAL) { if (reject_array_job_id == job_ptr->array_job_id) continue; /* already rejected array element */ /* assume reject whole array for now, clear if OK */ reject_array_job_id = job_ptr->array_job_id; } job_ptr->part_ptr = part_ptr; if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill test for job %u", job_ptr->job_id); slurmctld_diag_stats.bf_last_depth++; already_counted = false; if (max_backfill_job_per_user) { for (j = 0; j < nuser; j++) { if (job_ptr->user_id == uid[j]) { njobs[j]++; if (debug_flags & DEBUG_FLAG_BACKFILL) debug("backfill: user %u: " "#jobs %u", uid[j], njobs[j]); break; } } if (j == nuser) { /* user not found */ if (nuser < BF_MAX_USERS) { uid[j] = job_ptr->user_id; njobs[j] = 1; nuser++; } else { error("backfill: too many users in " "queue. Consider increasing " "BF_MAX_USERS"); } if (debug_flags & DEBUG_FLAG_BACKFILL) debug2("backfill: found new user %u. " "Total #users now %u", job_ptr->user_id, nuser); } else { if (njobs[j] > max_backfill_job_per_user) { /* skip job */ if (debug_flags & DEBUG_FLAG_BACKFILL) debug("backfill: have already " "checked %u jobs for " "user %u; skipping " "job %u", max_backfill_job_per_user, job_ptr->user_id, job_ptr->job_id); continue; } } } if (((part_ptr->state_up & PARTITION_SCHED) == 0) || (part_ptr->node_bitmap == NULL)) continue; if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root) continue; if ((!job_independent(job_ptr, 0)) || (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) continue; /* Determine minimum and maximum node counts */ min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes); if (job_ptr->details->max_nodes == 0) max_nodes = part_ptr->max_nodes; else max_nodes = MIN(job_ptr->details->max_nodes, part_ptr->max_nodes); max_nodes = MIN(max_nodes, 500000); /* prevent overflows */ if (job_ptr->details->max_nodes) req_nodes = max_nodes; else req_nodes = min_nodes; if (min_nodes > max_nodes) { /* job's min_nodes exceeds partition's max_nodes */ continue; } /* Determine job's expected completion time */ if (job_ptr->time_limit == NO_VAL) { if (part_ptr->max_time == INFINITE) time_limit = 365 * 24 * 60; /* one year */ else time_limit = part_ptr->max_time; } else { if (part_ptr->max_time == INFINITE) time_limit = job_ptr->time_limit; else time_limit = MIN(job_ptr->time_limit, part_ptr->max_time); } comp_time_limit = time_limit; qos_ptr = job_ptr->qos_ptr; if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) && slurm_get_preempt_mode()) time_limit = job_ptr->time_limit = 1; else if (job_ptr->time_min && (job_ptr->time_min < time_limit)) time_limit = job_ptr->time_limit = job_ptr->time_min; /* Determine impact of any resource reservations */ later_start = now; TRY_LATER: if ((time(NULL) - sched_start) >= sched_timeout) { uint32_t save_time_limit = job_ptr->time_limit; job_ptr->time_limit = orig_time_limit; if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed yielding locks 2" "after testing %d jobs, %s", job_test_count, TIME_STR); } if (_yield_locks(yield_sleep) && !backfill_continue) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: system state changed, " "breaking out after testing %d " "jobs", job_test_count); } rc = 1; break; } job_ptr->time_limit = save_time_limit; /* Reset backfill scheduling timers, resume testing */ sched_start = time(NULL); job_test_count = 1; START_TIMER; } FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); start_res = later_start; later_start = 0; j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap, &exc_core_bitmap); if (j != SLURM_SUCCESS) { job_ptr->time_limit = orig_time_limit; continue; } if (start_res > now) end_time = (time_limit * 60) + start_res; else end_time = (time_limit * 60) + now; resv_end = find_resv_end(start_res); /* Identify usable nodes for this job */ bit_and(avail_bitmap, part_ptr->node_bitmap); bit_and(avail_bitmap, up_node_bitmap); for (j=0; ; ) { if ((node_space[j].end_time > start_res) && node_space[j].next && (later_start == 0)) later_start = node_space[j].end_time; if (node_space[j].end_time <= start_res) ; else if (node_space[j].begin_time <= end_time) { bit_and(avail_bitmap, node_space[j].avail_bitmap); } else break; if ((j = node_space[j].next) == 0) break; } if ((resv_end++) && ((later_start == 0) || (resv_end < later_start))) { later_start = resv_end; } if (job_ptr->details->exc_node_bitmap) { bit_not(job_ptr->details->exc_node_bitmap); bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap); bit_not(job_ptr->details->exc_node_bitmap); } /* Test if insufficient nodes remain OR * required nodes missing OR * nodes lack features */ if ((bit_set_count(avail_bitmap) < min_nodes) || ((job_ptr->details->req_node_bitmap) && (!bit_super_set(job_ptr->details->req_node_bitmap, avail_bitmap))) || (job_req_node_filter(job_ptr, avail_bitmap))) { if (later_start) { job_ptr->start_time = 0; goto TRY_LATER; } /* Job can not start until too far in the future */ job_ptr->time_limit = orig_time_limit; job_ptr->start_time = sched_start + backfill_window; continue; } /* Identify nodes which are definitely off limits */ FREE_NULL_BITMAP(resv_bitmap); resv_bitmap = bit_copy(avail_bitmap); bit_not(resv_bitmap); /* this is the time consuming operation */ debug2("backfill: entering _try_sched for job %u.", job_ptr->job_id); if (!already_counted) { slurmctld_diag_stats.bf_last_depth_try++; already_counted = true; } j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes, req_nodes, exc_core_bitmap); now = time(NULL); if (j != SLURM_SUCCESS) { job_ptr->time_limit = orig_time_limit; job_ptr->start_time = 0; continue; /* not runable */ } if (start_res > job_ptr->start_time) { job_ptr->start_time = start_res; last_job_update = now; } if (job_ptr->start_time <= now) { uint32_t save_time_limit = job_ptr->time_limit; int rc = _start_job(job_ptr, resv_bitmap); if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) { if (orig_time_limit == NO_VAL) job_ptr->time_limit = comp_time_limit; else job_ptr->time_limit = orig_time_limit; job_ptr->end_time = job_ptr->start_time + (job_ptr->time_limit * 60); } else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) { /* Set time limit as high as possible */ job_ptr->time_limit = comp_time_limit; job_ptr->end_time = job_ptr->start_time + (comp_time_limit * 60); _reset_job_time_limit(job_ptr, now, node_space); time_limit = job_ptr->time_limit; } else { job_ptr->time_limit = orig_time_limit; } if (rc == ESLURM_ACCOUNTING_POLICY) { /* Unknown future start time, just skip job */ job_ptr->start_time = 0; continue; } else if (rc != SLURM_SUCCESS) { /* Planned to start job, but something bad * happended. */ job_ptr->start_time = 0; break; } else { /* Started this job, move to next one */ reject_array_job_id = 0; /* Update the database if job time limit * changed and move to next job */ if (save_time_limit != job_ptr->time_limit) jobacct_storage_g_job_start(acct_db_conn, job_ptr); continue; } } else job_ptr->time_limit = orig_time_limit; if (later_start && (job_ptr->start_time > later_start)) { /* Try later when some nodes currently reserved for * pending jobs are free */ job_ptr->start_time = 0; goto TRY_LATER; } if (job_ptr->start_time > (sched_start + backfill_window)) { /* Starts too far in the future to worry about */ continue; } if (node_space_recs >= max_backfill_job_cnt) { /* Already have too many jobs to deal with */ break; } end_reserve = job_ptr->start_time + (time_limit * 60); if (_test_resv_overlap(node_space, avail_bitmap, job_ptr->start_time, end_reserve)) { /* This job overlaps with an existing reservation for * job to be backfill scheduled, which the sched * plugin does not know about. Try again later. */ later_start = job_ptr->start_time; job_ptr->start_time = 0; goto TRY_LATER; } /* * Add reservation to scheduling table if appropriate */ if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) continue; reject_array_job_id = 0; bit_not(avail_bitmap); _add_reservation(job_ptr->start_time, end_reserve, avail_bitmap, node_space, &node_space_recs); if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_node_space_table(node_space); } xfree(uid); xfree(njobs); FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); FREE_NULL_BITMAP(resv_bitmap); for (i=0; ; ) { FREE_NULL_BITMAP(node_space[i].avail_bitmap); if ((i = node_space[i].next) == 0) break; } xfree(node_space); list_destroy(job_queue); gettimeofday(&bf_time2, NULL); _do_diag_stats(&bf_time1, &bf_time2, yield_sleep); if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed testing %d jobs, %s", job_test_count, TIME_STR); } return rc; }
/* Attempt to schedule a specific job on specific available nodes * IN job_ptr - job to schedule * IN/OUT avail_bitmap - nodes available/selected to use * IN exc_core_bitmap - cores which can not be used * RET SLURM_SUCCESS on success, otherwise an error code */ static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap, uint32_t min_nodes, uint32_t max_nodes, uint32_t req_nodes, bitstr_t *exc_core_bitmap) { bitstr_t *tmp_bitmap; int rc = SLURM_SUCCESS; int feat_cnt = _num_feature_count(job_ptr); List preemptee_candidates = NULL; if (feat_cnt) { /* Ideally schedule the job feature by feature, * but I don't want to add that complexity here * right now, so clear the feature counts and try * to schedule. This will work if there is only * one feature count. It should work fairly well * in cases where there are multiple feature * counts. */ struct job_details *detail_ptr = job_ptr->details; ListIterator feat_iter; struct feature_record *feat_ptr; int i = 0, list_size; uint16_t *feat_cnt_orig = NULL, high_cnt = 0; /* Clear the feature counts */ list_size = list_count(detail_ptr->feature_list); feat_cnt_orig = xmalloc(sizeof(uint16_t) * list_size); feat_iter = list_iterator_create(detail_ptr->feature_list); while ((feat_ptr = (struct feature_record *) list_next(feat_iter))) { high_cnt = MAX(high_cnt, feat_ptr->count); feat_cnt_orig[i++] = feat_ptr->count; feat_ptr->count = 0; } list_iterator_destroy(feat_iter); if ((job_req_node_filter(job_ptr, *avail_bitmap) != SLURM_SUCCESS) || (bit_set_count(*avail_bitmap) < high_cnt)) { rc = ESLURM_NODES_BUSY; } else { preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); rc = select_g_job_test(job_ptr, *avail_bitmap, high_cnt, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, preemptee_candidates, NULL, exc_core_bitmap); } /* Restore the feature counts */ i = 0; feat_iter = list_iterator_create(detail_ptr->feature_list); while ((feat_ptr = (struct feature_record *) list_next(feat_iter))) { feat_ptr->count = feat_cnt_orig[i++]; } list_iterator_destroy(feat_iter); xfree(feat_cnt_orig); } else { /* Try to schedule the job. First on dedicated nodes * then on shared nodes (if so configured). */ uint16_t orig_shared; time_t now = time(NULL); char str[100]; preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); orig_shared = job_ptr->details->shared; job_ptr->details->shared = 0; tmp_bitmap = bit_copy(*avail_bitmap); if (exc_core_bitmap) { bit_fmt(str, (sizeof(str) - 1), exc_core_bitmap); debug2(" _try_sched with exclude core bitmap: %s",str); } rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, preemptee_candidates, NULL, exc_core_bitmap); job_ptr->details->shared = orig_shared; if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) && (orig_shared != 0)) { FREE_NULL_BITMAP(*avail_bitmap); *avail_bitmap= tmp_bitmap; rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, preemptee_candidates, NULL, exc_core_bitmap); } else FREE_NULL_BITMAP(tmp_bitmap); } if (preemptee_candidates) list_destroy(preemptee_candidates); return rc; }
static char * _will_run_test2(uint32_t jobid, time_t start_time, char *node_list, uint32_t *preemptee, int preemptee_cnt, int *err_code, char **err_msg) { struct job_record *job_ptr = NULL, *pre_ptr; struct part_record *part_ptr; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; bitstr_t *exc_core_bitmap = NULL; time_t start_res; uint32_t min_nodes, max_nodes, req_nodes; List preemptee_candidates = NULL, preempted_jobs = NULL; time_t orig_start_time; char *reply_msg = NULL; int i, rc; bool resv_overlap = false; xassert(node_list); debug2("wiki2: will_run2 job_id=%u start_time=%u node_list=%s", jobid, (uint32_t)start_time, node_list); job_ptr = find_job_record(jobid); if (job_ptr == NULL) { *err_code = -700; *err_msg = "No such job"; error("wiki: Failed to find job %u", jobid); return NULL; } if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) { *err_code = -700; *err_msg = "WillRun not applicable to non-pending job"; error("wiki: WillRun on non-pending job %u", jobid); return NULL; } part_ptr = job_ptr->part_ptr; if (part_ptr == NULL) { *err_code = -700; *err_msg = "Job lacks a partition"; error("wiki: Job %u lacks a partition", jobid); return NULL; } if (node_name2bitmap(node_list, false, &avail_bitmap) != 0) { *err_code = -700; *err_msg = "Invalid available nodes value"; error("wiki: Attempt to set invalid available node " "list for job %u, %s", jobid, node_list); return NULL; } /* Enforce reservation: access control, time and nodes */ start_res = start_time; rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap, &exc_core_bitmap, &resv_overlap); if (rc != SLURM_SUCCESS) { *err_code = -730; *err_msg = "Job denied access to reservation"; error("wiki: reservation access denied for job %u", jobid); FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); return NULL; } bit_and(avail_bitmap, resv_bitmap); FREE_NULL_BITMAP(resv_bitmap); /* Only consider nodes that are not DOWN or DRAINED */ bit_and(avail_bitmap, avail_node_bitmap); /* Consider only nodes in this job's partition */ if (part_ptr->node_bitmap) bit_and(avail_bitmap, part_ptr->node_bitmap); else { *err_code = -730; *err_msg = "Job's partition has no nodes"; error("wiki: no nodes in partition %s for job %u", part_ptr->name, jobid); FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); return NULL; } if (job_req_node_filter(job_ptr, avail_bitmap) != SLURM_SUCCESS) { /* Job probably has invalid feature list */ *err_code = -730; *err_msg = "Job's required features not available " "on selected nodes"; error("wiki: job %u not runnable on hosts=%s", jobid, node_list); FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); return NULL; } if (job_ptr->details->exc_node_bitmap) { bit_not(job_ptr->details->exc_node_bitmap); bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap); bit_not(job_ptr->details->exc_node_bitmap); } if ((job_ptr->details->req_node_bitmap) && (!bit_super_set(job_ptr->details->req_node_bitmap, avail_bitmap))) { *err_code = -730; *err_msg = "Job's required nodes not available"; error("wiki: job %u not runnable on hosts=%s", jobid, node_list); FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); return NULL; } min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes); if (job_ptr->details->max_nodes == 0) max_nodes = part_ptr->max_nodes; else max_nodes = MIN(job_ptr->details->max_nodes, part_ptr->max_nodes); max_nodes = MIN(max_nodes, 500000); /* prevent overflows */ if (job_ptr->details->max_nodes) req_nodes = max_nodes; else req_nodes = min_nodes; if (min_nodes > max_nodes) { /* job's min_nodes exceeds partitions max_nodes */ *err_code = -730; *err_msg = "Job's min_nodes > max_nodes"; error("wiki: job %u not runnable on hosts=%s", jobid, node_list); FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); return NULL; } if (preemptee_cnt) { preemptee_candidates = list_create(NULL); for (i=0; i<preemptee_cnt; i++) { if ((pre_ptr = find_job_record(preemptee[i]))) list_append(preemptee_candidates, pre_ptr); } } orig_start_time = job_ptr->start_time; rc = select_g_job_test(job_ptr, avail_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, preemptee_candidates, &preempted_jobs, exc_core_bitmap); FREE_NULL_LIST(preemptee_candidates); if (rc == SLURM_SUCCESS) { char *hostlist, *sep, tmp_str[128]; uint32_t pre_cnt = 0, proc_cnt = 0; #ifdef HAVE_BG select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &proc_cnt); #else proc_cnt = job_ptr->total_cpus; #endif snprintf(tmp_str, sizeof(tmp_str), "STARTINFO=%u TASKS=%u STARTTIME=%u NODES=", job_ptr->job_id, proc_cnt, (uint32_t) job_ptr->start_time); xstrcat(reply_msg, tmp_str); hostlist = bitmap2node_name(avail_bitmap); xstrcat(reply_msg, hostlist); xfree(hostlist); if (preempted_jobs) { while ((pre_ptr = list_pop(preempted_jobs))) { if (pre_cnt++) sep = ","; else sep = " PREEMPT="; snprintf(tmp_str, sizeof(tmp_str), "%s%u", sep, pre_ptr->job_id); xstrcat(reply_msg, tmp_str); } FREE_NULL_LIST(preempted_jobs); } } else { xstrcat(reply_msg, "Jobs not runable on selected nodes"); error("wiki: jobs not runnable on nodes"); } /* Restore pending job's expected start time */ job_ptr->start_time = orig_start_time; FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); return reply_msg; }
static char * _will_run_test(uint32_t jobid, time_t start_time, char *node_list, int *err_code, char **err_msg) { struct job_record *job_ptr = NULL; struct part_record *part_ptr; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; bitstr_t *exc_core_bitmap = NULL; char *hostlist, *reply_msg = NULL; uint32_t min_nodes, max_nodes, req_nodes; int rc; time_t start_res, orig_start_time; List preemptee_candidates; debug2("wiki2: will_run job_id=%u start_time=%u node_list=%s", jobid, (uint32_t)start_time, node_list); job_ptr = find_job_record(jobid); if (job_ptr == NULL) { *err_code = -700; *err_msg = "No such job"; error("wiki: Failed to find job %u", jobid); return NULL; } if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) { *err_code = -700; *err_msg = "WillRun not applicable to non-pending job"; error("wiki: WillRun on non-pending job %u", jobid); return NULL; } part_ptr = job_ptr->part_ptr; if (part_ptr == NULL) { *err_code = -700; *err_msg = "Job lacks a partition"; error("wiki: Job %u lacks a partition", jobid); return NULL; } if ((node_list == NULL) || (node_list[0] == '\0')) { /* assume all nodes available to job for testing */ avail_bitmap = bit_copy(avail_node_bitmap); } else if (node_name2bitmap(node_list, false, &avail_bitmap) != 0) { *err_code = -700; *err_msg = "Invalid available nodes value"; error("wiki: Attempt to set invalid available node " "list for job %u, %s", jobid, node_list); return NULL; } /* Enforce reservation: access control, time and nodes */ start_res = start_time; rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap, &exc_core_bitmap); if (rc != SLURM_SUCCESS) { *err_code = -730; *err_msg = "Job denied access to reservation"; error("wiki: reservation access denied for job %u", jobid); FREE_NULL_BITMAP(avail_bitmap); return NULL; } start_time = MAX(start_time, start_res); bit_and(avail_bitmap, resv_bitmap); FREE_NULL_BITMAP(resv_bitmap); /* Only consider nodes that are not DOWN or DRAINED */ bit_and(avail_bitmap, avail_node_bitmap); /* Consider only nodes in this job's partition */ if (part_ptr->node_bitmap) bit_and(avail_bitmap, part_ptr->node_bitmap); else { *err_code = -730; *err_msg = "Job's partition has no nodes"; error("wiki: no nodes in partition %s for job %u", part_ptr->name, jobid); FREE_NULL_BITMAP(avail_bitmap); return NULL; } if (job_req_node_filter(job_ptr, avail_bitmap) != SLURM_SUCCESS) { /* Job probably has invalid feature list */ *err_code = -730; *err_msg = "Job's required features not available " "on selected nodes"; error("wiki: job %u not runnable on hosts=%s", jobid, node_list); FREE_NULL_BITMAP(avail_bitmap); return NULL; } if (job_ptr->details->exc_node_bitmap) { bit_not(job_ptr->details->exc_node_bitmap); bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap); bit_not(job_ptr->details->exc_node_bitmap); } if ((job_ptr->details->req_node_bitmap) && (!bit_super_set(job_ptr->details->req_node_bitmap, avail_bitmap))) { *err_code = -730; *err_msg = "Job's required nodes not available"; error("wiki: job %u not runnable on hosts=%s", jobid, node_list); FREE_NULL_BITMAP(avail_bitmap); return NULL; } min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes); if (job_ptr->details->max_nodes == 0) max_nodes = part_ptr->max_nodes; else max_nodes = MIN(job_ptr->details->max_nodes, part_ptr->max_nodes); max_nodes = MIN(max_nodes, 500000); /* prevent overflows */ if (job_ptr->details->max_nodes) req_nodes = max_nodes; else req_nodes = min_nodes; if (min_nodes > max_nodes) { /* job's min_nodes exceeds partitions max_nodes */ *err_code = -730; *err_msg = "Job's min_nodes > max_nodes"; error("wiki: job %u not runnable on hosts=%s", jobid, node_list); FREE_NULL_BITMAP(avail_bitmap); return NULL; } preemptee_candidates = slurm_find_preemptable_jobs(job_ptr); orig_start_time = job_ptr->start_time; rc = select_g_job_test(job_ptr, avail_bitmap, min_nodes, max_nodes, req_nodes, SELECT_MODE_WILL_RUN, preemptee_candidates, NULL, exc_core_bitmap); if (preemptee_candidates) list_destroy(preemptee_candidates); if (rc == SLURM_SUCCESS) { char tmp_str[128]; *err_code = 0; uint32_t proc_cnt = 0; xstrcat(reply_msg, "STARTINFO="); #ifdef HAVE_BG select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_NODE_CNT, &proc_cnt); #else proc_cnt = job_ptr->total_cpus; #endif snprintf(tmp_str, sizeof(tmp_str), "%u:%u@%u,", jobid, proc_cnt, (uint32_t) job_ptr->start_time); xstrcat(reply_msg, tmp_str); hostlist = bitmap2node_name(avail_bitmap); xstrcat(reply_msg, hostlist); xfree(hostlist); } else { xstrcat(reply_msg, "Jobs not runable on selected nodes"); error("wiki: jobs not runnable on nodes"); } /* Restore pending job's expected start time */ job_ptr->start_time = orig_start_time; FREE_NULL_BITMAP(avail_bitmap); return reply_msg; }
static int _attempt_backfill(void) { DEF_TIMERS; bool filter_root = false; List job_queue; job_queue_rec_t *job_queue_rec; slurmdb_qos_rec_t *qos_ptr = NULL; int i, j, node_space_recs; struct job_record *job_ptr; struct part_record *part_ptr, **bf_part_ptr = NULL; uint32_t end_time, end_reserve; uint32_t time_limit, comp_time_limit, orig_time_limit, part_time_limit; uint32_t min_nodes, max_nodes, req_nodes; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; bitstr_t *exc_core_bitmap = NULL, *non_cg_bitmap = NULL; time_t now, sched_start, later_start, start_res, resv_end, window_end; node_space_map_t *node_space; struct timeval bf_time1, bf_time2; int rc = 0; int job_test_count = 0; uint32_t *uid = NULL, nuser = 0, bf_parts = 0, *bf_part_jobs = NULL; uint16_t *njobs = NULL; bool already_counted; uint32_t reject_array_job_id = 0; struct part_record *reject_array_part = NULL; uint32_t job_start_cnt = 0, start_time; time_t config_update = slurmctld_conf.last_update; time_t part_update = last_part_update; struct timeval start_tv; bf_last_yields = 0; #ifdef HAVE_ALPS_CRAY /* * Run a Basil Inventory immediately before setting up the schedule * plan, to avoid race conditions caused by ALPS node state change. * Needs to be done with the node-state lock taken. */ START_TIMER; if (select_g_reconfigure()) { debug4("backfill: not scheduling due to ALPS"); return SLURM_SUCCESS; } END_TIMER; if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: ALPS inventory completed, %s", TIME_STR); /* The Basil inventory can take a long time to complete. Process * pending RPCs before starting the backfill scheduling logic */ _yield_locks(1000000); #endif START_TIMER; if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: beginning"); else debug("backfill: beginning"); sched_start = now = time(NULL); gettimeofday(&start_tv, NULL); if (slurm_get_root_filter()) filter_root = true; job_queue = build_job_queue(true, true); if (list_count(job_queue) == 0) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: no jobs to backfill"); else debug("backfill: no jobs to backfill"); list_destroy(job_queue); return 0; } gettimeofday(&bf_time1, NULL); non_cg_bitmap = bit_copy(cg_node_bitmap); bit_not(non_cg_bitmap); slurmctld_diag_stats.bf_queue_len = list_count(job_queue); slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats. bf_queue_len; slurmctld_diag_stats.bf_last_depth = 0; slurmctld_diag_stats.bf_last_depth_try = 0; slurmctld_diag_stats.bf_when_last_cycle = now; slurmctld_diag_stats.bf_active = 1; node_space = xmalloc(sizeof(node_space_map_t) * (max_backfill_job_cnt * 2 + 1)); node_space[0].begin_time = sched_start; window_end = sched_start + backfill_window; node_space[0].end_time = window_end; node_space[0].avail_bitmap = bit_copy(avail_node_bitmap); node_space[0].next = 0; node_space_recs = 1; if (debug_flags & DEBUG_FLAG_BACKFILL_MAP) _dump_node_space_table(node_space); if (max_backfill_job_per_part) { ListIterator part_iterator; struct part_record *part_ptr; bf_parts = list_count(part_list); bf_part_ptr = xmalloc(sizeof(struct part_record *) * bf_parts); bf_part_jobs = xmalloc(sizeof(int) * bf_parts); part_iterator = list_iterator_create(part_list); i = 0; while ((part_ptr = (struct part_record *) list_next(part_iterator))) { bf_part_ptr[i++] = part_ptr; } list_iterator_destroy(part_iterator); } if (max_backfill_job_per_user) { uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t)); njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t)); } sort_job_queue(job_queue); while (1) { job_queue_rec = (job_queue_rec_t *) list_pop(job_queue); if (!job_queue_rec) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: reached end of job queue"); break; } if (slurmctld_config.shutdown_time) break; if (((defer_rpc_cnt > 0) && (slurmctld_config.server_thread_count >= defer_rpc_cnt)) || (_delta_tv(&start_tv) >= sched_timeout)) { if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed yielding locks " "after testing %u(%d) jobs, %s", slurmctld_diag_stats.bf_last_depth, job_test_count, TIME_STR); } if ((_yield_locks(yield_sleep) && !backfill_continue) || (slurmctld_conf.last_update != config_update) || (last_part_update != part_update)) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: system state changed, " "breaking out after testing " "%u(%d) jobs", slurmctld_diag_stats.bf_last_depth, job_test_count); } rc = 1; xfree(job_queue_rec); break; } /* cg_node_bitmap may be changed */ bit_copybits(non_cg_bitmap, cg_node_bitmap); bit_not(non_cg_bitmap); /* Reset backfill scheduling timers, resume testing */ sched_start = time(NULL); gettimeofday(&start_tv, NULL); job_test_count = 0; START_TIMER; } job_ptr = job_queue_rec->job_ptr; /* With bf_continue configured, the original job could have * been cancelled and purged. Validate pointer here. */ if ((job_ptr->magic != JOB_MAGIC) || (job_ptr->job_id != job_queue_rec->job_id)) { xfree(job_queue_rec); continue; } orig_time_limit = job_ptr->time_limit; part_ptr = job_queue_rec->part_ptr; job_test_count++; slurmctld_diag_stats.bf_last_depth++; already_counted = false; xfree(job_queue_rec); if (!IS_JOB_PENDING(job_ptr)) continue; /* started in other partition */ if (!avail_front_end(job_ptr)) continue; /* No available frontend for this job */ if (job_ptr->array_task_id != NO_VAL) { if ((reject_array_job_id == job_ptr->array_job_id) && (reject_array_part == part_ptr)) continue; /* already rejected array element */ /* assume reject whole array for now, clear if OK */ reject_array_job_id = job_ptr->array_job_id; reject_array_part = part_ptr; } job_ptr->part_ptr = part_ptr; if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill test for JobID=%u Prio=%u Partition=%s", job_ptr->job_id, job_ptr->priority, job_ptr->part_ptr->name); } if (max_backfill_job_per_part) { bool skip_job = false; for (j = 0; j < bf_parts; j++) { if (bf_part_ptr[j] != job_ptr->part_ptr) continue; if (bf_part_jobs[j]++ >= max_backfill_job_per_part) skip_job = true; break; } if (skip_job) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: have already " "checked %u jobs for " "partition %s; skipping " "job %u", max_backfill_job_per_part, job_ptr->part_ptr->name, job_ptr->job_id); continue; } } if (max_backfill_job_per_user) { for (j = 0; j < nuser; j++) { if (job_ptr->user_id == uid[j]) { njobs[j]++; if (debug_flags & DEBUG_FLAG_BACKFILL) debug("backfill: user %u: " "#jobs %u", uid[j], njobs[j]); break; } } if (j == nuser) { /* user not found */ static bool bf_max_user_msg = true; if (nuser < BF_MAX_USERS) { uid[j] = job_ptr->user_id; njobs[j] = 1; nuser++; } else if (bf_max_user_msg) { bf_max_user_msg = false; error("backfill: too many users in " "queue. Consider increasing " "BF_MAX_USERS"); } if (debug_flags & DEBUG_FLAG_BACKFILL) debug2("backfill: found new user %u. " "Total #users now %u", job_ptr->user_id, nuser); } else { if (njobs[j] >= max_backfill_job_per_user) { /* skip job */ if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: have already " "checked %u jobs for " "user %u; skipping " "job %u", max_backfill_job_per_user, job_ptr->user_id, job_ptr->job_id); continue; } } } if (((part_ptr->state_up & PARTITION_SCHED) == 0) || (part_ptr->node_bitmap == NULL) || ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: partition %s not usable", job_ptr->part_ptr->name); continue; } if ((!job_independent(job_ptr, 0)) || (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: job %u not runable now", job_ptr->job_id); continue; } /* Determine minimum and maximum node counts */ min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes); if (job_ptr->details->max_nodes == 0) max_nodes = part_ptr->max_nodes; else max_nodes = MIN(job_ptr->details->max_nodes, part_ptr->max_nodes); max_nodes = MIN(max_nodes, 500000); /* prevent overflows */ if (job_ptr->details->max_nodes) req_nodes = max_nodes; else req_nodes = min_nodes; if (min_nodes > max_nodes) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: job %u node count too high", job_ptr->job_id); continue; } /* Determine job's expected completion time */ if (part_ptr->max_time == INFINITE) part_time_limit = 365 * 24 * 60; /* one year */ else part_time_limit = part_ptr->max_time; if (job_ptr->time_limit == NO_VAL) { time_limit = part_time_limit; } else { if (part_ptr->max_time == INFINITE) time_limit = job_ptr->time_limit; else time_limit = MIN(job_ptr->time_limit, part_time_limit); } comp_time_limit = time_limit; qos_ptr = job_ptr->qos_ptr; if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) && slurm_get_preempt_mode()) time_limit = job_ptr->time_limit = 1; else if (job_ptr->time_min && (job_ptr->time_min < time_limit)) time_limit = job_ptr->time_limit = job_ptr->time_min; /* Determine impact of any resource reservations */ later_start = now; TRY_LATER: if (slurmctld_config.shutdown_time) break; if (((defer_rpc_cnt > 0) && (slurmctld_config.server_thread_count >= defer_rpc_cnt)) || (_delta_tv(&start_tv) >= sched_timeout)) { uint32_t save_job_id = job_ptr->job_id; uint32_t save_time_limit = job_ptr->time_limit; job_ptr->time_limit = orig_time_limit; if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed yielding locks " "after testing %u(%d) jobs, %s", slurmctld_diag_stats.bf_last_depth, job_test_count, TIME_STR); } if ((_yield_locks(yield_sleep) && !backfill_continue) || (slurmctld_conf.last_update != config_update) || (last_part_update != part_update)) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: system state changed, " "breaking out after testing " "%u(%d) jobs", slurmctld_diag_stats.bf_last_depth, job_test_count); } rc = 1; break; } /* cg_node_bitmap may be changed */ bit_copybits(non_cg_bitmap, cg_node_bitmap); bit_not(non_cg_bitmap); /* With bf_continue configured, the original job could * have been scheduled or cancelled and purged. * Revalidate job the record here. */ if ((job_ptr->magic != JOB_MAGIC) || (job_ptr->job_id != save_job_id)) continue; if (!IS_JOB_PENDING(job_ptr)) continue; if (!avail_front_end(job_ptr)) continue; /* No available frontend */ job_ptr->time_limit = save_time_limit; /* Reset backfill scheduling timers, resume testing */ sched_start = time(NULL); gettimeofday(&start_tv, NULL); job_test_count = 1; START_TIMER; } FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); start_res = later_start; later_start = 0; j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap, &exc_core_bitmap); if (j != SLURM_SUCCESS) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: job %u reservation defer", job_ptr->job_id); job_ptr->time_limit = orig_time_limit; continue; } if (start_res > now) end_time = (time_limit * 60) + start_res; else end_time = (time_limit * 60) + now; resv_end = find_resv_end(start_res); /* Identify usable nodes for this job */ bit_and(avail_bitmap, part_ptr->node_bitmap); bit_and(avail_bitmap, up_node_bitmap); bit_and(avail_bitmap, non_cg_bitmap); for (j=0; ; ) { if ((node_space[j].end_time > start_res) && node_space[j].next && (later_start == 0)) later_start = node_space[j].end_time; if (node_space[j].end_time <= start_res) ; else if (node_space[j].begin_time <= end_time) { bit_and(avail_bitmap, node_space[j].avail_bitmap); } else break; if ((j = node_space[j].next) == 0) break; } if (resv_end && (++resv_end < window_end) && ((later_start == 0) || (resv_end < later_start))) { later_start = resv_end; } if (job_ptr->details->exc_node_bitmap) { bit_not(job_ptr->details->exc_node_bitmap); bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap); bit_not(job_ptr->details->exc_node_bitmap); } /* Test if insufficient nodes remain OR * required nodes missing OR * nodes lack features OR * no change since previously tested nodes (only changes * in other partition nodes) */ if ((bit_set_count(avail_bitmap) < min_nodes) || ((job_ptr->details->req_node_bitmap) && (!bit_super_set(job_ptr->details->req_node_bitmap, avail_bitmap))) || (job_req_node_filter(job_ptr, avail_bitmap))) { if (later_start) { job_ptr->start_time = 0; goto TRY_LATER; } /* Job can not start until too far in the future */ job_ptr->time_limit = orig_time_limit; job_ptr->start_time = sched_start + backfill_window; continue; } /* Identify nodes which are definitely off limits */ FREE_NULL_BITMAP(resv_bitmap); resv_bitmap = bit_copy(avail_bitmap); bit_not(resv_bitmap); /* this is the time consuming operation */ debug2("backfill: entering _try_sched for job %u.", job_ptr->job_id); if (!already_counted) { slurmctld_diag_stats.bf_last_depth_try++; already_counted = true; } if (debug_flags & DEBUG_FLAG_BACKFILL_MAP) _dump_job_test(job_ptr, avail_bitmap, start_res); j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes, req_nodes, exc_core_bitmap); now = time(NULL); if (j != SLURM_SUCCESS) { job_ptr->time_limit = orig_time_limit; job_ptr->start_time = 0; continue; /* not runable */ } if (start_res > job_ptr->start_time) { job_ptr->start_time = start_res; last_job_update = now; } if (job_ptr->start_time <= now) { /* Can start now */ uint32_t save_time_limit = job_ptr->time_limit; uint32_t hard_limit; bool reset_time = false; int rc = _start_job(job_ptr, resv_bitmap); if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) { if (orig_time_limit == NO_VAL) { acct_policy_alter_job( job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; } else { acct_policy_alter_job( job_ptr, orig_time_limit); job_ptr->time_limit = orig_time_limit; } } else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) { /* Set time limit as high as possible */ acct_policy_alter_job(job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; reset_time = true; } else if (orig_time_limit == NO_VAL) { acct_policy_alter_job(job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; } else { acct_policy_alter_job(job_ptr, orig_time_limit); job_ptr->time_limit = orig_time_limit; } if (job_ptr->time_limit == INFINITE) hard_limit = 365 * 24 * 60; /* one year */ else hard_limit = job_ptr->time_limit; job_ptr->end_time = job_ptr->start_time + (hard_limit * 60); if (reset_time) { _reset_job_time_limit(job_ptr, now, node_space); time_limit = job_ptr->time_limit; } if (rc == ESLURM_ACCOUNTING_POLICY) { /* Unknown future start time, just skip job */ job_ptr->start_time = 0; continue; } else if (rc != SLURM_SUCCESS) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: planned start of job %u" " failed: %s", job_ptr->job_id, slurm_strerror(rc)); } /* Drop through and reserve these resources. * Likely due to state changes during sleep. * Make best-effort based upon original state */ job_ptr->time_limit = orig_time_limit; later_start = 0; } else { /* Started this job, move to next one */ reject_array_job_id = 0; reject_array_part = NULL; /* Update the database if job time limit * changed and move to next job */ if (save_time_limit != job_ptr->time_limit) jobacct_storage_g_job_start(acct_db_conn, job_ptr); job_start_cnt++; if (max_backfill_jobs_start && (job_start_cnt >= max_backfill_jobs_start)){ if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: bf_max_job_start" " limit of %d reached", max_backfill_jobs_start); } break; } continue; } } else { job_ptr->time_limit = orig_time_limit; } start_time = job_ptr->start_time; end_reserve = job_ptr->start_time + (time_limit * 60); start_time = (start_time / backfill_resolution) * backfill_resolution; end_reserve = (end_reserve / backfill_resolution) * backfill_resolution; if (later_start && (start_time > later_start)) { /* Try later when some nodes currently reserved for * pending jobs are free */ job_ptr->start_time = 0; goto TRY_LATER; } if (job_ptr->start_time > (sched_start + backfill_window)) { /* Starts too far in the future to worry about */ if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_job_sched(job_ptr, end_reserve, avail_bitmap); continue; } if (node_space_recs >= max_backfill_job_cnt) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: table size limit of %u reached", max_backfill_job_cnt); } break; } if ((job_ptr->start_time > now) && _test_resv_overlap(node_space, avail_bitmap, start_time, end_reserve)) { /* This job overlaps with an existing reservation for * job to be backfill scheduled, which the sched * plugin does not know about. Try again later. */ later_start = job_ptr->start_time; job_ptr->start_time = 0; goto TRY_LATER; } /* * Add reservation to scheduling table if appropriate */ if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_job_sched(job_ptr, end_reserve, avail_bitmap); if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) continue; reject_array_job_id = 0; reject_array_part = NULL; if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_job_sched(job_ptr, end_reserve, avail_bitmap); xfree(job_ptr->sched_nodes); job_ptr->sched_nodes = bitmap2node_name(avail_bitmap); bit_not(avail_bitmap); _add_reservation(start_time, end_reserve, avail_bitmap, node_space, &node_space_recs); if (debug_flags & DEBUG_FLAG_BACKFILL_MAP) _dump_node_space_table(node_space); } xfree(bf_part_jobs); xfree(bf_part_ptr); xfree(uid); xfree(njobs); FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); FREE_NULL_BITMAP(resv_bitmap); FREE_NULL_BITMAP(non_cg_bitmap); for (i=0; ; ) { FREE_NULL_BITMAP(node_space[i].avail_bitmap); if ((i = node_space[i].next) == 0) break; } xfree(node_space); list_destroy(job_queue); gettimeofday(&bf_time2, NULL); _do_diag_stats(&bf_time1, &bf_time2, yield_sleep); if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed testing %u(%d) jobs, %s", slurmctld_diag_stats.bf_last_depth, job_test_count, TIME_STR); } return rc; }
static int _attempt_backfill(void) { bool filter_root = false; List job_queue; job_queue_rec_t *job_queue_rec; slurmdb_qos_rec_t *qos_ptr = NULL; int i, j, node_space_recs; struct job_record *job_ptr; struct part_record *part_ptr; uint32_t end_time, end_reserve; uint32_t time_limit, comp_time_limit, orig_time_limit; uint32_t min_nodes, max_nodes, req_nodes; bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL; time_t now = time(NULL), sched_start, later_start, start_res; node_space_map_t *node_space; static int sched_timeout = 0; int this_sched_timeout = 0, rc = 0; sched_start = now; if (sched_timeout == 0) { sched_timeout = slurm_get_msg_timeout() / 2; sched_timeout = MAX(sched_timeout, 1); sched_timeout = MIN(sched_timeout, 10); } this_sched_timeout = sched_timeout; #ifdef HAVE_CRAY /* * Run a Basil Inventory immediately before setting up the schedule * plan, to avoid race conditions caused by ALPS node state change. * Needs to be done with the node-state lock taken. */ if (select_g_reconfigure()) { debug4("backfill: not scheduling due to ALPS"); return SLURM_SUCCESS; } #endif if (slurm_get_root_filter()) filter_root = true; job_queue = build_job_queue(true); if (list_count(job_queue) <= 1) { debug("backfill: no jobs to backfill"); list_destroy(job_queue); return 0; } node_space = xmalloc(sizeof(node_space_map_t) * (max_backfill_job_cnt + 3)); node_space[0].begin_time = sched_start; node_space[0].end_time = sched_start + backfill_window; node_space[0].avail_bitmap = bit_copy(avail_node_bitmap); node_space[0].next = 0; node_space_recs = 1; if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_node_space_table(node_space); while ((job_queue_rec = (job_queue_rec_t *) list_pop_bottom(job_queue, sort_job_queue2))) { job_ptr = job_queue_rec->job_ptr; part_ptr = job_queue_rec->part_ptr; xfree(job_queue_rec); if (!IS_JOB_PENDING(job_ptr)) continue; /* started in other partition */ job_ptr->part_ptr = part_ptr; if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill test for job %u", job_ptr->job_id); if ((job_ptr->state_reason == WAIT_ASSOC_JOB_LIMIT) || (job_ptr->state_reason == WAIT_ASSOC_RESOURCE_LIMIT) || (job_ptr->state_reason == WAIT_ASSOC_TIME_LIMIT) || (job_ptr->state_reason == WAIT_QOS_JOB_LIMIT) || (job_ptr->state_reason == WAIT_QOS_RESOURCE_LIMIT) || (job_ptr->state_reason == WAIT_QOS_TIME_LIMIT) || !acct_policy_job_runnable(job_ptr)) { debug2("backfill: job %u is not allowed to run now. " "Skipping it. State=%s. Reason=%s. Priority=%u", job_ptr->job_id, job_state_string(job_ptr->job_state), job_reason_string(job_ptr->state_reason), job_ptr->priority); continue; } if (((part_ptr->state_up & PARTITION_SCHED) == 0) || (part_ptr->node_bitmap == NULL)) continue; if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root) continue; if ((!job_independent(job_ptr, 0)) || (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) continue; /* Determine minimum and maximum node counts */ min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes); if (job_ptr->details->max_nodes == 0) max_nodes = part_ptr->max_nodes; else max_nodes = MIN(job_ptr->details->max_nodes, part_ptr->max_nodes); max_nodes = MIN(max_nodes, 500000); /* prevent overflows */ if (job_ptr->details->max_nodes) req_nodes = max_nodes; else req_nodes = min_nodes; if (min_nodes > max_nodes) { /* job's min_nodes exceeds partition's max_nodes */ continue; } /* Determine job's expected completion time */ if (job_ptr->time_limit == NO_VAL) { if (part_ptr->max_time == INFINITE) time_limit = 365 * 24 * 60; /* one year */ else time_limit = part_ptr->max_time; } else { if (part_ptr->max_time == INFINITE) time_limit = job_ptr->time_limit; else time_limit = MIN(job_ptr->time_limit, part_ptr->max_time); } comp_time_limit = time_limit; orig_time_limit = job_ptr->time_limit; if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) time_limit = job_ptr->time_limit = 1; else if (job_ptr->time_min && (job_ptr->time_min < time_limit)) time_limit = job_ptr->time_limit = job_ptr->time_min; /* Determine impact of any resource reservations */ later_start = now; TRY_LATER: FREE_NULL_BITMAP(avail_bitmap); start_res = later_start; later_start = 0; j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap); if (j != SLURM_SUCCESS) { job_ptr->time_limit = orig_time_limit; continue; } if (start_res > now) end_time = (time_limit * 60) + start_res; else end_time = (time_limit * 60) + now; /* Identify usable nodes for this job */ bit_and(avail_bitmap, part_ptr->node_bitmap); bit_and(avail_bitmap, up_node_bitmap); for (j=0; ; ) { if ((node_space[j].end_time > start_res) && node_space[j].next && (later_start == 0)) later_start = node_space[j].end_time; if (node_space[j].end_time <= start_res) ; else if (node_space[j].begin_time <= end_time) { bit_and(avail_bitmap, node_space[j].avail_bitmap); } else break; if ((j = node_space[j].next) == 0) break; } if (job_ptr->details->exc_node_bitmap) { bit_not(job_ptr->details->exc_node_bitmap); bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap); bit_not(job_ptr->details->exc_node_bitmap); } /* Test if insufficient nodes remain OR * required nodes missing OR * nodes lack features */ if ((bit_set_count(avail_bitmap) < min_nodes) || ((job_ptr->details->req_node_bitmap) && (!bit_super_set(job_ptr->details->req_node_bitmap, avail_bitmap))) || (job_req_node_filter(job_ptr, avail_bitmap))) { if (later_start) { job_ptr->start_time = 0; goto TRY_LATER; } job_ptr->time_limit = orig_time_limit; continue; } /* Identify nodes which are definitely off limits */ FREE_NULL_BITMAP(resv_bitmap); resv_bitmap = bit_copy(avail_bitmap); bit_not(resv_bitmap); if ((time(NULL) - sched_start) >= this_sched_timeout) { debug("backfill: loop taking too long, yielding locks"); if (_yield_locks()) { debug("backfill: system state changed, " "breaking out"); rc = 1; break; } else { this_sched_timeout += sched_timeout; } } /* this is the time consuming operation */ debug2("backfill: entering _try_sched for job %u.", job_ptr->job_id); j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes, req_nodes); debug2("backfill: finished _try_sched for job %u.", job_ptr->job_id); now = time(NULL); if (j != SLURM_SUCCESS) { job_ptr->time_limit = orig_time_limit; job_ptr->start_time = 0; continue; /* not runable */ } if (start_res > job_ptr->start_time) { job_ptr->start_time = start_res; last_job_update = now; } if (job_ptr->start_time <= now) { int rc = _start_job(job_ptr, resv_bitmap); if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) job_ptr->time_limit = orig_time_limit; else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) { /* Set time limit as high as possible */ job_ptr->time_limit = comp_time_limit; job_ptr->end_time = job_ptr->start_time + (comp_time_limit * 60); _reset_job_time_limit(job_ptr, now, node_space); time_limit = job_ptr->time_limit; } else { job_ptr->time_limit = orig_time_limit; } if (rc == ESLURM_ACCOUNTING_POLICY) { /* Unknown future start time, just skip job */ job_ptr->start_time = 0; continue; } else if (rc != SLURM_SUCCESS) { /* Planned to start job, but something bad * happended. */ job_ptr->start_time = 0; break; } else { /* Started this job, move to next one */ continue; } } else job_ptr->time_limit = orig_time_limit; if (later_start && (job_ptr->start_time > later_start)) { /* Try later when some nodes currently reserved for * pending jobs are free */ job_ptr->start_time = 0; goto TRY_LATER; } if (job_ptr->start_time > (sched_start + backfill_window)) { /* Starts too far in the future to worry about */ continue; } if (node_space_recs >= max_backfill_job_cnt) { /* Already have too many jobs to deal with */ break; } end_reserve = job_ptr->start_time + (time_limit * 60); if (_test_resv_overlap(node_space, avail_bitmap, job_ptr->start_time, end_reserve)) { /* This job overlaps with an existing reservation for * job to be backfill scheduled, which the sched * plugin does not know about. Try again later. */ later_start = job_ptr->start_time; job_ptr->start_time = 0; goto TRY_LATER; } /* * Add reservation to scheduling table if appropriate */ qos_ptr = job_ptr->qos_ptr; if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) continue; bit_not(avail_bitmap); _add_reservation(job_ptr->start_time, end_reserve, avail_bitmap, node_space, &node_space_recs); if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_node_space_table(node_space); } FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(resv_bitmap); for (i=0; ; ) { FREE_NULL_BITMAP(node_space[i].avail_bitmap); if ((i = node_space[i].next) == 0) break; } xfree(node_space); list_destroy(job_queue); return rc; }