extern uint16_t job_preempt_mode(struct job_record *job_ptr) { if (job_ptr->qos_ptr && job_ptr->qos_ptr->preempt_mode) return job_ptr->qos_ptr->preempt_mode; return (slurm_get_preempt_mode() & (~PREEMPT_MODE_GANG)); }
extern uint16_t job_preempt_mode(struct job_record *job_ptr) { if (job_ptr->part_ptr && (job_ptr->part_ptr->preempt_mode != (uint16_t) NO_VAL)) return job_ptr->part_ptr->preempt_mode; return (slurm_get_preempt_mode() & (~PREEMPT_MODE_GANG)); }
/* *********************************************************************** */ void slurm_sched_partition_change( void ) { if ( slurm_sched_init() < 0 ) return; if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_reconfig() != SLURM_SUCCESS)) error( "cannot reconfigure gang scheduler" ); (*(g_sched_context->ops.partition_change))(); }
/* *********************************************************************** */ extern int slurm_sched_reconfig( void ) { if ( slurm_sched_init() < 0 ) return SLURM_ERROR; if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_reconfig() != SLURM_SUCCESS)) error( "cannot reconfigure gang scheduler" ); return (*(g_sched_context->ops.reconfig))(); }
/*
 * Return the preemption mode applicable to a job based upon its
 * partition.  PREEMPT_MODE_GANG is always filtered out of the result
 * since it is not meaningful as a per-job preemption mechanism here.
 */
extern uint16_t job_preempt_mode(struct job_record *job_ptr)
{
	struct part_record *part_ptr = job_ptr->part_ptr;

	if (part_ptr && (part_ptr->preempt_mode != (uint16_t) NO_VAL)) {
		if (part_ptr->preempt_mode & PREEMPT_MODE_GANG) {
			/* Slurm's log functions append their own newline;
			 * the former trailing "\n" emitted a blank line. */
			verbose("Partition '%s' preempt mode 'gang' has no "
				"sense. Filtered out.",
				part_ptr->name);
		}
		return (part_ptr->preempt_mode & (~PREEMPT_MODE_GANG));
	}

	/* No partition override: use the cluster-wide default */
	return (slurm_get_preempt_mode() & (~PREEMPT_MODE_GANG));
}
/* *********************************************************************** */ int slurm_sched_g_newalloc( struct job_record *job_ptr ) { if ( slurm_sched_init() < 0 ) return SLURM_ERROR; if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_job_start( job_ptr ) != SLURM_SUCCESS)) { error( "gang scheduler problem starting job %u", job_ptr->job_id); } return (*(ops.newalloc))( job_ptr ); }
/* *********************************************************************** */ int slurm_sched_freealloc( struct job_record *job_ptr ) { if ( slurm_sched_init() < 0 ) return SLURM_ERROR; if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_job_fini( job_ptr ) != SLURM_SUCCESS)) { error( "gang scheduler problem finishing job %u", job_ptr->job_id); } return (*(g_sched_context->ops.freealloc))( job_ptr ); }
/* *********************************************************************** */
/* Invoke the scheduler plugin's schedule() operation.  A gang-scheduler
 * job rescan that would precede it is currently compiled out (#if 0),
 * since it requires job write and node read locks to be held. */
int slurm_sched_schedule( void )
{
	if ( slurm_sched_init() < 0 )
		return SLURM_ERROR;
#if 0
	/* Must have job write lock and node read lock set here */
	if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) &&
	     (gs_job_scan() != SLURM_SUCCESS))
		error( "gang scheduler could not rescan jobs" );
#endif
	return (*(g_sched_context->ops.schedule))();
}
/*
 * Print a partition's preemption mode field for sinfo output.
 * With sinfo_data == NULL the column header is printed instead.
 * A partition value of NO_VAL falls back to the cluster default.
 */
int _print_preempt_mode(sinfo_data_t * sinfo_data, int width,
			bool right_justify, char *suffix)
{
	if (sinfo_data == NULL) {
		/* header row */
		_print_str("PREEMPT_MODE", width, right_justify, true);
	} else {
		uint16_t mode = sinfo_data->part_info->preempt_mode;

		if (mode == (uint16_t) NO_VAL)
			mode = slurm_get_preempt_mode();
		_print_str(preempt_mode_string(mode), width,
			   right_justify, true);
	}

	if (suffix)
		printf("%s", suffix);
	return SLURM_SUCCESS;
}
/* *********************************************************************** */ extern int slurm_sched_fini( void ) { int rc; if (!g_sched_context) return SLURM_SUCCESS; rc = slurm_sched_context_destroy(g_sched_context); g_sched_context = NULL; if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_fini() != SLURM_SUCCESS)) error( "cannot stop gang scheduler" ); return rc; }
extern uint16_t job_preempt_mode(struct job_record *job_ptr) { uint16_t mode; if (job_ptr->qos_ptr && ((slurmdb_qos_rec_t *)job_ptr->qos_ptr)->preempt_mode) { mode = ((slurmdb_qos_rec_t *)job_ptr->qos_ptr)->preempt_mode; if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) { info("%s: in job_preempt_mode return = %s", plugin_type, preempt_mode_string(mode)); } return mode; } mode = slurm_get_preempt_mode() & (~PREEMPT_MODE_GANG); if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) { info("%s: in job_preempt_mode return = %s", plugin_type, preempt_mode_string(mode)); } return mode; }
/* *********************************************************************** */
/* Initialize the scheduler plugin context (idempotent; guarded by
 * g_sched_context_lock).  Creates the plugin context from the configured
 * SchedulerType, resolves its operations table, and starts the gang
 * scheduler when gang preemption is configured.
 * RET SLURM_SUCCESS or SLURM_ERROR */
extern int slurm_sched_init( void )
{
	int retval = SLURM_SUCCESS;
	char *sched_type = NULL;

	slurm_mutex_lock( &g_sched_context_lock );

	/* Already initialized by an earlier caller */
	if ( g_sched_context )
		goto done;

	sched_type = slurm_get_sched_type();
	g_sched_context = slurm_sched_context_create( sched_type );
	if ( g_sched_context == NULL ) {
		error( "cannot create scheduler context for %s", sched_type );
		retval = SLURM_ERROR;
		goto done;
	}

	if ( slurm_sched_get_ops( g_sched_context ) == NULL ) {
		/* Plugin loaded but its symbol table is incomplete;
		 * roll back the partially created context */
		error( "cannot resolve scheduler plugin operations" );
		slurm_sched_context_destroy( g_sched_context );
		g_sched_context = NULL;
		retval = SLURM_ERROR;
		goto done;
	}

	/* Gang scheduler start failure is logged but not fatal */
	if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) &&
	     (gs_init() != SLURM_SUCCESS))
		error( "cannot start gang scheduler ");

done:
	slurm_mutex_unlock( &g_sched_context_lock );
	xfree(sched_type);
	return retval;
}
/* _attempt_backfill - scan the pending job queue in priority order and try
 *	to start each job, or record a future resource reservation for it,
 *	without delaying higher priority work (conservative backfill).
 * RET 0 on normal completion, 1 if the cycle was aborted early because
 *	system state changed while locks were yielded.
 * NOTE(review): appears to assume the slurmctld job/node/partition locks
 *	are held by the caller -- confirm against the backfill agent thread. */
static int _attempt_backfill(void)
{
	DEF_TIMERS;
	bool filter_root = false;
	List job_queue;
	job_queue_rec_t *job_queue_rec;
	slurmdb_qos_rec_t *qos_ptr = NULL;
	int i, j, node_space_recs;
	struct job_record *job_ptr;
	struct part_record *part_ptr;
	uint32_t end_time, end_reserve;
	uint32_t time_limit, comp_time_limit, orig_time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	time_t now, sched_start, later_start, start_res, resv_end;
	node_space_map_t *node_space;
	struct timeval bf_time1, bf_time2;
	int sched_timeout = 2, yield_sleep = 1;
	int rc = 0;
	int job_test_count = 0;
	uint32_t *uid = NULL, nuser = 0;
	uint16_t *njobs = NULL;
	bool already_counted;
	uint32_t reject_array_job_id = 0;

#ifdef HAVE_CRAY
	/*
	 * Run a Basil Inventory immediately before setting up the schedule
	 * plan, to avoid race conditions caused by ALPS node state change.
	 * Needs to be done with the node-state lock taken.
	 */
	START_TIMER;
	if (select_g_reconfigure()) {
		debug4("backfill: not scheduling due to ALPS");
		return SLURM_SUCCESS;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: ALPS inventory completed, %s", TIME_STR);

	/* The Basil inventory can take a long time to complete. Process
	 * pending RPCs before starting the backfill scheduling logic */
	_yield_locks(1);
#endif
	START_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: beginning");
	sched_start = now = time(NULL);
	if (slurm_get_root_filter())
		filter_root = true;

	job_queue = build_job_queue(true);
	if (list_count(job_queue) == 0) {
		debug("backfill: no jobs to backfill");
		list_destroy(job_queue);
		return 0;
	}

	gettimeofday(&bf_time1, NULL);

	/* Record queue depth statistics (reported by sdiag) */
	slurmctld_diag_stats.bf_queue_len = list_count(job_queue);
	slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats.
						 bf_queue_len;
	slurmctld_diag_stats.bf_last_depth = 0;
	slurmctld_diag_stats.bf_last_depth_try = 0;
	slurmctld_diag_stats.bf_when_last_cycle = now;
	bf_last_yields = 0;
	slurmctld_diag_stats.bf_active = 1;

	/* node_space is an index-linked list of time windows, each holding
	 * a bitmap of nodes expected to be available in that window */
	node_space = xmalloc(sizeof(node_space_map_t) *
			     (max_backfill_job_cnt + 3));
	node_space[0].begin_time = sched_start;
	node_space[0].end_time = sched_start + backfill_window;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		_dump_node_space_table(node_space);

	if (max_backfill_job_per_user) {
		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
	}
	while ((job_queue_rec = (job_queue_rec_t *)
				list_pop_bottom(job_queue, sort_job_queue2))) {
		job_ptr = job_queue_rec->job_ptr;
		orig_time_limit = job_ptr->time_limit;

		/* Periodically yield the locks so other RPCs can progress */
		if ((time(NULL) - sched_start) >= sched_timeout) {
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if (_yield_locks(yield_sleep) && !backfill_continue) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			job_test_count = 0;
			START_TIMER;
		}
		part_ptr = job_queue_rec->part_ptr;
		job_test_count++;

		xfree(job_queue_rec);
		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* started in other partition */
		if (!avail_front_end(job_ptr))
			continue;	/* No available frontend for this job */
		if (job_ptr->array_task_id != (uint16_t) NO_VAL) {
			if (reject_array_job_id == job_ptr->array_job_id)
				continue; /* already rejected array element */
			/* assume reject whole array for now, clear if OK */
			reject_array_job_id = job_ptr->array_job_id;
		}
		job_ptr->part_ptr = part_ptr;

		if (debug_flags & DEBUG_FLAG_BACKFILL)
			info("backfill test for job %u", job_ptr->job_id);

		slurmctld_diag_stats.bf_last_depth++;
		already_counted = false;

		/* Enforce the per-user limit on jobs tested per cycle */
		if (max_backfill_job_per_user) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->user_id == uid[j]) {
					njobs[j]++;
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: user %u: "
						      "#jobs %u",
						      uid[j], njobs[j]);
					break;
				}
			}
			if (j == nuser) { /* user not found */
				if (nuser < BF_MAX_USERS) {
					uid[j] = job_ptr->user_id;
					njobs[j] = 1;
					nuser++;
				} else {
					error("backfill: too many users in "
					      "queue. Consider increasing "
					      "BF_MAX_USERS");
				}
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					debug2("backfill: found new user %u. "
					       "Total #users now %u",
					       job_ptr->user_id, nuser);
			} else {
				if (njobs[j] > max_backfill_job_per_user) {
					/* skip job */
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: have already "
						      "checked %u jobs for "
						      "user %u; skipping "
						      "job %u",
						      max_backfill_job_per_user,
						      job_ptr->user_id,
						      job_ptr->job_id);
					continue;
				}
			}
		}

		/* Partition must be schedulable and have nodes */
		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
		    (part_ptr->node_bitmap == NULL))
			continue;
		if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)
			continue;

		if ((!job_independent(job_ptr, 0)) ||
		    (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS))
			continue;

		/* Determine minimum and maximum node counts */
		min_nodes = MAX(job_ptr->details->min_nodes,
				part_ptr->min_nodes);
		if (job_ptr->details->max_nodes == 0)
			max_nodes = part_ptr->max_nodes;
		else
			max_nodes = MIN(job_ptr->details->max_nodes,
					part_ptr->max_nodes);
		max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
		if (job_ptr->details->max_nodes)
			req_nodes = max_nodes;
		else
			req_nodes = min_nodes;
		if (min_nodes > max_nodes) {
			/* job's min_nodes exceeds partition's max_nodes */
			continue;
		}

		/* Determine job's expected completion time */
		if (job_ptr->time_limit == NO_VAL) {
			if (part_ptr->max_time == INFINITE)
				time_limit = 365 * 24 * 60; /* one year */
			else
				time_limit = part_ptr->max_time;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_ptr->max_time);
		}
		comp_time_limit = time_limit;
		qos_ptr = job_ptr->qos_ptr;
		/* NO_RESERVE QOS jobs are planned with a nominal one-minute
		 * limit so they never hold a reservation */
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) &&
		    slurm_get_preempt_mode())
			time_limit = job_ptr->time_limit = 1;
		else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
			time_limit = job_ptr->time_limit = job_ptr->time_min;

		/* Determine impact of any resource reservations */
		later_start = now;
 TRY_LATER:
		if ((time(NULL) - sched_start) >= sched_timeout) {
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks 2"
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if (_yield_locks(yield_sleep) && !backfill_continue) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			job_test_count = 1;
			START_TIMER;
		}

		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		start_res = later_start;
		later_start = 0;
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				  &exc_core_bitmap);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			continue;
		}
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;
		resv_end = find_resv_end(start_res);

		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		/* Walk node_space windows overlapping this job's interval;
		 * remember the first later window as a retry point */
		for (j=0; ; ) {
			if ((node_space[j].end_time > start_res) &&
			     node_space[j].next && (later_start == 0))
				later_start = node_space[j].end_time;
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
		/* NOTE(review): the post-increment below both tests the old
		 * value of resv_end and bumps the reservation end by one
		 * second before using it as a retry time.  Confirm whether a
		 * plain test of resv_end was intended. */
		if ((resv_end++) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}

		if (job_ptr->details->exc_node_bitmap) {
			bit_not(job_ptr->details->exc_node_bitmap);
			bit_and(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
			bit_not(job_ptr->details->exc_node_bitmap);
		}

		/* Test if insufficient nodes remain OR
		 *	required nodes missing OR
		 *	nodes lack features */
		if ((bit_set_count(avail_bitmap) < min_nodes) ||
		    ((job_ptr->details->req_node_bitmap) &&
		     (!bit_super_set(job_ptr->details->req_node_bitmap,
				     avail_bitmap))) ||
		    (job_req_node_filter(job_ptr, avail_bitmap))) {
			if (later_start) {
				job_ptr->start_time = 0;
				goto TRY_LATER;
			}
			/* Job can not start until too far in the future */
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = sched_start + backfill_window;
			continue;
		}

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		/* this is the time consuming operation */
		debug2("backfill: entering _try_sched for job %u.",
		       job_ptr->job_id);

		if (!already_counted) {
			slurmctld_diag_stats.bf_last_depth_try++;
			already_counted = true;
		}

		j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);

		now = time(NULL);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = 0;
			continue;	/* not runable */
		}

		if (start_res > job_ptr->start_time) {
			job_ptr->start_time = start_res;
			last_job_update = now;
		}
		if (job_ptr->start_time <= now) {
			/* Job can start now: attempt it */
			uint32_t save_time_limit = job_ptr->time_limit;
			int rc = _start_job(job_ptr, resv_bitmap);
			if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) {
				if (orig_time_limit == NO_VAL)
					job_ptr->time_limit = comp_time_limit;
				else
					job_ptr->time_limit = orig_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (job_ptr->time_limit * 60);
			} else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) {
				/* Set time limit as high as possible */
				job_ptr->time_limit = comp_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (comp_time_limit * 60);
				_reset_job_time_limit(job_ptr, now,
						      node_space);
				time_limit = job_ptr->time_limit;
			} else {
				job_ptr->time_limit = orig_time_limit;
			}
			if (rc == ESLURM_ACCOUNTING_POLICY) {
				/* Unknown future start time, just skip job */
				job_ptr->start_time = 0;
				continue;
			} else if (rc != SLURM_SUCCESS) {
				/* Planned to start job, but something bad
				 * happended. */
				job_ptr->start_time = 0;
				break;
			} else {
				/* Started this job, move to next one */
				reject_array_job_id = 0;

				/* Update the database if job time limit
				 * changed and move to next job */
				if (save_time_limit != job_ptr->time_limit)
					jobacct_storage_g_job_start(
							acct_db_conn, job_ptr);
				continue;
			}
		} else
			job_ptr->time_limit = orig_time_limit;

		if (later_start && (job_ptr->start_time > later_start)) {
			/* Try later when some nodes currently reserved for
			 * pending jobs are free */
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		if (job_ptr->start_time > (sched_start + backfill_window)) {
			/* Starts too far in the future to worry about */
			continue;
		}

		if (node_space_recs >= max_backfill_job_cnt) {
			/* Already have too many jobs to deal with */
			break;
		}

		end_reserve = job_ptr->start_time + (time_limit * 60);
		if (_test_resv_overlap(node_space, avail_bitmap,
				       job_ptr->start_time, end_reserve)) {
			/* This job overlaps with an existing reservation for
			 * job to be backfill scheduled, which the sched
			 * plugin does not know about. Try again later.
			 */
			later_start = job_ptr->start_time;
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		/*
		 * Add reservation to scheduling table if appropriate
		 */
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
			continue;
		reject_array_job_id = 0;
		bit_not(avail_bitmap);
		_add_reservation(job_ptr->start_time, end_reserve,
				 avail_bitmap, node_space, &node_space_recs);
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_node_space_table(node_space);
	}
	xfree(uid);
	xfree(njobs);
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);
	/* Release every window bitmap in the node_space list */
	for (i=0; ; ) {
		FREE_NULL_BITMAP(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(node_space);
	list_destroy(job_queue);
	gettimeofday(&bf_time2, NULL);
	_do_diag_stats(&bf_time1, &bf_time2, yield_sleep);
	if (debug_flags & DEBUG_FLAG_BACKFILL) {
		END_TIMER;
		info("backfill: completed testing %d jobs, %s",
		     job_test_count, TIME_STR);
	}
	return rc;
}
/* Report whether any preemption mode is configured for the cluster. */
extern bool preemption_enabled(void)
{
	bool enabled = (slurm_get_preempt_mode() != PREEMPT_MODE_OFF);

	return enabled;
}
/* cr_job_test - does most of the real work for select_p_job_test(), which
 *	includes contiguous selection, load-leveling and max_share logic
 *
 * PROCEDURE:
 *
 * Step 1: compare nodes in "avail" bitmap with current node state data
 *         to find available nodes that match the job request
 *
 * Step 2: check resources in "avail" bitmap with allocated resources from
 *         higher priority partitions (busy resources are UNavailable)
 *
 * Step 3: select resource usage on remaining resources in "avail" bitmap
 *         for this job, with the placement influenced by existing
 *         allocations
 */
extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, int mode,
			uint16_t cr_type, enum node_cr_state job_node_req,
			uint32_t cr_node_cnt,
			struct part_res_record *cr_part_ptr,
			struct node_use_record *node_usage)
{
	static int gang_mode = -1;
	int error_code = SLURM_SUCCESS;
	bitstr_t *orig_map, *avail_cores, *free_cores;
	bitstr_t *tmpcore = NULL;
	bool test_only;
	uint32_t c, i, j, k, n, csize, save_mem = 0;
	int first_bit;	/* signed holder for bit_ffs() result */
	job_resources_t *job_res;
	struct job_details *details_ptr;
	struct part_res_record *p_ptr, *jp_ptr;
	uint16_t *cpu_count;

	/* Latch the gang scheduling configuration on first call */
	if (gang_mode == -1) {
		if (slurm_get_preempt_mode() & PREEMPT_MODE_GANG)
			gang_mode = 1;
		else
			gang_mode = 0;
	}

	details_ptr = job_ptr->details;

	free_job_resources(&job_ptr->job_resrcs);

	if (mode == SELECT_MODE_TEST_ONLY)
		test_only = true;
	else	/* SELECT_MODE_RUN_NOW || SELECT_MODE_WILL_RUN */
		test_only = false;

	/* check node_state and update the node bitmap as necessary */
	if (!test_only) {
		error_code = _verify_node_state(cr_part_ptr, job_ptr,
						bitmap, cr_type, node_usage,
						job_node_req);
		if (error_code != SLURM_SUCCESS)
			return error_code;
	}

	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: evaluating job %u on %u nodes",
		     job_ptr->job_id, bit_set_count(bitmap));
	}

	orig_map = bit_copy(bitmap);
	avail_cores = _make_core_bitmap(bitmap);

	/* test to make sure that this job can succeed with all avail_cores
	 * if 'no' then return FAIL
	 * if 'yes' then we will seek the optimal placement for this job
	 *          within avail_cores */
	free_cores = bit_copy(avail_cores);
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count == NULL) {
		/* job cannot fit */
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 0 fail: "
			     "insufficient resources");
		}
		return SLURM_ERROR;
	} else if (test_only) {
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		xfree(cpu_count);
		/* NOTE: repaired corrupted string concatenation here
		 * (stray "******" between the two literals) */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("select/serial: cr_job_test: test 0 pass: "
			     "test_only");
		return SLURM_SUCCESS;
	}
	if (cr_type == CR_MEMORY) {
		/* CR_MEMORY does not care about existing CPU allocations,
		 * so we can jump right to job allocation from here */
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 0 pass - "
		     "job fits on given resources");
	}

	/* now that we know that this job can run with the given resources,
	 * let's factor in the existing allocations and seek the optimal set
	 * of resources for this job. Here is the procedure:
	 *
	 * Step 1: Seek idle CPUs across all partitions. If successful then
	 *         place job and exit. If not successful, then continue. Two
	 *         related items to note:
	 *          1. Jobs that don't share CPUs finish with step 1.
	 *          2. The remaining steps assume sharing or preemption.
	 *
	 * Step 2: Remove resources that are in use by higher-priority
	 *         partitions, and test that job can still succeed. If not
	 *         then exit.
	 *
	 * Step 3: Seek idle nodes among the partitions with the same
	 *         priority as the job's partition. If successful then
	 *         goto Step 6. If not then continue:
	 *
	 * Step 4: Seek placement within the job's partition. Search
	 *         row-by-row. If no placement if found, then exit. If a row
	 *         is found, then continue:
	 *
	 * Step 5: Place job and exit. FIXME! Here is where we need a
	 *         placement algorithm that recognizes existing job
	 *         boundaries and tries to "overlap jobs" as efficiently
	 *         as possible.
	 *
	 * Step 6: Place job and exit. FIXME! here is we use a placement
	 *         algorithm similar to Step 5 on jobs from lower-priority
	 *         partitions.
	 */

	/*** Step 1 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	/* remove all existing allocations from free_cores */
	tmpcore = bit_copy(free_cores);
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count) {
		/* job fits! We're done. */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 1 pass - "
			     "idle resources found");
		}
		goto alloc_job;
	}

	if ((gang_mode == 0) && (job_node_req == NODE_CR_ONE_ROW)) {
		/* This job CANNOT share CPUs regardless of priority,
		 * so we fail here. Note that Shared=EXCLUSIVE was already
		 * addressed in _verify_node_state() and job preemption
		 * removes jobs from simulated resource allocation map
		 * before this point. */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 1 fail - "
			     "no idle resources available");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 1 fail - "
		     "not enough idle resources");
	}

	/*** Step 2 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);
	for (jp_ptr = cr_part_ptr; jp_ptr; jp_ptr = jp_ptr->next) {
		if (jp_ptr->part_ptr == job_ptr->part_ptr)
			break;
	}
	if (!jp_ptr) {
		fatal("select/serial: could not find partition for job %u",
		      job_ptr->job_id);
		return SLURM_ERROR;	/* Fix CLANG false positive */
	}

	/* remove existing allocations (jobs) from higher-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if ((p_ptr->part_ptr->priority <=
		     jp_ptr->part_ptr->priority) &&
		    (p_ptr->part_ptr->preempt_mode != PREEMPT_MODE_OFF))
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	/* make these changes permanent */
	bit_copybits(avail_cores, free_cores);
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (!cpu_count) {
		/* job needs resources that are currently in use by
		 * higher-priority jobs, so fail for now */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 2 fail - "
			     "resources busy with higher priority jobs");
		}
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 2 pass - "
		     "available resources for this priority");
	}

	/*** Step 3 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	/* remove existing allocations (jobs) from same-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (p_ptr->part_ptr->priority != jp_ptr->part_ptr->priority)
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count) {
		/* jobs from low-priority partitions are the only thing left
		 * in our way. for now we'll ignore them, but FIXME: we need
		 * a good placement algorithm here that optimizes "job
		 * overlap" between this job (in these idle nodes) and the
		 * low-priority jobs */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 3 pass - "
			     "found resources");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 3 fail - "
		     "not enough idle resources in same priority");
	}

	/*** Step 4 ***/
	/* try to fit the job into an existing row
	 *
	 * tmpcore = worker core_bitmap
	 * free_cores = core_bitmap to be built
	 * avail_cores = static core_bitmap of all available cores
	 */
	if (!jp_ptr || !jp_ptr->row) {
		/* there's no existing jobs in this partition, so place
		 * the job in avail_cores. FIXME: still need a good
		 * placement algorithm here that optimizes "job overlap"
		 * between this job (in these idle nodes) and existing
		 * jobs in the other partitions with <= priority to
		 * this partition */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 4 pass - "
			     "first row found");
		}
		goto alloc_job;
	}

	cr_sort_part_rows(jp_ptr);
	c = jp_ptr->num_rows;
	if (job_node_req != NODE_CR_AVAILABLE)
		c = 1;
	for (i = 0; i < c; i++) {
		if (!jp_ptr->row[i].row_bitmap)
			break;
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		bit_copybits(tmpcore, jp_ptr->row[i].row_bitmap);
		bit_not(tmpcore);
		bit_and(free_cores, tmpcore);
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (cpu_count) {
			if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
				info("select/serial: cr_job_test: "
				     "test 4 pass - row %i", i);
			}
			break;
		}
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: "
			     "test 4 fail - row %i", i);
		}
	}

	if ((i < c) && !jp_ptr->row[i].row_bitmap) {
		/* we've found an empty row, so use it */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: "
			     "test 4 trying empty row %i",i);
		}
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
	}

	if (!cpu_count) {
		/* job can't fit into any row, so exit */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 4 fail - "
			     "busy partition");
		}
		goto alloc_job;
	}

	/*** CONSTRUCTION ZONE FOR STEPs 5 AND 6 ***
	 * Note that while the job may have fit into a row, it should
	 * still be run through a good placement algorithm here that
	 * optimizes "job overlap" between this job (in these idle nodes)
	 * and existing jobs in the other partitions with <= priority to
	 * this partition */

alloc_job:
	/* at this point we've found a good set of
	 * bits to allocate to this job:
	 * - bitmap is the set of nodes to allocate
	 * - free_cores is the set of allocated cores
	 * - cpu_count is the number of cpus per allocated node
	 *
	 * Next steps are to cleanup the worker variables,
	 * create the job_resources struct,
	 * distribute the job on the bits, and exit
	 */
	FREE_NULL_BITMAP(orig_map);
	FREE_NULL_BITMAP(avail_cores);
	FREE_NULL_BITMAP(tmpcore);
	if (!cpu_count) {
		/* we were sent here to cleanup and exit */
		FREE_NULL_BITMAP(free_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: exiting cr_job_test with no "
			     "allocation");
		}
		return SLURM_ERROR;
	}

	/* At this point we have:
	 * - a bitmap of selected nodes
	 * - a free_cores bitmap of usable cores on each selected node
	 * - a per-alloc-node cpu_count array
	 */

	if ((mode != SELECT_MODE_WILL_RUN) && (job_ptr->part_ptr == NULL))
		error_code = EINVAL;
	if ((error_code == SLURM_SUCCESS) && (mode == SELECT_MODE_WILL_RUN))
		job_ptr->total_cpus = 1;

	if ((error_code != SLURM_SUCCESS) || (mode != SELECT_MODE_RUN_NOW)) {
		FREE_NULL_BITMAP(free_cores);
		xfree(cpu_count);
		return error_code;
	}

	/* BUG FIX: bit_ffs() returns -1 when no bit is set, but "n" is
	 * unsigned so the former "if (n < 0)" test could never be true.
	 * Use a signed intermediate so an empty node bitmap is detected. */
	first_bit = bit_ffs(bitmap);
	if (first_bit < 0) {
		FREE_NULL_BITMAP(free_cores);
		xfree(cpu_count);
		return error_code;
	}
	n = first_bit;
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: distributing job %u",
		     job_ptr->job_id);
	}

	/** create the struct_job_res **/
	job_res = create_job_resources();
	job_res->node_bitmap = bit_copy(bitmap);
	job_res->nodes = bitmap2node_name(bitmap);
	job_res->nhosts = bit_set_count(bitmap);
	job_res->ncpus = job_res->nhosts;
	if (job_ptr->details->ntasks_per_node)
		job_res->ncpus *= details_ptr->ntasks_per_node;
	job_res->ncpus = MAX(job_res->ncpus, details_ptr->min_cpus);
	job_res->ncpus = MAX(job_res->ncpus, details_ptr->pn_min_cpus);
	job_res->node_req = job_node_req;
	job_res->cpus = cpu_count;
	job_res->cpus_used = xmalloc(job_res->nhosts * sizeof(uint16_t));
	job_res->memory_allocated = xmalloc(job_res->nhosts *
					    sizeof(uint32_t));
	job_res->memory_used = xmalloc(job_res->nhosts * sizeof(uint32_t));

	/* store the hardware data for the selected nodes */
	error_code = build_job_resources(job_res, node_record_table_ptr,
					 select_fast_schedule);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_res);
		FREE_NULL_BITMAP(free_cores);
		return error_code;
	}

	/* Mark the first free core of the first allocated node
	 * (select/serial allocates a single core) */
	c = 0;
	csize = bit_size(job_res->core_bitmap);
	j = cr_get_coremap_offset(n);
	k = cr_get_coremap_offset(n + 1);
	for (; j < k; j++, c++) {
		if (!bit_test(free_cores, j))
			continue;
		if (c >= csize) {
			error("select/serial: cr_job_test "
			      "core_bitmap index error on node %s",
			      select_node_record[n].node_ptr->name);
			drain_nodes(select_node_record[n].node_ptr->name,
				    "Bad core count", getuid());
			free_job_resources(&job_res);
			FREE_NULL_BITMAP(free_cores);
			return SLURM_ERROR;
		}
		bit_set(job_res->core_bitmap, c);
		break;
	}

	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: job %u ncpus %u cbits %u/%d "
		     "nbits %u", job_ptr->job_id, job_res->ncpus,
		     bit_set_count(free_cores), 1, job_res->nhosts);
	}
	FREE_NULL_BITMAP(free_cores);

	/* distribute the tasks and clear any unused cores */
	job_ptr->job_resrcs = job_res;
	error_code = cr_dist(job_ptr, cr_type);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_ptr->job_resrcs);
		return error_code;
	}

	/* translate job_res->cpus array into format with rep count */
	job_ptr->total_cpus = build_job_resources_cpu_array(job_res);

	if (!(cr_type & CR_MEMORY))
		return error_code;

	/* load memory allocated array */
	save_mem = details_ptr->pn_min_memory;
	if (save_mem & MEM_PER_CPU) {
		/* memory is per-cpu */
		save_mem &= (~MEM_PER_CPU);
		job_res->memory_allocated[0] = job_res->cpus[0] * save_mem;
	} else {
		/* memory is per-node */
		job_res->memory_allocated[0] = save_mem;
	}
	return error_code;
}
/*
 * slurm_sprint_partition_info - output information about a specific Slurm
 *	partition based upon message as loaded using slurm_load_partitions
 * IN part_ptr - an individual partition information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *	     NULL is returned on failure.
 *
 * Output is built incrementally into "out" with xstrcat(); records are
 * separated by a single space in one-liner mode, or by "\n " otherwise.
 */
char *slurm_sprint_partition_info ( partition_info_t * part_ptr,
				    int one_liner )
{
	char tmp1[16], tmp2[16];
	/* scratch buffer for each "Key=value" fragment before appending */
	char tmp_line[MAXHOSTRANGELEN];
	char *out = NULL;
	char *allow_deny, *value;
	uint16_t force, preempt_mode, val;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	/****** Line 1 ******/
	snprintf(tmp_line, sizeof(tmp_line), "PartitionName=%s",
		 part_ptr->name);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line 2 ******/
	/* NULL or empty allow_groups means no restriction */
	if ((part_ptr->allow_groups == NULL) ||
	    (part_ptr->allow_groups[0] == '\0'))
		sprintf(tmp_line, "AllowGroups=ALL");
	else {
		snprintf(tmp_line, sizeof(tmp_line), "AllowGroups=%s",
			 part_ptr->allow_groups);
	}
	xstrcat(out, tmp_line);

	/* Allow and Deny account lists are mutually exclusive: print the
	 * Deny form only when deny_accounts is set and allow_accounts is
	 * not (both NULL prints AllowAccounts=ALL) */
	if (part_ptr->allow_accounts || !part_ptr->deny_accounts) {
		allow_deny = "Allow";
		if ((part_ptr->allow_accounts == NULL) ||
		    (part_ptr->allow_accounts[0] == '\0'))
			value = "ALL";
		else
			value = part_ptr->allow_accounts;
	} else {
		allow_deny = "Deny";
		value = part_ptr->deny_accounts;
	}
	snprintf(tmp_line, sizeof(tmp_line), " %sAccounts=%s",
		 allow_deny, value);
	xstrcat(out, tmp_line);

	/* Same Allow/Deny selection logic for QOS lists */
	if (part_ptr->allow_qos || !part_ptr->deny_qos) {
		allow_deny = "Allow";
		if ((part_ptr->allow_qos == NULL) ||
		    (part_ptr->allow_qos[0] == '\0'))
			value = "ALL";
		else
			value = part_ptr->allow_qos;
	} else {
		allow_deny = "Deny";
		value = part_ptr->deny_qos;
	}
	snprintf(tmp_line, sizeof(tmp_line), " %sQos=%s",
		 allow_deny, value);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line 3 ******/
	if (part_ptr->allow_alloc_nodes == NULL)
		snprintf(tmp_line, sizeof(tmp_line), "AllocNodes=%s","ALL");
	else
		snprintf(tmp_line, sizeof(tmp_line), "AllocNodes=%s",
			 part_ptr->allow_alloc_nodes);
	xstrcat(out, tmp_line);

	/* Alternate partition is only printed when configured */
	if (part_ptr->alternate != NULL) {
		snprintf(tmp_line, sizeof(tmp_line), " Alternate=%s",
			 part_ptr->alternate);
		xstrcat(out, tmp_line);
	}

	if (part_ptr->flags & PART_FLAG_DEFAULT)
		sprintf(tmp_line, " Default=YES");
	else
		sprintf(tmp_line, " Default=NO");
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line 4 added here for BG partitions only
	 ****** to maintain alphabetized output ******/
	if (cluster_flags & CLUSTER_FLAG_BG) {
		snprintf(tmp_line, sizeof(tmp_line), "BasePartitions=%s",
			 part_ptr->nodes);
		xstrcat(out, tmp_line);
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n ");
	}

	/****** Line 5 ******/
	/* default_time is in minutes; INFINITE and NO_VAL are sentinels */
	if (part_ptr->default_time == INFINITE)
		sprintf(tmp_line, "DefaultTime=UNLIMITED");
	else if (part_ptr->default_time == NO_VAL)
		sprintf(tmp_line, "DefaultTime=NONE");
	else {
		char time_line[32];
		secs2time_str(part_ptr->default_time * 60, time_line,
			      sizeof(time_line));
		sprintf(tmp_line, "DefaultTime=%s", time_line);
	}
	xstrcat(out, tmp_line);
	if (part_ptr->flags & PART_FLAG_NO_ROOT)
		sprintf(tmp_line, " DisableRootJobs=YES");
	else
		sprintf(tmp_line, " DisableRootJobs=NO");
	xstrcat(out, tmp_line);
	sprintf(tmp_line, " GraceTime=%u", part_ptr->grace_time);
	xstrcat(out, tmp_line);
	if (part_ptr->flags & PART_FLAG_HIDDEN)
		sprintf(tmp_line, " Hidden=YES");
	else
		sprintf(tmp_line, " Hidden=NO");
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line 6 ******/
	/* On BlueGene systems node counts are scaled for display */
	if (part_ptr->max_nodes == INFINITE)
		sprintf(tmp_line, "MaxNodes=UNLIMITED");
	else {
		if (cluster_flags & CLUSTER_FLAG_BG)
			convert_num_unit((float)part_ptr->max_nodes,
					 tmp1, sizeof(tmp1), UNIT_NONE);
		else
			snprintf(tmp1, sizeof(tmp1),"%u",
				 part_ptr->max_nodes);
		sprintf(tmp_line, "MaxNodes=%s", tmp1);
	}
	xstrcat(out, tmp_line);
	if (part_ptr->max_time == INFINITE)
		sprintf(tmp_line, " MaxTime=UNLIMITED");
	else {
		char time_line[32];
		secs2time_str(part_ptr->max_time * 60, time_line,
			      sizeof(time_line));
		sprintf(tmp_line, " MaxTime=%s", time_line);
	}
	xstrcat(out, tmp_line);
	if (cluster_flags & CLUSTER_FLAG_BG)
		convert_num_unit((float)part_ptr->min_nodes, tmp1,
				 sizeof(tmp1), UNIT_NONE);
	else
		snprintf(tmp1, sizeof(tmp1), "%u", part_ptr->min_nodes);
	sprintf(tmp_line, " MinNodes=%s", tmp1);
	xstrcat(out, tmp_line);
	if (part_ptr->max_cpus_per_node == INFINITE)
		sprintf(tmp_line, " MaxCPUsPerNode=UNLIMITED");
	else {
		sprintf(tmp_line, " MaxCPUsPerNode=%u",
			part_ptr->max_cpus_per_node);
	}
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line added here for non BG nodes
	 to keep with alphabetized output******/
	if (!(cluster_flags & CLUSTER_FLAG_BG)) {
		snprintf(tmp_line, sizeof(tmp_line), "Nodes=%s",
			 part_ptr->nodes);
		xstrcat(out, tmp_line);
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n ");
	}

	/****** Line 7 ******/
	sprintf(tmp_line, "Priority=%u", part_ptr->priority);
	xstrcat(out, tmp_line);
	if (part_ptr->flags & PART_FLAG_ROOT_ONLY)
		sprintf(tmp_line, " RootOnly=YES");
	else
		sprintf(tmp_line, " RootOnly=NO");
	xstrcat(out, tmp_line);
	if (part_ptr->flags & PART_FLAG_REQ_RESV)
		sprintf(tmp_line, " ReqResv=YES");
	else
		sprintf(tmp_line, " ReqResv=NO");
	xstrcat(out, tmp_line);

	/* max_share packs a FORCE flag bit with the share count */
	force = part_ptr->max_share & SHARED_FORCE;
	val = part_ptr->max_share & (~SHARED_FORCE);
	if (val == 0)
		xstrcat(out, " Shared=EXCLUSIVE");
	else if (force) {
		sprintf(tmp_line, " Shared=FORCE:%u", val);
		xstrcat(out, tmp_line);
	} else if (val == 1)
		xstrcat(out, " Shared=NO");
	else {
		sprintf(tmp_line, " Shared=YES:%u", val);
		xstrcat(out, tmp_line);
	}

	preempt_mode = part_ptr->preempt_mode;
	if (preempt_mode == (uint16_t) NO_VAL)
		preempt_mode = slurm_get_preempt_mode(); /* use cluster param */
	snprintf(tmp_line, sizeof(tmp_line), " PreemptMode=%s",
		 preempt_mode_string(preempt_mode));
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line 8 ******/
	if (part_ptr->state_up == PARTITION_UP)
		sprintf(tmp_line, "State=UP");
	else if (part_ptr->state_up == PARTITION_DOWN)
		sprintf(tmp_line, "State=DOWN");
	else if (part_ptr->state_up == PARTITION_INACTIVE)
		sprintf(tmp_line, "State=INACTIVE");
	else if (part_ptr->state_up == PARTITION_DRAIN)
		sprintf(tmp_line, "State=DRAIN");
	else
		sprintf(tmp_line, "State=UNKNOWN");
	xstrcat(out, tmp_line);

	if (cluster_flags & CLUSTER_FLAG_BG)
		convert_num_unit((float)part_ptr->total_cpus, tmp1,
				 sizeof(tmp1), UNIT_NONE);
	else
		snprintf(tmp1, sizeof(tmp1), "%u", part_ptr->total_cpus);
	sprintf(tmp_line, " TotalCPUs=%s", tmp1);
	xstrcat(out, tmp_line);

	if (cluster_flags & CLUSTER_FLAG_BG)
		convert_num_unit((float)part_ptr->total_nodes, tmp2,
				 sizeof(tmp2), UNIT_NONE);
	else
		snprintf(tmp2, sizeof(tmp2), "%u", part_ptr->total_nodes);
	sprintf(tmp_line, " TotalNodes=%s", tmp2);
	xstrcat(out, tmp_line);

	if (part_ptr->cr_type & CR_CORE)
		sprintf(tmp_line, " SelectTypeParameters=CR_CORE");
	else if (part_ptr->cr_type & CR_SOCKET)
		sprintf(tmp_line, " SelectTypeParameters=CR_SOCKET");
	else
		sprintf(tmp_line, " SelectTypeParameters=N/A");
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line 9 ******/
	/* MEM_PER_CPU is a flag bit: set means the value is a per-CPU
	 * limit, clear means per-node; zero means unlimited */
	if (part_ptr->def_mem_per_cpu & MEM_PER_CPU) {
		snprintf(tmp_line, sizeof(tmp_line), "DefMemPerCPU=%u",
			 part_ptr->def_mem_per_cpu & (~MEM_PER_CPU));
		xstrcat(out, tmp_line);
	} else if (part_ptr->def_mem_per_cpu == 0) {
		xstrcat(out, "DefMemPerNode=UNLIMITED");
	} else {
		snprintf(tmp_line, sizeof(tmp_line), "DefMemPerNode=%u",
			 part_ptr->def_mem_per_cpu);
		xstrcat(out, tmp_line);
	}
	if (part_ptr->max_mem_per_cpu & MEM_PER_CPU) {
		snprintf(tmp_line, sizeof(tmp_line), " MaxMemPerCPU=%u",
			 part_ptr->max_mem_per_cpu & (~MEM_PER_CPU));
		xstrcat(out, tmp_line);
	} else if (part_ptr->max_mem_per_cpu == 0) {
		xstrcat(out, " MaxMemPerNode=UNLIMITED");
	} else {
		snprintf(tmp_line, sizeof(tmp_line), " MaxMemPerNode=%u",
			 part_ptr->max_mem_per_cpu);
		xstrcat(out, tmp_line);
	}

	/* Terminate record: blank line between records in multi-line mode */
	if (one_liner)
		xstrcat(out, "\n");
	else
		xstrcat(out, "\n\n");

	return out;
}
/*
 * slurm_sprint_partition_info - output information about a specific Slurm
 *	partition based upon message as loaded using slurm_load_partitions
 * IN part_ptr - an individual partition information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *	     NULL is returned on failure.
 *
 * Newer variant: builds output with xstrfmtcat() (growing heap string)
 * instead of fixed scratch buffers, and factors the record separator
 * into line_end.
 */
char *slurm_sprint_partition_info ( partition_info_t * part_ptr,
				    int one_liner )
{
	char tmp[16];
	char *out = NULL;
	char *allow_deny, *value;
	uint16_t force, preempt_mode, val;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	/* separator between logical output lines */
	char *line_end = (one_liner) ? " " : "\n ";

	/****** Line 1 ******/
	xstrfmtcat(out, "PartitionName=%s", part_ptr->name);
	xstrcat(out, line_end);

	/****** Line 2 ******/
	/* NULL or empty allow_groups means no restriction */
	if ((part_ptr->allow_groups == NULL) ||
	    (part_ptr->allow_groups[0] == '\0'))
		xstrcat(out, "AllowGroups=ALL");
	else {
		xstrfmtcat(out, "AllowGroups=%s", part_ptr->allow_groups);
	}

	/* Allow and Deny account lists are mutually exclusive: print the
	 * Deny form only when deny_accounts is set and allow_accounts is
	 * not (both NULL prints AllowAccounts=ALL) */
	if (part_ptr->allow_accounts || !part_ptr->deny_accounts) {
		allow_deny = "Allow";
		if ((part_ptr->allow_accounts == NULL) ||
		    (part_ptr->allow_accounts[0] == '\0'))
			value = "ALL";
		else
			value = part_ptr->allow_accounts;
	} else {
		allow_deny = "Deny";
		value = part_ptr->deny_accounts;
	}
	xstrfmtcat(out, " %sAccounts=%s", allow_deny, value);

	/* Same Allow/Deny selection logic for QOS lists */
	if (part_ptr->allow_qos || !part_ptr->deny_qos) {
		allow_deny = "Allow";
		if ((part_ptr->allow_qos == NULL) ||
		    (part_ptr->allow_qos[0] == '\0'))
			value = "ALL";
		else
			value = part_ptr->allow_qos;
	} else {
		allow_deny = "Deny";
		value = part_ptr->deny_qos;
	}
	xstrfmtcat(out, " %sQos=%s", allow_deny, value);
	xstrcat(out, line_end);

	/****** Line 3 ******/
	if (part_ptr->allow_alloc_nodes == NULL)
		xstrcat(out, "AllocNodes=ALL");
	else
		xstrfmtcat(out, "AllocNodes=%s",
			   part_ptr->allow_alloc_nodes);

	/* Alternate partition is only printed when configured */
	if (part_ptr->alternate != NULL) {
		xstrfmtcat(out, " Alternate=%s", part_ptr->alternate);
	}

	if (part_ptr->flags & PART_FLAG_DEFAULT)
		xstrcat(out, " Default=YES");
	else
		xstrcat(out, " Default=NO");

	if (part_ptr->qos_char)
		xstrfmtcat(out, " QoS=%s", part_ptr->qos_char);
	else
		xstrcat(out, " QoS=N/A");
	xstrcat(out, line_end);

	/****** Line 4 added here for BG partitions only
	 ****** to maintain alphabetized output ******/
	if (cluster_flags & CLUSTER_FLAG_BG) {
		xstrfmtcat(out, "Midplanes=%s", part_ptr->nodes);
		xstrcat(out, line_end);
	}

	/****** Line 5 ******/
	/* default_time is in minutes; INFINITE and NO_VAL are sentinels */
	if (part_ptr->default_time == INFINITE)
		xstrcat(out, "DefaultTime=UNLIMITED");
	else if (part_ptr->default_time == NO_VAL)
		xstrcat(out, "DefaultTime=NONE");
	else {
		char time_line[32];
		secs2time_str(part_ptr->default_time * 60, time_line,
			      sizeof(time_line));
		xstrfmtcat(out, "DefaultTime=%s", time_line);
	}
	if (part_ptr->flags & PART_FLAG_NO_ROOT)
		xstrcat(out, " DisableRootJobs=YES");
	else
		xstrcat(out, " DisableRootJobs=NO");
	if (part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)
		xstrcat(out, " ExclusiveUser=YES");
	else
		xstrcat(out, " ExclusiveUser=NO");
	xstrfmtcat(out, " GraceTime=%u", part_ptr->grace_time);
	if (part_ptr->flags & PART_FLAG_HIDDEN)
		xstrcat(out, " Hidden=YES");
	else
		xstrcat(out, " Hidden=NO");
	xstrcat(out, line_end);

	/****** Line 6 ******/
	/* On BlueGene systems node counts are scaled for display */
	if (part_ptr->max_nodes == INFINITE)
		xstrcat(out, "MaxNodes=UNLIMITED");
	else {
		if (cluster_flags & CLUSTER_FLAG_BG) {
			convert_num_unit((float)part_ptr->max_nodes, tmp,
					 sizeof(tmp), UNIT_NONE, NO_VAL,
					 CONVERT_NUM_UNIT_EXACT);
			xstrfmtcat(out, "MaxNodes=%s", tmp);
		} else
			xstrfmtcat(out, "MaxNodes=%u",
				   part_ptr->max_nodes);
	}
	if (part_ptr->max_time == INFINITE)
		xstrcat(out, " MaxTime=UNLIMITED");
	else {
		char time_line[32];
		secs2time_str(part_ptr->max_time * 60, time_line,
			      sizeof(time_line));
		xstrfmtcat(out, " MaxTime=%s", time_line);
	}
	if (cluster_flags & CLUSTER_FLAG_BG) {
		convert_num_unit((float)part_ptr->min_nodes, tmp,
				 sizeof(tmp), UNIT_NONE, NO_VAL,
				 CONVERT_NUM_UNIT_EXACT);
		xstrfmtcat(out, " MinNodes=%s", tmp);
	} else
		xstrfmtcat(out, " MinNodes=%u", part_ptr->min_nodes);
	if (part_ptr->flags & PART_FLAG_LLN)
		xstrcat(out, " LLN=YES");
	else
		xstrcat(out, " LLN=NO");
	if (part_ptr->max_cpus_per_node == INFINITE)
		xstrcat(out, " MaxCPUsPerNode=UNLIMITED");
	else {
		xstrfmtcat(out, " MaxCPUsPerNode=%u",
			   part_ptr->max_cpus_per_node);
	}
	xstrcat(out, line_end);

	/****** Line added here for non BG nodes
	 to keep with alphabetized output******/
	if (!(cluster_flags & CLUSTER_FLAG_BG)) {
		xstrfmtcat(out, "Nodes=%s", part_ptr->nodes);
		xstrcat(out, line_end);
	}

	/****** Line 7 ******/
	xstrfmtcat(out, "PriorityJobFactor=%u",
		   part_ptr->priority_job_factor);
	xstrfmtcat(out, " PriorityTier=%u", part_ptr->priority_tier);
	if (part_ptr->flags & PART_FLAG_ROOT_ONLY)
		xstrcat(out, " RootOnly=YES");
	else
		xstrcat(out, " RootOnly=NO");
	if (part_ptr->flags & PART_FLAG_REQ_RESV)
		xstrcat(out, " ReqResv=YES");
	else
		xstrcat(out, " ReqResv=NO");

	/* max_share packs a FORCE flag bit with the share count */
	force = part_ptr->max_share & SHARED_FORCE;
	val = part_ptr->max_share & (~SHARED_FORCE);
	if (val == 0)
		xstrcat(out, " OverSubscribe=EXCLUSIVE");
	else if (force)
		xstrfmtcat(out, " OverSubscribe=FORCE:%u", val);
	else if (val == 1)
		xstrcat(out, " OverSubscribe=NO");
	else
		xstrfmtcat(out, " OverSubscribe=YES:%u", val);
	xstrcat(out, line_end);

	/****** Line ******/
	if (part_ptr->over_time_limit == NO_VAL16)
		xstrfmtcat(out, "OverTimeLimit=NONE");
	else if (part_ptr->over_time_limit == (uint16_t) INFINITE)
		xstrfmtcat(out, "OverTimeLimit=UNLIMITED");
	else
		xstrfmtcat(out, "OverTimeLimit=%u",
			   part_ptr->over_time_limit);

	preempt_mode = part_ptr->preempt_mode;
	if (preempt_mode == NO_VAL16)
		preempt_mode = slurm_get_preempt_mode(); /* use cluster param */
	xstrfmtcat(out, " PreemptMode=%s",
		   preempt_mode_string(preempt_mode));
	xstrcat(out, line_end);

	/****** Line ******/
	if (part_ptr->state_up == PARTITION_UP)
		xstrcat(out, "State=UP");
	else if (part_ptr->state_up == PARTITION_DOWN)
		xstrcat(out, "State=DOWN");
	else if (part_ptr->state_up == PARTITION_INACTIVE)
		xstrcat(out, "State=INACTIVE");
	else if (part_ptr->state_up == PARTITION_DRAIN)
		xstrcat(out, "State=DRAIN");
	else
		xstrcat(out, "State=UNKNOWN");

	if (cluster_flags & CLUSTER_FLAG_BG) {
		convert_num_unit((float)part_ptr->total_cpus, tmp,
				 sizeof(tmp), UNIT_NONE, NO_VAL,
				 CONVERT_NUM_UNIT_EXACT);
		xstrfmtcat(out, " TotalCPUs=%s", tmp);
	} else
		xstrfmtcat(out, " TotalCPUs=%u", part_ptr->total_cpus);

	if (cluster_flags & CLUSTER_FLAG_BG) {
		convert_num_unit((float)part_ptr->total_nodes, tmp,
				 sizeof(tmp), UNIT_NONE, NO_VAL,
				 CONVERT_NUM_UNIT_EXACT);
		xstrfmtcat(out, " TotalNodes=%s", tmp);
	} else
		xstrfmtcat(out, " TotalNodes=%u", part_ptr->total_nodes);

	xstrfmtcat(out, " SelectTypeParameters=%s",
		   select_type_param_string(part_ptr->cr_type));
	xstrcat(out, line_end);

	/****** Line 9 ******/
	/* MEM_PER_CPU is a flag bit: set means the value is a per-CPU
	 * limit, clear means per-node; value of exactly MEM_PER_CPU (no
	 * count bits) or zero means unlimited */
	if (part_ptr->def_mem_per_cpu & MEM_PER_CPU) {
		if (part_ptr->def_mem_per_cpu == MEM_PER_CPU) {
			xstrcat(out, "DefMemPerCPU=UNLIMITED");
		} else {
			xstrfmtcat(out, "DefMemPerCPU=%"PRIu64"",
				   part_ptr->def_mem_per_cpu &
				   (~MEM_PER_CPU));
		}
	} else if (part_ptr->def_mem_per_cpu == 0) {
		xstrcat(out, "DefMemPerNode=UNLIMITED");
	} else {
		xstrfmtcat(out, "DefMemPerNode=%"PRIu64"",
			   part_ptr->def_mem_per_cpu);
	}
	if (part_ptr->max_mem_per_cpu & MEM_PER_CPU) {
		if (part_ptr->max_mem_per_cpu == MEM_PER_CPU) {
			xstrcat(out, " MaxMemPerCPU=UNLIMITED");
		} else {
			xstrfmtcat(out, " MaxMemPerCPU=%"PRIu64"",
				   part_ptr->max_mem_per_cpu &
				   (~MEM_PER_CPU));
		}
	} else if (part_ptr->max_mem_per_cpu == 0) {
		xstrcat(out, " MaxMemPerNode=UNLIMITED");
	} else {
		xstrfmtcat(out, " MaxMemPerNode=%"PRIu64"",
			   part_ptr->max_mem_per_cpu);
	}

	/****** Line 10 ******/
	/* TRESBillingWeights is only printed when configured */
	if (part_ptr->billing_weights_str) {
		xstrcat(out, line_end);
		xstrfmtcat(out, "TRESBillingWeights=%s",
			   part_ptr->billing_weights_str);
	}

	/* Terminate record: blank line between records in multi-line mode */
	if (one_liner)
		xstrcat(out, "\n");
	else
		xstrcat(out, "\n\n");
	return out;
}
/*
 * _attempt_backfill - run one backfill scheduling cycle.
 *
 * Pops jobs off a priority-sorted queue and, for each, searches the
 * node_space time/bitmap table for the earliest window in which the job
 * could run without delaying higher-priority work. Jobs that can start
 * now are started; otherwise a resource reservation is added to the
 * table (unless the job's QOS has QOS_FLAG_NO_RESERVE).
 *
 * Periodically yields locks (_yield_locks) when RPCs are pending or the
 * scheduling time slice expires; if system state changed while yielded,
 * the cycle aborts.
 *
 * RET 0 normally, 1 if the cycle was aborted because system state
 *	changed while locks were yielded (see rc assignments below).
 */
static int _attempt_backfill(void)
{
	DEF_TIMERS;
	bool filter_root = false;
	List job_queue;
	job_queue_rec_t *job_queue_rec;
	slurmdb_qos_rec_t *qos_ptr = NULL;
	int i, j, node_space_recs;
	struct job_record *job_ptr;
	struct part_record *part_ptr, **bf_part_ptr = NULL;
	uint32_t end_time, end_reserve;
	uint32_t time_limit, comp_time_limit, orig_time_limit,
		part_time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL, *non_cg_bitmap = NULL;
	time_t now, sched_start, later_start, start_res, resv_end,
		window_end;
	node_space_map_t *node_space;
	struct timeval bf_time1, bf_time2;
	int rc = 0;
	int job_test_count = 0;
	uint32_t *uid = NULL, nuser = 0, bf_parts = 0,
		*bf_part_jobs = NULL;
	uint16_t *njobs = NULL;
	bool already_counted;
	uint32_t reject_array_job_id = 0;
	struct part_record *reject_array_part = NULL;
	uint32_t job_start_cnt = 0, start_time;
	/* snapshots used to detect config/partition changes after a
	 * lock yield */
	time_t config_update = slurmctld_conf.last_update;
	time_t part_update = last_part_update;
	struct timeval start_tv;

	bf_last_yields = 0;
#ifdef HAVE_ALPS_CRAY
	/*
	 * Run a Basil Inventory immediately before setting up the schedule
	 * plan, to avoid race conditions caused by ALPS node state change.
	 * Needs to be done with the node-state lock taken.
	 */
	START_TIMER;
	if (select_g_reconfigure()) {
		debug4("backfill: not scheduling due to ALPS");
		return SLURM_SUCCESS;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: ALPS inventory completed, %s", TIME_STR);

	/* The Basil inventory can take a long time to complete. Process
	 * pending RPCs before starting the backfill scheduling logic */
	_yield_locks(1000000);
#endif
	START_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: beginning");
	else
		debug("backfill: beginning");
	sched_start = now = time(NULL);
	gettimeofday(&start_tv, NULL);

	if (slurm_get_root_filter())
		filter_root = true;

	job_queue = build_job_queue(true, true);
	if (list_count(job_queue) == 0) {
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			info("backfill: no jobs to backfill");
		else
			debug("backfill: no jobs to backfill");
		list_destroy(job_queue);
		return 0;
	}

	gettimeofday(&bf_time1, NULL);

	/* nodes NOT in completing state */
	non_cg_bitmap = bit_copy(cg_node_bitmap);
	bit_not(non_cg_bitmap);

	slurmctld_diag_stats.bf_queue_len = list_count(job_queue);
	slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats.
						 bf_queue_len;
	slurmctld_diag_stats.bf_last_depth = 0;
	slurmctld_diag_stats.bf_last_depth_try = 0;
	slurmctld_diag_stats.bf_when_last_cycle = now;
	slurmctld_diag_stats.bf_active = 1;

	/* node_space is a linked list (via .next indices) of time windows,
	 * each with the bitmap of nodes available in that window; entry 0
	 * spans the whole backfill window */
	node_space = xmalloc(sizeof(node_space_map_t) *
			     (max_backfill_job_cnt * 2 + 1));
	node_space[0].begin_time = sched_start;
	window_end = sched_start + backfill_window;
	node_space[0].end_time = window_end;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
	if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
		_dump_node_space_table(node_space);

	if (max_backfill_job_per_part) {
		ListIterator part_iterator;
		/* NOTE: intentionally shadows the outer part_ptr for this
		 * initialization scope only */
		struct part_record *part_ptr;
		bf_parts = list_count(part_list);
		bf_part_ptr = xmalloc(sizeof(struct part_record *) *
				      bf_parts);
		bf_part_jobs = xmalloc(sizeof(int) * bf_parts);
		part_iterator = list_iterator_create(part_list);
		i = 0;
		while ((part_ptr = (struct part_record *)
				   list_next(part_iterator))) {
			bf_part_ptr[i++] = part_ptr;
		}
		list_iterator_destroy(part_iterator);
	}
	if (max_backfill_job_per_user) {
		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
	}
	sort_job_queue(job_queue);
	while (1) {
		job_queue_rec = (job_queue_rec_t *) list_pop(job_queue);
		if (!job_queue_rec) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: reached end of job queue");
			break;
		}
		/* NOTE(review): this break does not xfree(job_queue_rec),
		 * unlike the state-change break below — looks like a small
		 * leak on shutdown; confirm against upstream */
		if (slurmctld_config.shutdown_time)
			break;
		/* Yield locks when RPCs are backed up or the time slice
		 * for this cycle has expired */
		if (((defer_rpc_cnt > 0) &&
		     (slurmctld_config.server_thread_count >=
		      defer_rpc_cnt)) ||
		    (_delta_tv(&start_tv) >= sched_timeout)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %u(%d) jobs, %s",
				     slurmctld_diag_stats.bf_last_depth,
				     job_test_count, TIME_STR);
			}
			if ((_yield_locks(yield_sleep) &&
			     !backfill_continue) ||
			    (slurmctld_conf.last_update != config_update) ||
			    (last_part_update != part_update)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing "
					     "%u(%d) jobs",
					     slurmctld_diag_stats.
					     bf_last_depth,
					     job_test_count);
				}
				rc = 1;
				xfree(job_queue_rec);
				break;
			}
			/* cg_node_bitmap may be changed */
			bit_copybits(non_cg_bitmap, cg_node_bitmap);
			bit_not(non_cg_bitmap);
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			gettimeofday(&start_tv, NULL);
			job_test_count = 0;
			START_TIMER;
		}
		job_ptr = job_queue_rec->job_ptr;
		/* With bf_continue configured, the original job could have
		 * been cancelled and purged. Validate pointer here. */
		if ((job_ptr->magic != JOB_MAGIC) ||
		    (job_ptr->job_id != job_queue_rec->job_id)) {
			xfree(job_queue_rec);
			continue;
		}
		orig_time_limit = job_ptr->time_limit;
		part_ptr = job_queue_rec->part_ptr;
		job_test_count++;
		slurmctld_diag_stats.bf_last_depth++;
		already_counted = false;
		xfree(job_queue_rec);
		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* started in other partition */
		if (!avail_front_end(job_ptr))
			continue;	/* No available frontend for this job */
		if (job_ptr->array_task_id != NO_VAL) {
			if ((reject_array_job_id == job_ptr->array_job_id) &&
			    (reject_array_part == part_ptr))
				continue;  /* already rejected array element */
			/* assume reject whole array for now, clear if OK */
			reject_array_job_id = job_ptr->array_job_id;
			reject_array_part = part_ptr;
		}
		job_ptr->part_ptr = part_ptr;

		if (debug_flags & DEBUG_FLAG_BACKFILL) {
			info("backfill test for JobID=%u Prio=%u Partition=%s",
			     job_ptr->job_id, job_ptr->priority,
			     job_ptr->part_ptr->name);
		}

		/* Enforce bf_max_job_part: count jobs tested per partition */
		if (max_backfill_job_per_part) {
			bool skip_job = false;
			for (j = 0; j < bf_parts; j++) {
				if (bf_part_ptr[j] != job_ptr->part_ptr)
					continue;
				if (bf_part_jobs[j]++ >=
				    max_backfill_job_per_part)
					skip_job = true;
				break;
			}
			if (skip_job) {
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					info("backfill: have already "
					     "checked %u jobs for "
					     "partition %s; skipping "
					     "job %u",
					     max_backfill_job_per_part,
					     job_ptr->part_ptr->name,
					     job_ptr->job_id);
				continue;
			}
		}
		/* Enforce bf_max_job_user: count jobs tested per user;
		 * users beyond BF_MAX_USERS are not limited */
		if (max_backfill_job_per_user) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->user_id == uid[j]) {
					njobs[j]++;
					if (debug_flags &
					    DEBUG_FLAG_BACKFILL)
						debug("backfill: user %u: "
						      "#jobs %u",
						      uid[j], njobs[j]);
					break;
				}
			}
			if (j == nuser) { /* user not found */
				static bool bf_max_user_msg = true;
				if (nuser < BF_MAX_USERS) {
					uid[j] = job_ptr->user_id;
					njobs[j] = 1;
					nuser++;
				} else if (bf_max_user_msg) {
					bf_max_user_msg = false;
					error("backfill: too many users in "
					      "queue. Consider increasing "
					      "BF_MAX_USERS");
				}
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					debug2("backfill: found new user %u. "
					       "Total #users now %u",
					       job_ptr->user_id, nuser);
			} else {
				if (njobs[j] >= max_backfill_job_per_user) {
					/* skip job */
					if (debug_flags &
					    DEBUG_FLAG_BACKFILL)
						info("backfill: have already "
						     "checked %u jobs for "
						     "user %u; skipping "
						     "job %u",
						     max_backfill_job_per_user,
						     job_ptr->user_id,
						     job_ptr->job_id);
					continue;
				}
			}
		}

		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
		    (part_ptr->node_bitmap == NULL) ||
		    ((part_ptr->flags & PART_FLAG_ROOT_ONLY) &&
		     filter_root)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: partition %s not usable",
				     job_ptr->part_ptr->name);
			continue;
		}

		if ((!job_independent(job_ptr, 0)) ||
		    (license_job_test(job_ptr, time(NULL)) !=
		     SLURM_SUCCESS)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u not runable now",
				     job_ptr->job_id);
			continue;
		}

		/* Determine minimum and maximum node counts */
		min_nodes = MAX(job_ptr->details->min_nodes,
				part_ptr->min_nodes);
		if (job_ptr->details->max_nodes == 0)
			max_nodes = part_ptr->max_nodes;
		else
			max_nodes = MIN(job_ptr->details->max_nodes,
					part_ptr->max_nodes);
		max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
		if (job_ptr->details->max_nodes)
			req_nodes = max_nodes;
		else
			req_nodes = min_nodes;
		if (min_nodes > max_nodes) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u node count too high",
				     job_ptr->job_id);
			continue;
		}

		/* Determine job's expected completion time */
		if (part_ptr->max_time == INFINITE)
			part_time_limit = 365 * 24 * 60; /* one year */
		else
			part_time_limit = part_ptr->max_time;
		if (job_ptr->time_limit == NO_VAL) {
			time_limit = part_time_limit;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_time_limit);
		}
		comp_time_limit = time_limit;
		qos_ptr = job_ptr->qos_ptr;
		/* NO_RESERVE QOS with preemption active: test with minimal
		 * (1 minute) time limit; restored further below */
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) &&
		    slurm_get_preempt_mode())
			time_limit = job_ptr->time_limit = 1;
		else if (job_ptr->time_min &&
			 (job_ptr->time_min < time_limit))
			time_limit = job_ptr->time_limit = job_ptr->time_min;

		/* Determine impact of any resource reservations */
		later_start = now;
 TRY_LATER:
		if (slurmctld_config.shutdown_time)
			break;
		/* Same lock-yield logic as at loop top, but must also
		 * save/restore this job's id and time limit across the
		 * yield */
		if (((defer_rpc_cnt > 0) &&
		     (slurmctld_config.server_thread_count >=
		      defer_rpc_cnt)) ||
		    (_delta_tv(&start_tv) >= sched_timeout)) {
			uint32_t save_job_id = job_ptr->job_id;
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %u(%d) jobs, %s",
				     slurmctld_diag_stats.bf_last_depth,
				     job_test_count, TIME_STR);
			}
			if ((_yield_locks(yield_sleep) &&
			     !backfill_continue) ||
			    (slurmctld_conf.last_update != config_update) ||
			    (last_part_update != part_update)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing "
					     "%u(%d) jobs",
					     slurmctld_diag_stats.
					     bf_last_depth,
					     job_test_count);
				}
				rc = 1;
				break;
			}
			/* cg_node_bitmap may be changed */
			bit_copybits(non_cg_bitmap, cg_node_bitmap);
			bit_not(non_cg_bitmap);

			/* With bf_continue configured, the original job could
			 * have been scheduled or cancelled and purged.
			 * Revalidate job the record here. */
			if ((job_ptr->magic != JOB_MAGIC) ||
			    (job_ptr->job_id != save_job_id))
				continue;
			if (!IS_JOB_PENDING(job_ptr))
				continue;
			if (!avail_front_end(job_ptr))
				continue;	/* No available frontend */
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			gettimeofday(&start_tv, NULL);
			job_test_count = 1;
			START_TIMER;
		}

		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		start_res = later_start;
		later_start = 0;
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				  &exc_core_bitmap);
		if (j != SLURM_SUCCESS) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u reservation defer",
				     job_ptr->job_id);
			job_ptr->time_limit = orig_time_limit;
			continue;
		}
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;
		resv_end = find_resv_end(start_res);
		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		bit_and(avail_bitmap, non_cg_bitmap);
		/* Walk the node_space list, AND-ing in the availability of
		 * every window overlapping [start_res, end_time]; remember
		 * the next window boundary as a possible retry time */
		for (j = 0; ; ) {
			if ((node_space[j].end_time > start_res) &&
			     node_space[j].next && (later_start == 0))
				later_start = node_space[j].end_time;
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
		if (resv_end && (++resv_end < window_end) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}
		/* Temporarily invert the job's excluded-node bitmap to
		 * mask those nodes out, then restore it */
		if (job_ptr->details->exc_node_bitmap) {
			bit_not(job_ptr->details->exc_node_bitmap);
			bit_and(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
			bit_not(job_ptr->details->exc_node_bitmap);
		}

		/* Test if insufficient nodes remain OR
		 *	required nodes missing OR
		 *	nodes lack features OR
		 *	no change since previously tested nodes (only changes
		 *	in other partition nodes) */
		if ((bit_set_count(avail_bitmap) < min_nodes) ||
		    ((job_ptr->details->req_node_bitmap) &&
		     (!bit_super_set(job_ptr->details->req_node_bitmap,
				     avail_bitmap))) ||
		    (job_req_node_filter(job_ptr, avail_bitmap))) {
			if (later_start) {
				job_ptr->start_time = 0;
				goto TRY_LATER;
			}
			/* Job can not start until too far in the future */
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = sched_start + backfill_window;
			continue;
		}

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		/* this is the time consuming operation */
		debug2("backfill: entering _try_sched for job %u.",
		       job_ptr->job_id);

		if (!already_counted) {
			slurmctld_diag_stats.bf_last_depth_try++;
			already_counted = true;
		}

		if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
			_dump_job_test(job_ptr, avail_bitmap, start_res);
		j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);

		now = time(NULL);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = 0;
			continue;	/* not runable */
		}

		if (start_res > job_ptr->start_time) {
			job_ptr->start_time = start_res;
			last_job_update = now;
		}
		if (job_ptr->start_time <= now) { /* Can start now */
			uint32_t save_time_limit = job_ptr->time_limit;
			uint32_t hard_limit;
			bool reset_time = false;
			/* NOTE(review): this rc intentionally shadows the
			 * function-level rc (which carries the
			 * state-changed flag); easy to misread */
			int rc = _start_job(job_ptr, resv_bitmap);
			/* Restore/raise the job's time limit, which may
			 * have been lowered for the backfill test above */
			if (qos_ptr &&
			    (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) {
				if (orig_time_limit == NO_VAL) {
					acct_policy_alter_job(
						job_ptr, comp_time_limit);
					job_ptr->time_limit =
						comp_time_limit;
				} else {
					acct_policy_alter_job(
						job_ptr, orig_time_limit);
					job_ptr->time_limit =
						orig_time_limit;
				}
			} else if ((rc == SLURM_SUCCESS) &&
				   job_ptr->time_min) {
				/* Set time limit as high as possible */
				acct_policy_alter_job(job_ptr,
						      comp_time_limit);
				job_ptr->time_limit = comp_time_limit;
				reset_time = true;
			} else if (orig_time_limit == NO_VAL) {
				acct_policy_alter_job(job_ptr,
						      comp_time_limit);
				job_ptr->time_limit = comp_time_limit;
			} else {
				acct_policy_alter_job(job_ptr,
						      orig_time_limit);
				job_ptr->time_limit = orig_time_limit;
			}
			if (job_ptr->time_limit == INFINITE)
				hard_limit = 365 * 24 * 60; /* one year */
			else
				hard_limit = job_ptr->time_limit;
			job_ptr->end_time = job_ptr->start_time +
					    (hard_limit * 60);
			if (reset_time) {
				_reset_job_time_limit(job_ptr, now,
						      node_space);
				time_limit = job_ptr->time_limit;
			}

			if (rc == ESLURM_ACCOUNTING_POLICY) {
				/* Unknown future start time, just skip job */
				job_ptr->start_time = 0;
				continue;
			} else if (rc != SLURM_SUCCESS) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: planned start of job %u"
					     " failed: %s", job_ptr->job_id,
					     slurm_strerror(rc));
				}
				/* Drop through and reserve these resources.
				 * Likely due to state changes during sleep.
				 * Make best-effort based upon original state */
				job_ptr->time_limit = orig_time_limit;
				later_start = 0;
			} else {
				/* Started this job, move to next one */
				reject_array_job_id = 0;
				reject_array_part = NULL;

				/* Update the database if job time limit
				 * changed and move to next job */
				if (save_time_limit != job_ptr->time_limit)
					jobacct_storage_g_job_start(
						acct_db_conn, job_ptr);
				job_start_cnt++;
				if (max_backfill_jobs_start &&
				    (job_start_cnt >=
				     max_backfill_jobs_start)) {
					if (debug_flags &
					    DEBUG_FLAG_BACKFILL) {
						info("backfill: bf_max_job_start"
						     " limit of %d reached",
						     max_backfill_jobs_start);
					}
					break;
				}
				continue;
			}
		} else {
			job_ptr->time_limit = orig_time_limit;
		}

		/* Round the planned reservation window down to the
		 * configured backfill time resolution */
		start_time = job_ptr->start_time;
		end_reserve = job_ptr->start_time + (time_limit * 60);
		start_time = (start_time / backfill_resolution) *
			     backfill_resolution;
		end_reserve = (end_reserve / backfill_resolution) *
			      backfill_resolution;

		if (later_start && (start_time > later_start)) {
			/* Try later when some nodes currently reserved for
			 * pending jobs are free */
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		if (job_ptr->start_time > (sched_start + backfill_window)) {
			/* Starts too far in the future to worry about */
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				_dump_job_sched(job_ptr, end_reserve,
						avail_bitmap);
			continue;
		}

		if (node_space_recs >= max_backfill_job_cnt) {
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				info("backfill: table size limit of %u "
				     "reached", max_backfill_job_cnt);
			}
			break;
		}

		if ((job_ptr->start_time > now) &&
		    _test_resv_overlap(node_space, avail_bitmap,
				       start_time, end_reserve)) {
			/* This job overlaps with an existing reservation for
			 * job to be backfill scheduled, which the sched
			 * plugin does not know about. Try again later. */
			later_start = job_ptr->start_time;
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		/*
		 * Add reservation to scheduling table if appropriate
		 */
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_job_sched(job_ptr, end_reserve, avail_bitmap);
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
			continue;
		reject_array_job_id = 0;
		reject_array_part = NULL;
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_job_sched(job_ptr, end_reserve, avail_bitmap);
		xfree(job_ptr->sched_nodes);
		job_ptr->sched_nodes = bitmap2node_name(avail_bitmap);
		/* _add_reservation takes the set of BUSY nodes */
		bit_not(avail_bitmap);
		_add_reservation(start_time, end_reserve,
				 avail_bitmap, node_space,
				 &node_space_recs);
		if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
			_dump_node_space_table(node_space);
	}

	/* Cleanup: release all per-cycle allocations and bitmaps */
	xfree(bf_part_jobs);
	xfree(bf_part_ptr);
	xfree(uid);
	xfree(njobs);
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);
	FREE_NULL_BITMAP(non_cg_bitmap);
	for (i = 0; ; ) {
		FREE_NULL_BITMAP(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(node_space);
	list_destroy(job_queue);
	gettimeofday(&bf_time2, NULL);
	_do_diag_stats(&bf_time1, &bf_time2, yield_sleep);
	if (debug_flags & DEBUG_FLAG_BACKFILL) {
		END_TIMER;
		info("backfill: completed testing %u(%d) jobs, %s",
		     slurmctld_diag_stats.bf_last_depth,
		     job_test_count, TIME_STR);
	}
	return rc;
}