/* Create a reservation for a job in the future */
static void _add_reservation(uint32_t start_time, uint32_t end_reserve,
			     bitstr_t *res_bitmap,
			     node_space_map_t *node_space,
			     int *node_space_recs)
{
	bool placed = false;
	int i, j;

	for (j = 0; ; ) {
		if (node_space[j].end_time > start_time) {
			/* insert start entry record */
			i = *node_space_recs;
			node_space[i].begin_time = start_time;
			node_space[i].end_time = node_space[j].end_time;
			node_space[j].end_time = start_time;
			node_space[i].avail_bitmap =
				bit_copy(node_space[j].avail_bitmap);
			node_space[i].next = node_space[j].next;
			node_space[j].next = i;
			(*node_space_recs)++;
			placed = true;
		}
		if (node_space[j].end_time == start_time) {
			/* no need to insert new start entry record */
			placed = true;
		}
		if (placed == true) {
			j = node_space[j].next;
			if (j && (end_reserve < node_space[j].end_time)) {
				/* insert end entry record */
				i = *node_space_recs;
				node_space[i].begin_time = end_reserve;
				node_space[i].end_time =
					node_space[j].end_time;
				node_space[j].end_time = end_reserve;
				node_space[i].avail_bitmap =
					bit_copy(node_space[j].avail_bitmap);
				node_space[i].next = node_space[j].next;
				node_space[j].next = i;
				(*node_space_recs)++;
			}
			break;
		}
		if ((j = node_space[j].next) == 0)
			break;
	}

	for (j = 0; ; ) {
		if ((node_space[j].begin_time >= start_time) &&
		    (node_space[j].end_time <= end_reserve))
			bit_and(node_space[j].avail_bitmap, res_bitmap);
		if ((node_space[j].begin_time >= end_reserve) ||
		    ((j = node_space[j].next) == 0))
			break;
	}
}
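The node_space table above is a singly linked list threaded through an array: record 0 is always the head, and each record's next field holds the array index of the following time window, with 0 terminating the chain. A minimal sketch of that traversal in plain C; the struct and field names are illustrative stand-ins, not SLURM's node_space_map_t:

#include <stdio.h>

struct node_space_rec {
	unsigned begin_time;
	unsigned end_time;
	int next;	/* index of the next record; 0 ends the chain */
};

int main(void)
{
	/* three time windows chained through the array */
	struct node_space_rec ns[3] = {
		{   0, 100, 1 },
		{ 100, 200, 2 },
		{ 200, 300, 0 },
	};
	int j;

	for (j = 0; ; ) {
		printf("window [%u, %u)\n", ns[j].begin_time, ns[j].end_time);
		if ((j = ns[j].next) == 0)
			break;	/* same termination test as the code above */
	}
	return 0;
}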
/* Try to start the job on any non-reserved nodes */
static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap)
{
	int rc;
	bitstr_t *orig_exc_nodes = NULL;
	static uint32_t fail_jobid = 0;

	if (job_ptr->details->exc_node_bitmap) {
		orig_exc_nodes = bit_copy(job_ptr->details->exc_node_bitmap);
		bit_or(job_ptr->details->exc_node_bitmap, resv_bitmap);
	} else
		job_ptr->details->exc_node_bitmap = bit_copy(resv_bitmap);

	rc = select_nodes(job_ptr, false, NULL);
	if (job_ptr->details) { /* select_nodes() might cancel the job! */
		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
		job_ptr->details->exc_node_bitmap = orig_exc_nodes;
	} else
		FREE_NULL_BITMAP(orig_exc_nodes);
	if (rc == SLURM_SUCCESS) {
		/* job initiated */
		last_job_update = time(NULL);
		info("backfill: Started JobId=%u on %s",
		     job_ptr->job_id, job_ptr->nodes);
		if (job_ptr->batch_flag == 0)
			srun_allocate(job_ptr->job_id);
		else if ((job_ptr->details == NULL) ||
			 (job_ptr->details->prolog_running == 0))
			launch_job(job_ptr);
		slurmctld_diag_stats.backfilled_jobs++;
		slurmctld_diag_stats.last_backfilled_jobs++;
		if (debug_flags & DEBUG_FLAG_BACKFILL) {
			info("backfill: Jobs backfilled since boot: %u",
			     slurmctld_diag_stats.backfilled_jobs);
		}
	} else if ((job_ptr->job_id != fail_jobid) &&
		   (rc != ESLURM_ACCOUNTING_POLICY)) {
		char *node_list;
		bit_not(resv_bitmap);
		node_list = bitmap2node_name(resv_bitmap);
		/* This happens when a job has sharing disabled and
		 * a selected node is still completing some job,
		 * which should be a temporary situation. */
		verbose("backfill: Failed to start JobId=%u on %s: %s",
			job_ptr->job_id, node_list, slurm_strerror(rc));
		xfree(node_list);
		fail_jobid = job_ptr->job_id;
	} else {
		debug3("backfill: Failed to start JobId=%u: %s",
		       job_ptr->job_id, slurm_strerror(rc));
	}

	return rc;
}
/* Distribute MPS Count to records on original list */
static void _distribute_count(List gres_conf_list, List gpu_conf_list,
			      uint64_t count)
{
	ListIterator gpu_itr;
	gres_slurmd_conf_t *gpu_record, *mps_record;
	int rem_gpus = list_count(gpu_conf_list);

	gpu_itr = list_iterator_create(gpu_conf_list);
	while ((gpu_record = list_next(gpu_itr))) {
		mps_record = xmalloc(sizeof(gres_slurmd_conf_t));
		mps_record->config_flags = gpu_record->config_flags;
		mps_record->count = count / rem_gpus;
		count -= mps_record->count;
		rem_gpus--;
		mps_record->cpu_cnt = gpu_record->cpu_cnt;
		mps_record->cpus = xstrdup(gpu_record->cpus);
		if (gpu_record->cpus_bitmap) {
			mps_record->cpus_bitmap =
				bit_copy(gpu_record->cpus_bitmap);
		}
		mps_record->file = xstrdup(gpu_record->file);
		mps_record->name = xstrdup("mps");
		mps_record->plugin_id = gres_plugin_build_id("mps");
		mps_record->type_name = xstrdup(gpu_record->type_name);
		list_append(gres_conf_list, mps_record);
		list_append(gres_conf_list, gpu_record);
		(void) list_remove(gpu_itr);
	}
	list_iterator_destroy(gpu_itr);
}
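The count / rem_gpus arithmetic above spreads a total across the remaining records so that the shares always sum to the original count and differ by at most one. A small self-contained demonstration of just that arithmetic (the values are made up):

#include <stdio.h>

int main(void)
{
	unsigned long long count = 10;	/* total MPS count to distribute */
	int rem_gpus = 3;		/* records still to be filled */

	while (rem_gpus > 0) {
		unsigned long long share = count / rem_gpus;
		count -= share;
		rem_gpus--;
		printf("share = %llu (left to distribute: %llu)\n",
		       share, count);	/* shares come out 3, 3, 4 */
	}
	return 0;
}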
/* Return 1 if job fits in this row, else return 0 */
static int _job_fits_in_active_row(struct job_record *job_ptr,
				   struct gs_part *p_ptr)
{
	job_resources_t *job_res = job_ptr->job_resrcs;
	int count;
	bitstr_t *job_map;
	uint16_t job_gr_type;

	if ((p_ptr->active_resmap == NULL) || (p_ptr->jobs_active == 0))
		return 1;

	job_gr_type = _get_part_gr_type(job_ptr->part_ptr);
	if ((job_gr_type == GS_CPU2) || (job_gr_type == GS_CORE) ||
	    (job_gr_type == GS_SOCKET)) {
		return job_fits_into_cores(job_res, p_ptr->active_resmap,
					   gs_bits_per_node);
	}

	/* job_gr_type == GS_NODE || job_gr_type == GS_CPU */
	job_map = bit_copy(job_res->node_bitmap);
	bit_and(job_map, p_ptr->active_resmap);
	/* any set bits indicate contention for the same resource */
	count = bit_set_count(job_map);
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: _job_fits_in_active_row: %d bits conflict", count);
	FREE_NULL_BITMAP(job_map);
	if (count == 0)
		return 1;
	if (job_gr_type == GS_CPU) {
		/* For GS_CPU we check the CPU arrays */
		return _can_cpus_fit(job_ptr, p_ptr);
	}

	return 0;
}
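The conflict test above is a common bitmap idiom: copy the job's map, AND it with the busy map, then count the surviving bits. The same logic with a toy 64-bit word standing in for bitstr_t (plain C, no SLURM dependencies):

#include <stdio.h>
#include <stdint.h>

/* count set bits (portable stand-in for bit_set_count()) */
static int popcount64(uint64_t v)
{
	int n = 0;
	for (; v; v &= v - 1)
		n++;
	return n;
}

int main(void)
{
	uint64_t job_map    = 0xf0;	/* nodes 4-7 wanted by the job  */
	uint64_t active_map = 0xc0;	/* nodes 6-7 already active     */
	uint64_t overlap = job_map & active_map;	/* bit_and()    */
	int count = popcount64(overlap);

	if (count == 0)
		printf("job fits in this row\n");
	else
		printf("%d node(s) conflict\n", count);	/* prints 2 */
	return 0;
}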
int powercap_get_job_optimal_cpufreq(uint32_t powercap, int *allowed_freqs)
{
	uint32_t cur_max_watts = 0, *tmp_max_watts_dvfs = NULL;
	int k = 1;
	bitstr_t *tmp_bitmap = NULL;

	if (!_powercap_enabled())
		return 0;

	tmp_max_watts_dvfs = xmalloc(sizeof(uint32_t) *
				     (allowed_freqs[0] + 1));
	tmp_bitmap = bit_copy(idle_node_bitmap);
	bit_not(tmp_bitmap);

	cur_max_watts = powercap_get_node_bitmap_maxwatts_dvfs(tmp_bitmap,
				idle_node_bitmap, tmp_max_watts_dvfs,
				allowed_freqs, 0);
	FREE_NULL_BITMAP(tmp_bitmap);

	if (cur_max_watts > powercap) {
		/* check the bound before reading tmp_max_watts_dvfs[k] */
		while ((k < allowed_freqs[0] + 1) &&
		       (tmp_max_watts_dvfs[k] > powercap)) {
			k++;
		}
		if (k == allowed_freqs[0] + 1)
			k--;
	} else
		k = 1;

	xfree(tmp_max_watts_dvfs);
	return k;
}
/*
 * Build record used to set environment variables as appropriate for a job's
 * prolog or epilog based GRES allocated to the job.
 */
extern gres_epilog_info_t *epilog_build_env(gres_job_state_t *gres_job_ptr)
{
	int i;
	gres_epilog_info_t *epilog_info;

	epilog_info = xmalloc(sizeof(gres_epilog_info_t));
	epilog_info->node_cnt = gres_job_ptr->node_cnt;
	epilog_info->gres_bit_alloc = xcalloc(epilog_info->node_cnt,
					      sizeof(bitstr_t *));
	epilog_info->gres_cnt_node_alloc = xcalloc(epilog_info->node_cnt,
						   sizeof(uint64_t));
	for (i = 0; i < epilog_info->node_cnt; i++) {
		if (gres_job_ptr->gres_bit_alloc &&
		    gres_job_ptr->gres_bit_alloc[i]) {
			epilog_info->gres_bit_alloc[i] =
				bit_copy(gres_job_ptr->gres_bit_alloc[i]);
		}
		/* copy the per-node allocated count when that array exists */
		if (gres_job_ptr->gres_cnt_node_alloc) {
			epilog_info->gres_cnt_node_alloc[i] =
				gres_job_ptr->gres_cnt_node_alloc[i];
		}
	}

	return epilog_info;
}
static void _add_config_feature(char *feature, bitstr_t *node_bitmap)
{
	struct features_record *feature_ptr;
	ListIterator feature_iter;
	bool match = false;

	/* If feature already exists in feature_list, just update the bitmap */
	feature_iter = list_iterator_create(feature_list);
	while ((feature_ptr = (struct features_record *)
			list_next(feature_iter))) {
		if (strcmp(feature, feature_ptr->name))
			continue;
		bit_or(feature_ptr->node_bitmap, node_bitmap);
		match = true;
		break;
	}
	list_iterator_destroy(feature_iter);

	if (!match) {	/* Need to create new feature_list record */
		feature_ptr = xmalloc(sizeof(struct features_record));
		feature_ptr->magic = FEATURE_MAGIC;
		feature_ptr->name = xstrdup(feature);
		feature_ptr->node_bitmap = bit_copy(node_bitmap);
		list_append(feature_list, feature_ptr);
	}
}
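This is a find-or-create pattern keyed by name: merge into an existing record with bit_or(), or append a fresh record built with bit_copy(). A self-contained sketch of the same pattern with a fixed-size array and a 64-bit word in place of SLURM's List and bitstr_t (all names here are illustrative):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define MAX_FEATURES 8

struct feature_rec {
	char name[32];
	uint64_t node_bitmap;	/* toy 64-node bitmap */
};

static struct feature_rec features[MAX_FEATURES];
static int n_features;

/* merge node_bitmap into an existing record, or append a new one */
static void add_config_feature(const char *name, uint64_t node_bitmap)
{
	int i;

	for (i = 0; i < n_features; i++) {
		if (strcmp(features[i].name, name))
			continue;
		features[i].node_bitmap |= node_bitmap;	/* bit_or()   */
		return;
	}
	if (n_features < MAX_FEATURES) {
		snprintf(features[n_features].name,
			 sizeof(features[n_features].name), "%s", name);
		features[n_features].node_bitmap = node_bitmap; /* bit_copy */
		n_features++;
	}
}

int main(void)
{
	add_config_feature("gpu", 0x0fULL);
	add_config_feature("gpu", 0xf0ULL);	/* merges into one record */
	printf("%s -> %#llx\n", features[0].name,
	       (unsigned long long)features[0].node_bitmap);
	return 0;
}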
static unsigned buffer_write(struct mpsse_ctx *ctx, const uint8_t *out,
			     unsigned out_offset, unsigned bit_count)
{
	DEBUG_IO("%d bits", bit_count);
	assert(ctx->write_count + DIV_ROUND_UP(bit_count, 8) <=
	       ctx->write_size);
	bit_copy(ctx->write_buffer + ctx->write_count, 0, out, out_offset,
		 bit_count);
	ctx->write_count += DIV_ROUND_UP(bit_count, 8);
	return bit_count;
}
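Note that this bit_copy() is OpenOCD's offset-based buffer copy (destination, destination bit offset, source, source bit offset, bit count), not the SLURM bit_copy() used elsewhere in this section. The DIV_ROUND_UP() rounding that sizes the byte reservation for a partial final byte can be checked in isolation:

#include <stdio.h>

/* round-up integer division, as used to size partial-byte writes */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned bit_count;

	for (bit_count = 1; bit_count <= 17; bit_count += 8)
		printf("%2u bits -> %u byte(s)\n",
		       bit_count, DIV_ROUND_UP(bit_count, 8));
	return 0;	/* prints 1, 2 and 3 bytes respectively */
}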
/*
 * Convert all GPU records to new entries in a list where each File is a
 * unique device (i.e. convert a record with "File=nvidia[0-3]" into 4 separate
 * records).
 */
static List _build_gpu_list(List gres_list)
{
	ListIterator itr;
	gres_slurmd_conf_t *gres_record, *gpu_record;
	List gpu_list;
	hostlist_t hl;
	char *f_name;
	bool log_fname = true;

	if (gres_list == NULL)
		return NULL;

	gpu_list = list_create(_delete_gres_list);
	itr = list_iterator_create(gres_list);
	while ((gres_record = list_next(itr))) {
		if (xstrcmp(gres_record->name, "gpu"))
			continue;
		if (!gres_record->file) {
			if (log_fname) {
				error("%s: GPU configuration lacks \"File\" specification",
				      plugin_name);
				log_fname = false;
			}
			continue;
		}
		hl = hostlist_create(gres_record->file);
		while ((f_name = hostlist_shift(hl))) {
			gpu_record = xmalloc(sizeof(gres_slurmd_conf_t));
			gpu_record->config_flags = gres_record->config_flags;
			if (gres_record->type_name)
				gpu_record->config_flags |= GRES_CONF_HAS_TYPE;
			gpu_record->count = 1;
			gpu_record->cpu_cnt = gres_record->cpu_cnt;
			gpu_record->cpus = xstrdup(gres_record->cpus);
			if (gres_record->cpus_bitmap) {
				gpu_record->cpus_bitmap =
					bit_copy(gres_record->cpus_bitmap);
			}
			gpu_record->file = xstrdup(f_name);
			gpu_record->links = xstrdup(gres_record->links);
			gpu_record->name = xstrdup(gres_record->name);
			gpu_record->plugin_id = gres_record->plugin_id;
			gpu_record->type_name =
				xstrdup(gres_record->type_name);
			list_append(gpu_list, gpu_record);
			free(f_name);
		}
		hostlist_destroy(hl);
		(void) list_delete_item(itr);
	}
	list_iterator_destroy(itr);

	return gpu_list;
}
uint32_t powercap_get_node_bitmap_maxwatts(bitstr_t *idle_bitmap)
{
	uint32_t max_watts = 0, val;
	struct node_record *node_ptr;
	int i;
	bitstr_t *tmp_bitmap = NULL;

	if (!_powercap_enabled())
		return 0;
	if (!power_layout_ready())
		return 0;

	/* if no input bitmap, consider the current idle nodes
	 * bitmap as the input bitmap tagging nodes to consider
	 * as idle while computing the max watts of the cluster */
	if (idle_bitmap == NULL) {
		tmp_bitmap = bit_copy(idle_node_bitmap);
		idle_bitmap = tmp_bitmap;
	}

	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
	     i++, node_ptr++) {
		/* non reserved node, evaluate the different cases */
		if (bit_test(idle_bitmap, i)) {
			/* idle nodes, 2 cases : power save or not */
			if (bit_test(power_node_bitmap, i)) {
				layouts_entity_pullget_kv(L_NAME,
						node_ptr->name, L_NODE_SAVE,
						&val, L_T_UINT32);
			} else {
				layouts_entity_pullget_kv(L_NAME,
						node_ptr->name, L_NODE_IDLE,
						&val, L_T_UINT32);
			}
		} else {
			/* non idle nodes, 2 cases : down or not */
			if (!bit_test(up_node_bitmap, i)) {
				layouts_entity_pullget_kv(L_NAME,
						node_ptr->name, L_NODE_DOWN,
						&val, L_T_UINT32);
			} else {
				layouts_entity_pullget_kv(L_NAME,
						node_ptr->name, L_NODE_MAX,
						&val, L_T_UINT32);
			}
		}
		max_watts += val;
	}

	if (tmp_bitmap)
		bit_free(tmp_bitmap);

	return max_watts;
}
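The NULL-handling at the top is a pattern worth noting: when the caller passes no bitmap, the function substitutes a private copy of a default (via bit_copy()), remembers that copy in tmp_bitmap, and frees only that copy on exit so a caller-supplied bitmap is never touched. A plain-C sketch of the same ownership pattern with arrays in place of bitmaps (names are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* If values is NULL, work on a private copy of defaults and free it on
 * exit; a caller-supplied array is never modified or freed. */
static int sum_default_or_given(const int *values, const int *defaults,
				size_t n)
{
	int *tmp = NULL;	/* our copy, if we had to make one */
	int total = 0;
	size_t i;

	if (values == NULL) {
		tmp = malloc(n * sizeof(int));	/* bit_copy() analogue */
		if (!tmp)
			return -1;
		memcpy(tmp, defaults, n * sizeof(int));
		values = tmp;
	}
	for (i = 0; i < n; i++)
		total += values[i];
	free(tmp);				/* bit_free() analogue */
	return total;
}

int main(void)
{
	int defaults[] = { 100, 200, 300 };

	printf("%d\n", sum_default_or_given(NULL, defaults, 3));
	return 0;
}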
/* copy a select job credential
 * IN jobinfo - the select job credential to be copied
 * RET        - the copy or NULL on failure
 * NOTE: returned value must be freed using free_jobinfo
 */
extern select_jobinfo_t *copy_select_jobinfo(select_jobinfo_t *jobinfo)
{
	struct select_jobinfo *rc = NULL;

	if (jobinfo == NULL)
		;
	else if (jobinfo->magic != JOBINFO_MAGIC)
		error("copy_jobinfo: jobinfo magic bad");
	else {
		rc = xmalloc(sizeof(struct select_jobinfo));
		rc->dim_cnt = jobinfo->dim_cnt;
		memcpy(rc->geometry, jobinfo->geometry, sizeof(rc->geometry));
		memcpy(rc->conn_type, jobinfo->conn_type,
		       sizeof(rc->conn_type));
		memcpy(rc->start_loc, jobinfo->start_loc,
		       sizeof(rc->start_loc));
		rc->reboot = jobinfo->reboot;
		rc->rotate = jobinfo->rotate;
		rc->bg_record = jobinfo->bg_record;
		rc->bg_block_id = xstrdup(jobinfo->bg_block_id);
		rc->magic = JOBINFO_MAGIC;
		rc->mp_str = xstrdup(jobinfo->mp_str);
		rc->ionode_str = xstrdup(jobinfo->ionode_str);
		rc->block_cnode_cnt = jobinfo->block_cnode_cnt;
		rc->cleaning = jobinfo->cleaning;
		rc->cnode_cnt = jobinfo->cnode_cnt;
		rc->altered = jobinfo->altered;
		rc->blrtsimage = xstrdup(jobinfo->blrtsimage);
		rc->linuximage = xstrdup(jobinfo->linuximage);
		rc->mloaderimage = xstrdup(jobinfo->mloaderimage);
		rc->ramdiskimage = xstrdup(jobinfo->ramdiskimage);
		if (jobinfo->units_avail)
			rc->units_avail = bit_copy(jobinfo->units_avail);
		if (jobinfo->units_used)
			rc->units_used = bit_copy(jobinfo->units_used);
		rc->user_name = xstrdup(jobinfo->user_name);
	}

	return rc;
}
/* Clear active_feature_list,
 * then copy avail_feature_list into active_feature_list */
static void _copy_feature_list(void)
{
	node_feature_t *active_feature_ptr, *avail_feature_ptr;
	ListIterator feature_iter;

	(void) list_delete_all(active_feature_list, &_list_find_feature,
			       NULL);
	feature_iter = list_iterator_create(avail_feature_list);
	while ((avail_feature_ptr = (node_feature_t *)
			list_next(feature_iter))) {
		active_feature_ptr = xmalloc(sizeof(node_feature_t));
		active_feature_ptr->magic = FEATURE_MAGIC;
		active_feature_ptr->name = xstrdup(avail_feature_ptr->name);
		active_feature_ptr->node_bitmap =
			bit_copy(avail_feature_ptr->node_bitmap);
		list_append(active_feature_list, active_feature_ptr);
	}
	list_iterator_destroy(feature_iter);
}
static void _build_select_struct(struct job_record *job_ptr,
				 bitstr_t *bitmap, uint32_t node_cnt)
{
	int i;
	uint32_t total_cpus = 0;
	job_resources_t *job_resrcs_ptr;

	xassert(job_ptr);

	if (job_ptr->job_resrcs) {
		error("select_p_job_test: already have select_job");
		free_job_resources(&job_ptr->job_resrcs);
	}

	job_ptr->job_resrcs = job_resrcs_ptr = create_job_resources();
	job_resrcs_ptr->cpu_array_reps = xmalloc(sizeof(uint32_t));
	job_resrcs_ptr->cpu_array_value = xmalloc(sizeof(uint16_t));
	job_resrcs_ptr->cpus = xmalloc(sizeof(uint16_t) * node_cnt);
	job_resrcs_ptr->cpus_used = xmalloc(sizeof(uint16_t) * node_cnt);
	/* job_resrcs_ptr->nhosts = node_cnt; */
	job_resrcs_ptr->nhosts = bit_set_count(bitmap);
	job_resrcs_ptr->ncpus = job_ptr->details->min_cpus;
	job_resrcs_ptr->node_bitmap = bit_copy(bitmap);
	job_resrcs_ptr->nodes = bitmap2node_name(bitmap);
	if (job_resrcs_ptr->node_bitmap == NULL)
		fatal("bit_copy malloc failure");

	job_resrcs_ptr->cpu_array_cnt = 1;
	if (job_ptr->details->min_cpus < bg_conf->cpus_per_mp)
		job_resrcs_ptr->cpu_array_value[0] =
			job_ptr->details->min_cpus;
	else
		job_resrcs_ptr->cpu_array_value[0] = bg_conf->cpus_per_mp;
	job_resrcs_ptr->cpu_array_reps[0] = node_cnt;
	total_cpus = bg_conf->cpu_ratio * node_cnt;

	for (i = 0; i < node_cnt; i++)
		job_resrcs_ptr->cpus[i] = bg_conf->cpu_ratio;

	if (job_resrcs_ptr->ncpus != total_cpus) {
		error("select_p_job_test: ncpus mismatch %u != %u",
		      job_resrcs_ptr->ncpus, total_cpus);
	}
}
static int _attempt_backfill(void)
{
	DEF_TIMERS;
	bool filter_root = false;
	List job_queue;
	job_queue_rec_t *job_queue_rec;
	slurmdb_qos_rec_t *qos_ptr = NULL;
	int i, j, node_space_recs;
	struct job_record *job_ptr;
	struct part_record *part_ptr;
	uint32_t end_time, end_reserve;
	uint32_t time_limit, comp_time_limit, orig_time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	time_t now, sched_start, later_start, start_res, resv_end;
	node_space_map_t *node_space;
	struct timeval bf_time1, bf_time2;
	int sched_timeout = 2, yield_sleep = 1;
	int rc = 0;
	int job_test_count = 0;
	uint32_t *uid = NULL, nuser = 0;
	uint16_t *njobs = NULL;
	bool already_counted;
	uint32_t reject_array_job_id = 0;

#ifdef HAVE_CRAY
	/*
	 * Run a Basil Inventory immediately before setting up the schedule
	 * plan, to avoid race conditions caused by ALPS node state change.
	 * Needs to be done with the node-state lock taken.
	 */
	START_TIMER;
	if (select_g_reconfigure()) {
		debug4("backfill: not scheduling due to ALPS");
		return SLURM_SUCCESS;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: ALPS inventory completed, %s", TIME_STR);

	/* The Basil inventory can take a long time to complete. Process
	 * pending RPCs before starting the backfill scheduling logic */
	_yield_locks(1);
#endif

	START_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: beginning");
	sched_start = now = time(NULL);
	if (slurm_get_root_filter())
		filter_root = true;

	job_queue = build_job_queue(true);
	if (list_count(job_queue) == 0) {
		debug("backfill: no jobs to backfill");
		list_destroy(job_queue);
		return 0;
	}

	gettimeofday(&bf_time1, NULL);

	slurmctld_diag_stats.bf_queue_len = list_count(job_queue);
	slurmctld_diag_stats.bf_queue_len_sum +=
		slurmctld_diag_stats.bf_queue_len;
	slurmctld_diag_stats.bf_last_depth = 0;
	slurmctld_diag_stats.bf_last_depth_try = 0;
	slurmctld_diag_stats.bf_when_last_cycle = now;
	bf_last_yields = 0;
	slurmctld_diag_stats.bf_active = 1;

	node_space = xmalloc(sizeof(node_space_map_t) *
			     (max_backfill_job_cnt + 3));
	node_space[0].begin_time = sched_start;
	node_space[0].end_time = sched_start + backfill_window;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		_dump_node_space_table(node_space);

	if (max_backfill_job_per_user) {
		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
	}

	while ((job_queue_rec = (job_queue_rec_t *)
			list_pop_bottom(job_queue, sort_job_queue2))) {
		job_ptr = job_queue_rec->job_ptr;
		orig_time_limit = job_ptr->time_limit;

		if ((time(NULL) - sched_start) >= sched_timeout) {
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if (_yield_locks(yield_sleep) && !backfill_continue) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			job_test_count = 0;
			START_TIMER;
		}

		part_ptr = job_queue_rec->part_ptr;
		job_test_count++;

		xfree(job_queue_rec);
		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* started in other partition */
		if (!avail_front_end(job_ptr))
			continue;	/* No available frontend for this job */
		if (job_ptr->array_task_id != (uint16_t) NO_VAL) {
			if (reject_array_job_id == job_ptr->array_job_id)
				continue;  /* already rejected array element */
			/* assume reject whole array for now, clear if OK */
			reject_array_job_id = job_ptr->array_job_id;
		}
		job_ptr->part_ptr = part_ptr;

		if (debug_flags & DEBUG_FLAG_BACKFILL)
			info("backfill test for job %u", job_ptr->job_id);

		slurmctld_diag_stats.bf_last_depth++;
		already_counted = false;

		if (max_backfill_job_per_user) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->user_id == uid[j]) {
					njobs[j]++;
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: user %u: "
						      "#jobs %u",
						      uid[j], njobs[j]);
					break;
				}
			}
			if (j == nuser) { /* user not found */
				if (nuser < BF_MAX_USERS) {
					uid[j] = job_ptr->user_id;
					njobs[j] = 1;
					nuser++;
				} else {
					error("backfill: too many users in "
					      "queue. Consider increasing "
					      "BF_MAX_USERS");
				}
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					debug2("backfill: found new user %u. "
					       "Total #users now %u",
					       job_ptr->user_id, nuser);
			} else {
				if (njobs[j] > max_backfill_job_per_user) {
					/* skip job */
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: have already "
						      "checked %u jobs for "
						      "user %u; skipping "
						      "job %u",
						      max_backfill_job_per_user,
						      job_ptr->user_id,
						      job_ptr->job_id);
					continue;
				}
			}
		}

		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
		    (part_ptr->node_bitmap == NULL))
			continue;
		if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)
			continue;

		if ((!job_independent(job_ptr, 0)) ||
		    (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS))
			continue;

		/* Determine minimum and maximum node counts */
		min_nodes = MAX(job_ptr->details->min_nodes,
				part_ptr->min_nodes);
		if (job_ptr->details->max_nodes == 0)
			max_nodes = part_ptr->max_nodes;
		else
			max_nodes = MIN(job_ptr->details->max_nodes,
					part_ptr->max_nodes);
		max_nodes = MIN(max_nodes, 500000);	/* prevent overflows */
		if (job_ptr->details->max_nodes)
			req_nodes = max_nodes;
		else
			req_nodes = min_nodes;
		if (min_nodes > max_nodes) {
			/* job's min_nodes exceeds partition's max_nodes */
			continue;
		}

		/* Determine job's expected completion time */
		if (job_ptr->time_limit == NO_VAL) {
			if (part_ptr->max_time == INFINITE)
				time_limit = 365 * 24 * 60;	/* one year */
			else
				time_limit = part_ptr->max_time;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_ptr->max_time);
		}
		comp_time_limit = time_limit;
		qos_ptr = job_ptr->qos_ptr;
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) &&
		    slurm_get_preempt_mode())
			time_limit = job_ptr->time_limit = 1;
		else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
			time_limit = job_ptr->time_limit = job_ptr->time_min;

		/* Determine impact of any resource reservations */
		later_start = now;
TRY_LATER:
		if ((time(NULL) - sched_start) >= sched_timeout) {
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks 2 "
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if (_yield_locks(yield_sleep) && !backfill_continue) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			job_test_count = 1;
			START_TIMER;
		}

		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		start_res = later_start;
		later_start = 0;
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				  &exc_core_bitmap);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			continue;
		}
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;
		resv_end = find_resv_end(start_res);

		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		for (j = 0; ; ) {
			if ((node_space[j].end_time > start_res) &&
			    node_space[j].next && (later_start == 0))
				later_start = node_space[j].end_time;
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
		if ((resv_end++) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}

		if (job_ptr->details->exc_node_bitmap) {
			bit_not(job_ptr->details->exc_node_bitmap);
			bit_and(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
			bit_not(job_ptr->details->exc_node_bitmap);
		}

		/* Test if insufficient nodes remain OR
		 * required nodes missing OR
		 * nodes lack features */
		if ((bit_set_count(avail_bitmap) < min_nodes) ||
		    ((job_ptr->details->req_node_bitmap) &&
		     (!bit_super_set(job_ptr->details->req_node_bitmap,
				     avail_bitmap))) ||
		    (job_req_node_filter(job_ptr, avail_bitmap))) {
			if (later_start) {
				job_ptr->start_time = 0;
				goto TRY_LATER;
			}
			/* Job can not start until too far in the future */
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = sched_start + backfill_window;
			continue;
		}

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		/* this is the time consuming operation */
		debug2("backfill: entering _try_sched for job %u.",
		       job_ptr->job_id);

		if (!already_counted) {
			slurmctld_diag_stats.bf_last_depth_try++;
			already_counted = true;
		}

		j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);

		now = time(NULL);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = 0;
			continue;	/* not runable */
		}

		if (start_res > job_ptr->start_time) {
			job_ptr->start_time = start_res;
			last_job_update = now;
		}
		if (job_ptr->start_time <= now) {
			uint32_t save_time_limit = job_ptr->time_limit;
			int rc = _start_job(job_ptr, resv_bitmap);
			if (qos_ptr &&
			    (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) {
				if (orig_time_limit == NO_VAL)
					job_ptr->time_limit = comp_time_limit;
				else
					job_ptr->time_limit = orig_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (job_ptr->time_limit * 60);
			} else if ((rc == SLURM_SUCCESS) &&
				   job_ptr->time_min) {
				/* Set time limit as high as possible */
				job_ptr->time_limit = comp_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (comp_time_limit * 60);
				_reset_job_time_limit(job_ptr, now,
						      node_space);
				time_limit = job_ptr->time_limit;
			} else {
				job_ptr->time_limit = orig_time_limit;
			}
			if (rc == ESLURM_ACCOUNTING_POLICY) {
				/* Unknown future start time, just skip job */
				job_ptr->start_time = 0;
				continue;
			} else if (rc != SLURM_SUCCESS) {
				/* Planned to start job, but something bad
				 * happened. */
				job_ptr->start_time = 0;
				break;
			} else {
				/* Started this job, move to next one */
				reject_array_job_id = 0;

				/* Update the database if job time limit
				 * changed and move to next job */
				if (save_time_limit != job_ptr->time_limit)
					jobacct_storage_g_job_start(
						acct_db_conn, job_ptr);
				continue;
			}
		} else
			job_ptr->time_limit = orig_time_limit;

		if (later_start && (job_ptr->start_time > later_start)) {
			/* Try later when some nodes currently reserved for
			 * pending jobs are free */
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		if (job_ptr->start_time > (sched_start + backfill_window)) {
			/* Starts too far in the future to worry about */
			continue;
		}

		if (node_space_recs >= max_backfill_job_cnt) {
			/* Already have too many jobs to deal with */
			break;
		}

		end_reserve = job_ptr->start_time + (time_limit * 60);
		if (_test_resv_overlap(node_space, avail_bitmap,
				       job_ptr->start_time, end_reserve)) {
			/* This job overlaps with an existing reservation for
			 * a job to be backfill scheduled, which the sched
			 * plugin does not know about. Try again later. */
			later_start = job_ptr->start_time;
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		/*
		 * Add reservation to scheduling table if appropriate
		 */
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
			continue;
		reject_array_job_id = 0;
		bit_not(avail_bitmap);
		_add_reservation(job_ptr->start_time, end_reserve,
				 avail_bitmap, node_space, &node_space_recs);
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_node_space_table(node_space);
	}
	xfree(uid);
	xfree(njobs);
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);
	for (i = 0; ; ) {
		FREE_NULL_BITMAP(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(node_space);
	list_destroy(job_queue);
	gettimeofday(&bf_time2, NULL);
	_do_diag_stats(&bf_time1, &bf_time2, yield_sleep);
	if (debug_flags & DEBUG_FLAG_BACKFILL) {
		END_TIMER;
		info("backfill: completed testing %d jobs, %s",
		     job_test_count, TIME_STR);
	}
	return rc;
}
/* Add the given job to the "active" structures of
 * the given partition and increment the run count */
static void _add_job_to_active(struct job_record *job_ptr,
			       struct gs_part *p_ptr)
{
	job_resources_t *job_res = job_ptr->job_resrcs;
	uint16_t job_gr_type;

	/* add job to active_resmap */
	job_gr_type = _get_part_gr_type(job_ptr->part_ptr);
	if ((job_gr_type == GS_CPU2) || (job_gr_type == GS_CORE) ||
	    (job_gr_type == GS_SOCKET)) {
		if (p_ptr->jobs_active == 0 && p_ptr->active_resmap) {
			uint32_t size = bit_size(p_ptr->active_resmap);
			bit_nclear(p_ptr->active_resmap, 0, size - 1);
		}
		add_job_to_cores(job_res, &(p_ptr->active_resmap),
				 gs_bits_per_node);
		if (job_gr_type == GS_SOCKET)
			_fill_sockets(job_res->node_bitmap, p_ptr);
	} else { /* GS_NODE or GS_CPU */
		if (!p_ptr->active_resmap) {
			if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
				info("gang: _add_job_to_active: job %u first",
				     job_ptr->job_id);
			}
			p_ptr->active_resmap =
				bit_copy(job_res->node_bitmap);
		} else if (p_ptr->jobs_active == 0) {
			if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
				info("gang: _add_job_to_active: job %u copied",
				     job_ptr->job_id);
			}
			bit_copybits(p_ptr->active_resmap,
				     job_res->node_bitmap);
		} else {
			if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
				info("gang: _add_job_to_active: adding job %u",
				     job_ptr->job_id);
			}
			bit_or(p_ptr->active_resmap, job_res->node_bitmap);
		}
	}

	/* add job to the active_cpus array */
	if (job_gr_type == GS_CPU) {
		uint32_t i, a, sz = bit_size(p_ptr->active_resmap);
		if (!p_ptr->active_cpus) {
			/* create active_cpus array */
			p_ptr->active_cpus = xmalloc(sz * sizeof(uint16_t));
		}
		if (p_ptr->jobs_active == 0) {
			/* overwrite the existing values in active_cpus */
			for (a = 0, i = 0; i < sz; i++) {
				if (bit_test(job_res->node_bitmap, i)) {
					p_ptr->active_cpus[i] =
						job_res->cpus[a++];
				} else {
					p_ptr->active_cpus[i] = 0;
				}
			}
		} else {
			/* add job to existing jobs in the active cpus */
			for (a = 0, i = 0; i < sz; i++) {
				if (bit_test(job_res->node_bitmap, i)) {
					uint16_t limit =
						_get_phys_bit_cnt(i);
					p_ptr->active_cpus[i] +=
						job_res->cpus[a++];
					/* when adding shadows, the resources
					 * may get overcommitted */
					if (p_ptr->active_cpus[i] > limit)
						p_ptr->active_cpus[i] = limit;
				}
			}
		}
	}
	p_ptr->jobs_active += 1;
}
/* Create a reservation for a job in the future */
static void _add_reservation(uint32_t start_time, uint32_t end_reserve,
			     bitstr_t *res_bitmap,
			     node_space_map_t *node_space,
			     int *node_space_recs)
{
	bool placed = false;
	int i, j;

	/* If we decrease the resolution of our timing information, this can
	 * decrease the number of records managed and increase performance */
	start_time = (start_time / backfill_resolution) *
		     backfill_resolution;
	end_reserve = (end_reserve / backfill_resolution) *
		      backfill_resolution;

	for (j = 0; ; ) {
		if (node_space[j].end_time > start_time) {
			/* insert start entry record */
			i = *node_space_recs;
			node_space[i].begin_time = start_time;
			node_space[i].end_time = node_space[j].end_time;
			node_space[j].end_time = start_time;
			node_space[i].avail_bitmap =
				bit_copy(node_space[j].avail_bitmap);
			node_space[i].next = node_space[j].next;
			node_space[j].next = i;
			(*node_space_recs)++;
			placed = true;
		}
		if (node_space[j].end_time == start_time) {
			/* no need to insert new start entry record */
			placed = true;
		}
		if (placed == true) {
			j = node_space[j].next;
			if (j && (end_reserve < node_space[j].end_time)) {
				/* insert end entry record */
				i = *node_space_recs;
				node_space[i].begin_time = end_reserve;
				node_space[i].end_time =
					node_space[j].end_time;
				node_space[j].end_time = end_reserve;
				node_space[i].avail_bitmap =
					bit_copy(node_space[j].avail_bitmap);
				node_space[i].next = node_space[j].next;
				node_space[j].next = i;
				(*node_space_recs)++;
			}
			break;
		}
		if ((j = node_space[j].next) == 0)
			break;
	}

	for (j = 0; ; ) {
		if ((node_space[j].begin_time >= start_time) &&
		    (node_space[j].end_time <= end_reserve))
			bit_and(node_space[j].avail_bitmap, res_bitmap);
		if ((node_space[j].begin_time >= end_reserve) ||
		    ((j = node_space[j].next) == 0))
			break;
	}
}
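The only difference from the earlier _add_reservation() is the rounding step, which quantizes both timestamps down to backfill_resolution so that reservations starting close together share node_space records. A standalone illustration of the bucketing arithmetic, with a hypothetical 60-second resolution:

#include <stdio.h>

int main(void)
{
	unsigned backfill_resolution = 60;	/* seconds, illustrative */
	unsigned times[] = { 1001, 1030, 1059, 1060, 1119 };
	unsigned i;

	for (i = 0; i < sizeof(times) / sizeof(times[0]); i++) {
		unsigned rounded = (times[i] / backfill_resolution) *
				   backfill_resolution;
		/* 1030, 1059 and 1060 all collapse into the 1020 bucket */
		printf("%u -> %u\n", times[i], rounded);
	}
	return 0;
}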
/* Attempt to schedule a specific job on specific available nodes
 * IN job_ptr - job to schedule
 * IN/OUT avail_bitmap - nodes available/selected to use
 * IN exc_core_bitmap - cores which can not be used
 * RET SLURM_SUCCESS on success, otherwise an error code
 */
static int _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap,
		      uint32_t min_nodes, uint32_t max_nodes,
		      uint32_t req_nodes, bitstr_t *exc_core_bitmap)
{
	bitstr_t *tmp_bitmap;
	int rc = SLURM_SUCCESS;
	int feat_cnt = _num_feature_count(job_ptr);
	List preemptee_candidates = NULL;

	if (feat_cnt) {
		/* Ideally schedule the job feature by feature,
		 * but I don't want to add that complexity here
		 * right now, so clear the feature counts and try
		 * to schedule. This will work if there is only
		 * one feature count. It should work fairly well
		 * in cases where there are multiple feature
		 * counts. */
		struct job_details *detail_ptr = job_ptr->details;
		ListIterator feat_iter;
		struct feature_record *feat_ptr;
		int i = 0, list_size;
		uint16_t *feat_cnt_orig = NULL, high_cnt = 0;

		/* Clear the feature counts */
		list_size = list_count(detail_ptr->feature_list);
		feat_cnt_orig = xmalloc(sizeof(uint16_t) * list_size);
		feat_iter = list_iterator_create(detail_ptr->feature_list);
		while ((feat_ptr = (struct feature_record *)
				list_next(feat_iter))) {
			high_cnt = MAX(high_cnt, feat_ptr->count);
			feat_cnt_orig[i++] = feat_ptr->count;
			feat_ptr->count = 0;
		}
		list_iterator_destroy(feat_iter);

		if ((job_req_node_filter(job_ptr, *avail_bitmap) !=
		     SLURM_SUCCESS) ||
		    (bit_set_count(*avail_bitmap) < high_cnt)) {
			rc = ESLURM_NODES_BUSY;
		} else {
			preemptee_candidates =
				slurm_find_preemptable_jobs(job_ptr);
			rc = select_g_job_test(job_ptr, *avail_bitmap,
					       high_cnt, max_nodes,
					       req_nodes,
					       SELECT_MODE_WILL_RUN,
					       preemptee_candidates, NULL,
					       exc_core_bitmap);
		}

		/* Restore the feature counts */
		i = 0;
		feat_iter = list_iterator_create(detail_ptr->feature_list);
		while ((feat_ptr = (struct feature_record *)
				list_next(feat_iter))) {
			feat_ptr->count = feat_cnt_orig[i++];
		}
		list_iterator_destroy(feat_iter);
		xfree(feat_cnt_orig);
	} else {
		/* Try to schedule the job. First on dedicated nodes
		 * then on shared nodes (if so configured). */
		uint16_t orig_shared;
		time_t now = time(NULL);
		char str[100];

		preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);
		orig_shared = job_ptr->details->shared;
		job_ptr->details->shared = 0;
		tmp_bitmap = bit_copy(*avail_bitmap);

		if (exc_core_bitmap) {
			bit_fmt(str, (sizeof(str) - 1), exc_core_bitmap);
			debug2(" _try_sched with exclude core bitmap: %s",
			       str);
		}

		rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes,
				       max_nodes, req_nodes,
				       SELECT_MODE_WILL_RUN,
				       preemptee_candidates, NULL,
				       exc_core_bitmap);
		job_ptr->details->shared = orig_shared;

		if (((rc != SLURM_SUCCESS) ||
		     (job_ptr->start_time > now)) && (orig_shared != 0)) {
			FREE_NULL_BITMAP(*avail_bitmap);
			*avail_bitmap = tmp_bitmap;
			rc = select_g_job_test(job_ptr, *avail_bitmap,
					       min_nodes, max_nodes,
					       req_nodes,
					       SELECT_MODE_WILL_RUN,
					       preemptee_candidates, NULL,
					       exc_core_bitmap);
		} else
			FREE_NULL_BITMAP(tmp_bitmap);
	}

	if (preemptee_candidates)
		list_destroy(preemptee_candidates);

	return rc;
}
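The non-feature branch above follows a save/try-strict/fall-back pattern: snapshot the input with bit_copy(), temporarily force the stricter setting (sharing off), and if the strict pass fails, restore the snapshot and retry with the original setting. A toy sketch of the control flow in plain C; try_schedule() is a made-up stand-in for select_g_job_test():

#include <stdbool.h>
#include <stdio.h>

/* illustrative policy: dedicated scheduling needs more free nodes */
static bool try_schedule(int shared, int avail_nodes)
{
	return shared ? (avail_nodes >= 1) : (avail_nodes >= 4);
}

int main(void)
{
	int orig_shared = 1;		/* job allows sharing */
	int avail_nodes = 2;
	int saved_nodes = avail_nodes;	/* bit_copy(*avail_bitmap) analogue */
	int shared = 0;			/* first pass: dedicated nodes only */
	bool ok = try_schedule(shared, avail_nodes);

	shared = orig_shared;		/* restore the job's setting */
	if (!ok && orig_shared) {
		avail_nodes = saved_nodes;	/* restore clobbered input */
		ok = try_schedule(shared, avail_nodes);
	}
	printf("scheduled: %s\n", ok ? "yes" : "no");
	return 0;
}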
/*
 * _build_part_bitmap - update the total_cpus, total_nodes, and node_bitmap
 *	for the specified partition, also reset the partition pointers in
 *	the node back to this partition.
 * IN part_ptr - pointer to the partition
 * RET 0 if no error, errno otherwise
 * global: node_record_table_ptr - pointer to global node table
 * NOTE: this does not report nodes defined in more than one partition. this
 *	is checked only upon reading the configuration file, not on an update
 */
static int _build_part_bitmap(struct part_record *part_ptr)
{
	char *this_node_name;
	bitstr_t *old_bitmap;
	struct node_record *node_ptr;	/* pointer to node_record */
	hostlist_t host_list;

	part_ptr->total_cpus = 0;
	part_ptr->total_nodes = 0;

	if (part_ptr->node_bitmap == NULL) {
		part_ptr->node_bitmap = bit_alloc(node_record_count);
		old_bitmap = NULL;
	} else {
		old_bitmap = bit_copy(part_ptr->node_bitmap);
		bit_nclear(part_ptr->node_bitmap, 0,
			   node_record_count - 1);
	}

	if (part_ptr->nodes == NULL) {	/* no nodes in partition */
		_unlink_free_nodes(old_bitmap, part_ptr);
		FREE_NULL_BITMAP(old_bitmap);
		return 0;
	}

	if ((host_list = hostlist_create(part_ptr->nodes)) == NULL) {
		FREE_NULL_BITMAP(old_bitmap);
		error("hostlist_create error on %s, %m", part_ptr->nodes);
		return ESLURM_INVALID_NODE_NAME;
	}

	while ((this_node_name = hostlist_shift(host_list))) {
		node_ptr = find_node_record(this_node_name);
		if (node_ptr == NULL) {
			error("_build_part_bitmap: invalid node name %s",
			      this_node_name);
			free(this_node_name);
			FREE_NULL_BITMAP(old_bitmap);
			hostlist_destroy(host_list);
			return ESLURM_INVALID_NODE_NAME;
		}
		part_ptr->total_nodes++;
		if (slurmctld_conf.fast_schedule)
			part_ptr->total_cpus += node_ptr->config_ptr->cpus;
		else
			part_ptr->total_cpus += node_ptr->cpus;
		node_ptr->part_cnt++;
		xrealloc(node_ptr->part_pptr, (node_ptr->part_cnt *
			 sizeof(struct part_record *)));
		node_ptr->part_pptr[node_ptr->part_cnt - 1] = part_ptr;
		if (old_bitmap)
			bit_clear(old_bitmap,
				  (int) (node_ptr - node_record_table_ptr));
		bit_set(part_ptr->node_bitmap,
			(int) (node_ptr - node_record_table_ptr));
		free(this_node_name);
	}
	hostlist_destroy(host_list);

	_unlink_free_nodes(old_bitmap, part_ptr);
	last_node_update = time(NULL);
	FREE_NULL_BITMAP(old_bitmap);
	return 0;
}
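The old_bitmap logic above is a membership-diff idiom: keep a bit_copy() of the old membership, clear the bit for every node still present, and whatever remains set is exactly the set of nodes that left the partition (which _unlink_free_nodes() then processes). The arithmetic in miniature, with a 64-bit word standing in for the bitmap:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t old_bitmap = 0x0f;	/* nodes 0-3 were members */
	unsigned new_members[] = { 1, 2 };
	unsigned i;

	/* bit_clear() each node that is still a member */
	for (i = 0; i < 2; i++)
		old_bitmap &= ~(1ull << new_members[i]);

	/* remaining set bits are the departed nodes: 0 and 3 */
	printf("departed nodes bitmap: %#llx\n",
	       (unsigned long long)old_bitmap);	/* prints 0x9 */
	return 0;
}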
/*
 * Attempt to start a job
 * jobid     (IN) - job id
 * task_cnt  (IN) - total count of tasks to start
 * hostlist  (IN) - SLURM hostlist expression with no repeated hostnames
 * tasklist  (IN/OUT) - comma separated list of hosts with tasks to be
 *                      started, list hostname once per task to start
 * comment_ptr (IN) - new comment field for the job or NULL for no change
 * err_code (OUT) - Moab error code
 * err_msg  (OUT) - Moab error message
 */
static int _start_job(uint32_t jobid, int task_cnt, char *hostlist,
		      char *tasklist, char *comment_ptr,
		      int *err_code, char **err_msg)
{
	int rc = 0, old_task_cnt = 1;
	struct job_record *job_ptr;
	/* Write lock on job info, read lock on node info */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
	char *new_node_list = NULL;
	static char tmp_msg[128];
	bitstr_t *new_bitmap = (bitstr_t *) NULL;
	bitstr_t *save_req_bitmap = (bitstr_t *) NULL;
	bitoff_t i, bsize;
	int ll;	/* layout info index */
	char *node_name, *node_idx, *node_cur, *save_req_nodes = NULL;
	size_t node_name_len;
	static uint32_t cr_test = 0, cr_enabled = 0;

	if (cr_test == 0) {
		select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL,
					      &cr_enabled);
		cr_test = 1;
	}

	lock_slurmctld(job_write_lock);
	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		*err_code = -700;
		*err_msg = "No such job";
		error("wiki: Failed to find job %u", jobid);
		rc = -1;
		goto fini;
	}

	if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) {
		*err_code = -700;
		*err_msg = "Job not pending, can't start";
		error("wiki: Attempt to start job %u in state %s",
		      jobid, job_state_string(job_ptr->job_state));
		rc = -1;
		goto fini;
	}

	if (comment_ptr) {
		char *reserved = strstr(comment_ptr, "RESERVED:");
		if (reserved) {
			reserved += 9;
			job_ptr->details->reserved_resources =
				strtol(reserved, NULL, 10);
		}
		xfree(job_ptr->comment);
		job_ptr->comment = xstrdup(comment_ptr);
	}

	if (task_cnt) {
		new_node_list = xstrdup(hostlist);
		if (node_name2bitmap(new_node_list, false,
				     &new_bitmap) != 0) {
			*err_code = -700;
			*err_msg = "Invalid TASKLIST";
			error("wiki: Attempt to set invalid node list for "
			      "job %u, %s", jobid, hostlist);
			xfree(new_node_list);
			rc = -1;
			goto fini;
		}

		if (!bit_super_set(new_bitmap, avail_node_bitmap)) {
			/* Selected node is UP and not responding
			 * or it just went DOWN */
			*err_code = -700;
			*err_msg = "TASKLIST includes non-responsive node";
			error("wiki: Attempt to use non-responsive nodes for "
			      "job %u, %s", jobid, hostlist);
			xfree(new_node_list);
			FREE_NULL_BITMAP(new_bitmap);
			rc = -1;
			goto fini;
		}

		/* User excluded node list incompatible with Wiki
		 * Exclude all nodes not explicitly requested */
		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
		job_ptr->details->exc_node_bitmap = bit_copy(new_bitmap);
		bit_not(job_ptr->details->exc_node_bitmap);
	}

	/* Build layout information from tasklist (assuming that Moab
	 * sends a non-bracketed list of nodes, repeated as many times
	 * as cpus should be used per node); at this point, node names
	 * are comma-separated. This is _not_ a fast algorithm as it
	 * performs many string compares. */
	xfree(job_ptr->details->req_node_layout);
	if (task_cnt && cr_enabled) {
		uint16_t cpus_per_task = MAX(1,
					     job_ptr->details->cpus_per_task);
		job_ptr->details->req_node_layout = (uint16_t *)
			xmalloc(bit_set_count(new_bitmap) *
				sizeof(uint16_t));
		bsize = bit_size(new_bitmap);
		for (i = 0, ll = -1; i < bsize; i++) {
			if (!bit_test(new_bitmap, i))
				continue;
			ll++;
			node_name = node_record_table_ptr[i].name;
			node_name_len = strlen(node_name);
			if (node_name_len == 0)
				continue;
			node_cur = tasklist;
			while (*node_cur) {
				if ((node_idx = strstr(node_cur,
						       node_name))) {
					if ((node_idx[node_name_len] ==
					     ',') ||
					    (node_idx[node_name_len] ==
					     '\0')) {
						job_ptr->details->
							req_node_layout[ll] +=
							cpus_per_task;
					}
					node_cur = strchr(node_idx, ',');
					if (node_cur)
						continue;
				}
				break;
			}
		}
	}

	/* save and update job state to start now */
	save_req_nodes = job_ptr->details->req_nodes;
	job_ptr->details->req_nodes = new_node_list;
	save_req_bitmap = job_ptr->details->req_node_bitmap;
	job_ptr->details->req_node_bitmap = new_bitmap;
	old_task_cnt = job_ptr->details->min_cpus;
	job_ptr->details->min_cpus = MAX(task_cnt, old_task_cnt);
	job_ptr->priority = 100000000;

fini:
	unlock_slurmctld(job_write_lock);
	if (rc)
		return rc;

	/* No errors so far */
	(void) schedule(INFINITE);	/* provides own locking */

	/* Check to insure the job was actually started */
	lock_slurmctld(job_write_lock);
	if (job_ptr->job_id != jobid)
		job_ptr = find_job_record(jobid);

	if (job_ptr && (job_ptr->job_id == jobid) &&
	    (!IS_JOB_RUNNING(job_ptr))) {
		uint16_t wait_reason = 0;
		char *wait_string;

		if (IS_JOB_FAILED(job_ptr))
			wait_string = "Invalid request, job aborted";
		else {
			wait_reason = job_ptr->state_reason;
			if (wait_reason == WAIT_HELD) {
				/* some job is completing, slurmctld did
				 * not even try to schedule this job */
				wait_reason = WAIT_RESOURCES;
			}
			wait_string = job_reason_string(wait_reason);
			job_ptr->state_reason = WAIT_HELD;
			xfree(job_ptr->state_desc);
		}
		*err_code = -910 - wait_reason;
		snprintf(tmp_msg, sizeof(tmp_msg),
			 "Could not start job %u(%s): %s",
			 jobid, new_node_list, wait_string);
		*err_msg = tmp_msg;
		error("wiki: %s", tmp_msg);

		/* restore some of job state */
		job_ptr->priority = 0;
		job_ptr->details->min_cpus = old_task_cnt;
		rc = -1;
	}

	if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details) {
		/* Restore required node list in case job requeued */
		xfree(job_ptr->details->req_nodes);
		job_ptr->details->req_nodes = save_req_nodes;
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
		job_ptr->details->req_node_bitmap = save_req_bitmap;
		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
		xfree(job_ptr->details->req_node_layout);
	} else {
		error("wiki: start_job(%u) job missing", jobid);
		xfree(save_req_nodes);
		FREE_NULL_BITMAP(save_req_bitmap);
	}

	unlock_slurmctld(job_write_lock);
	schedule_node_save();	/* provides own locking */
	schedule_job_save();	/* provides own locking */

	return rc;
}
extern int select_nodeinfo_get(select_nodeinfo_t *nodeinfo,
			       enum select_nodedata_type dinfo,
			       enum node_states state, void *data)
{
	int rc = SLURM_SUCCESS;
	uint16_t *uint16 = (uint16_t *) data;
	uint32_t *uint32 = (uint32_t *) data;
	bitstr_t **bitmap = (bitstr_t **) data;
	char **tmp_char = (char **) data;
	ListIterator itr = NULL;
	node_subgrp_t *subgrp = NULL;

	if (nodeinfo == NULL) {
		error("get_nodeinfo: nodeinfo not set");
		return SLURM_ERROR;
	}

	if (nodeinfo->magic != NODEINFO_MAGIC) {
		error("get_nodeinfo: nodeinfo magic bad");
		return SLURM_ERROR;
	}

	switch (dinfo) {
	case SELECT_NODEDATA_BITMAP_SIZE:
		*uint16 = nodeinfo->bitmap_size;
		break;
	case SELECT_NODEDATA_SUBGRP_SIZE:
		*uint16 = 0;
		if (!nodeinfo->subgrp_list)
			return SLURM_ERROR;
		*uint16 = list_count(nodeinfo->subgrp_list);
		break;
	case SELECT_NODEDATA_SUBCNT:
		*uint16 = 0;
		if (!nodeinfo->subgrp_list)
			return SLURM_ERROR;
		itr = list_iterator_create(nodeinfo->subgrp_list);
		while ((subgrp = list_next(itr))) {
			if (subgrp->state == state) {
				*uint16 = subgrp->cnode_cnt;
				break;
			}
		}
		list_iterator_destroy(itr);
		break;
	case SELECT_NODEDATA_BITMAP:
		*bitmap = NULL;
		if (!nodeinfo->subgrp_list)
			return SLURM_ERROR;
		itr = list_iterator_create(nodeinfo->subgrp_list);
		while ((subgrp = list_next(itr))) {
			if (subgrp->state == state) {
				*bitmap = bit_copy(subgrp->bitmap);
				break;
			}
		}
		list_iterator_destroy(itr);
		break;
	case SELECT_NODEDATA_RACK_MP:
		if (nodeinfo->ba_mp)
			*tmp_char = xstrdup(nodeinfo->ba_mp->loc);
		else if (nodeinfo->rack_mp)
			*tmp_char = xstrdup(nodeinfo->rack_mp);
		break;
	case SELECT_NODEDATA_STR:
		*tmp_char = NULL;
		if (!nodeinfo->subgrp_list)
			return SLURM_ERROR;
		itr = list_iterator_create(nodeinfo->subgrp_list);
		while ((subgrp = list_next(itr))) {
			if (subgrp->state == state) {
				*tmp_char = xstrdup(subgrp->str);
				break;
			}
		}
		list_iterator_destroy(itr);
		break;
	case SELECT_NODEDATA_EXTRA_INFO:
		if (nodeinfo->extra_info)
			*tmp_char = xstrdup(nodeinfo->extra_info);
		if (nodeinfo->failed_cnodes)
			xstrfmtcat(*tmp_char, "Failed cnodes=%s",
				   nodeinfo->failed_cnodes);
		break;
	case SELECT_NODEDATA_MEM_ALLOC:
		*uint32 = 0;
		break;
	default:
		error("Unsupported option %d for get_nodeinfo.", dinfo);
		rc = SLURM_ERROR;
		break;
	}

	return rc;
}
extern bg_record_t *create_small_record(bg_record_t *bg_record,
					bitstr_t *ionodes, int size)
{
	bg_record_t *found_record = NULL;
	ba_mp_t *new_ba_mp = NULL;
	ba_mp_t *ba_mp = NULL;

	found_record = (bg_record_t *) xmalloc(sizeof(bg_record_t));
	found_record->magic = BLOCK_MAGIC;

	/* This will be a list containing jobs running on this block */
	if (bg_conf->sub_blocks)
		found_record->job_list = list_create(NULL);
	found_record->job_running = NO_JOB_RUNNING;

#ifdef HAVE_BGL
	found_record->node_use = SELECT_COPROCESSOR_MODE;
	found_record->blrtsimage = xstrdup(bg_record->blrtsimage);
#endif
#ifdef HAVE_BG_L_P
	found_record->linuximage = xstrdup(bg_record->linuximage);
	found_record->ramdiskimage = xstrdup(bg_record->ramdiskimage);
#endif
	found_record->mloaderimage = xstrdup(bg_record->mloaderimage);

	if (bg_record->conn_type[0] >= SELECT_SMALL)
		found_record->conn_type[0] = bg_record->conn_type[0];
	else
		found_record->conn_type[0] = SELECT_SMALL;

	xassert(bg_conf->cpu_ratio);
	found_record->cpu_cnt = bg_conf->cpu_ratio * size;
	found_record->cnode_cnt = size;

	found_record->ionode_bitmap = bit_copy(ionodes);
	ba_set_ionode_str(found_record);

	found_record->ba_mp_list = list_create(destroy_ba_mp);

	slurm_mutex_lock(&ba_system_mutex);
	if (bg_record->ba_mp_list)
		ba_mp = list_peek(bg_record->ba_mp_list);
	if (!ba_mp) {
		if (bg_record->mp_str) {
			int j = 0, dim;
			char *nodes = bg_record->mp_str;
			uint16_t coords[SYSTEM_DIMENSIONS];
			while (nodes[j] != '\0') {
				if ((nodes[j] >= '0' && nodes[j] <= '9') ||
				    (nodes[j] >= 'A' && nodes[j] <= 'Z')) {
					break;
				}
				j++;
			}
			if (nodes[j] &&
			    ((strlen(nodes) -
			      (j + SYSTEM_DIMENSIONS)) >= 0)) {
				for (dim = 0; dim < SYSTEM_DIMENSIONS;
				     dim++, j++)
					coords[dim] = select_char2coord(
						nodes[j]);
				ba_mp = coord2ba_mp(coords);
			}
			error("you gave me a list with no ba_mps using %s",
			      ba_mp->coord_str);
		} else {
			ba_mp = coord2ba_mp(found_record->start);
			error("you gave me a record with no ba_mps "
			      "and no nodes either using %s",
			      ba_mp->coord_str);
		}
	}
	xassert(ba_mp);
	new_ba_mp = ba_copy_mp(ba_mp);
	slurm_mutex_unlock(&ba_system_mutex);
	/* We need to have this node wrapped in Q to handle
	   wires correctly when creating around the midplane. */
	ba_setup_mp(new_ba_mp, false, true);

	new_ba_mp->used = BA_MP_USED_TRUE;

	/* Create these now so we can deal with error cnodes if/when
	   they happen.  Since this is the easiest place to figure it
	   out for blocks that don't use the entire block */
	if ((new_ba_mp->cnode_bitmap =
	     ba_create_ba_mp_cnode_bitmap(found_record))) {
		new_ba_mp->cnode_err_bitmap =
			bit_alloc(bg_conf->mp_cnode_cnt);
		new_ba_mp->cnode_usable_bitmap =
			bit_copy(new_ba_mp->cnode_bitmap);
	}

	list_append(found_record->ba_mp_list, new_ba_mp);
	found_record->mp_count = 1;
	found_record->mp_str = xstrdup_printf(
		"%s%s",
		bg_conf->slurm_node_prefix, new_ba_mp->coord_str);

	process_nodes(found_record, false);

	/* Force small blocks to always be non-full system blocks.
	 * This really only plays a part on sub-midplane systems. */
	found_record->full_block = 0;

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		info("made small block of %s[%s]",
		     found_record->mp_str, found_record->ionode_str);

	return found_record;
}
/*
 * route_p_split_hostlist - logic to split an input hostlist into
 *                          a set of hostlists to forward to.
 *
 * IN: hl        - hostlist_t   - list of every node to send message to
 *                                will be empty on return;
 * OUT: sp_hl    - hostlist_t** - the array of hostlists that will be malloced
 * OUT: count    - int*         - the count of created hostlists
 * RET: SLURM_SUCCESS - int
 *
 * Note: created hostlist will have to be freed independently using
 *       hostlist_destroy by the caller.
 * Note: the hostlist_t array will have to be xfree.
 */
extern int route_p_split_hostlist(hostlist_t hl, hostlist_t **sp_hl,
				  int *count)
{
	int i, j, k, hl_ndx, msg_count, sw_count, lst_count;
	char *buf;
	bitstr_t *nodes_bitmap = NULL;	/* nodes in message list */
	bitstr_t *fwd_bitmap = NULL;	/* nodes in forward list */

	msg_count = hostlist_count(hl);
	if (switch_record_cnt == 0) {
		/* configs have not already been processed */
		slurm_conf_init(NULL);
		if (init_node_conf())
			fatal("ROUTE: Failed to init slurm config");
		if (build_all_nodeline_info(false))
			fatal("ROUTE: Failed to build node config");
		rehash_node();
		if (slurm_topo_build_config() != SLURM_SUCCESS)
			fatal("ROUTE: Failed to build topology config");
	}
	*sp_hl = (hostlist_t *) xmalloc(switch_record_cnt *
					sizeof(hostlist_t));

	/* create bitmap of nodes to send message to */
	if (hostlist2bitmap(hl, false, &nodes_bitmap) != SLURM_SUCCESS) {
		buf = hostlist_ranged_string_xmalloc(hl);
		fatal("ROUTE: Failed to make bitmap from hostlist=%s.", buf);
	}

	/* Find lowest level switch containing all the nodes in the list */
	j = 0;
	for (i = 0; i <= switch_levels; i++) {
		for (j = 0; j < switch_record_cnt; j++) {
			if (switch_record_table[j].level == i) {
				if (bit_super_set(nodes_bitmap,
						  switch_record_table[j].
						  node_bitmap)) {
					/* All nodes in message list are in
					 * this switch */
					break;
				}
			}
		}
		if (j < switch_record_cnt) {
			/* Got here via break after bit_super_set */
			break;	/* 'j' is our switch */
		}
		/* else, no switches at this level reach all nodes */
	}

	if (i > switch_levels) {
		/* This can only happen if trying to schedule multiple
		 * physical clusters as a single logical cluster under the
		 * control of a single slurmctld daemon, and sending
		 * something like a node_registration request to all nodes.
		 * Revert to default behavior */
		if (debug_flags & DEBUG_FLAG_ROUTE) {
			buf = hostlist_ranged_string_xmalloc(hl);
			debug("ROUTE: didn't find switch containing nodes=%s",
			      buf);
			xfree(buf);
		}
		FREE_NULL_BITMAP(nodes_bitmap);
		xfree(*sp_hl);
		return route_split_hostlist_treewidth(hl, sp_hl, count);
	}

	if (switch_record_table[j].level == 0) {
		/* This is a leaf switch. Construct list based on TreeWidth */
		FREE_NULL_BITMAP(nodes_bitmap);
		xfree(*sp_hl);
		return route_split_hostlist_treewidth(hl, sp_hl, count);
	}

	/* loop through children, constructing a hostlist for each child
	 * switch with nodes in the message list */
	hl_ndx = 0;
	lst_count = 0;
	for (i = 0; i < switch_record_table[j].num_switches; i++) {
		k = switch_record_table[j].switch_index[i];
		fwd_bitmap = bit_copy(switch_record_table[k].node_bitmap);
		bit_and(fwd_bitmap, nodes_bitmap);
		sw_count = bit_set_count(fwd_bitmap);
		if (sw_count == 0) {
			/* no nodes on this switch in message list */
			continue;
		}
		(*sp_hl)[hl_ndx] = bitmap2hostlist(fwd_bitmap);
		/* Now remove nodes from this switch from message list */
		bit_not(fwd_bitmap);
		bit_and(nodes_bitmap, fwd_bitmap);
		FREE_NULL_BITMAP(fwd_bitmap);
		if (debug_flags & DEBUG_FLAG_ROUTE) {
			buf = hostlist_ranged_string_xmalloc(
				(*sp_hl)[hl_ndx]);
			debug("ROUTE: ... sublist[%d] switch=%s :: %s",
			      i, switch_record_table[i].name, buf);
			xfree(buf);
		}
		hl_ndx++;
		lst_count += sw_count;
		if (lst_count == msg_count)
			break;	/* all nodes in message are in a child list */
	}
	FREE_NULL_BITMAP(nodes_bitmap);

	*count = hl_ndx;
	return SLURM_SUCCESS;
}
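The per-child loop above removes forwarded nodes from the pending set with bit_not() followed by bit_and(), i.e. a bitmap set-difference. With flat machine words that is simply "pending &= ~forwarded", shown here in a self-contained form:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t nodes_bitmap = 0xff;	/* nodes still needing the message */
	uint64_t switch_nodes = 0x3c;	/* nodes under one child switch   */

	/* bit_copy() + bit_and(): nodes to forward via this child */
	uint64_t fwd_bitmap = switch_nodes & nodes_bitmap;

	/* bit_not() + bit_and(): drop forwarded nodes from the pending set */
	nodes_bitmap &= ~fwd_bitmap;

	printf("forward: %#llx, remaining: %#llx\n",
	       (unsigned long long)fwd_bitmap,
	       (unsigned long long)nodes_bitmap);	/* 0x3c and 0xc3 */
	return 0;
}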
int main(int argc, char *argv[])
{
	note("Testing static decl");
	{
		bitstr_t bit_decl(bs, 65);
		/*bitstr_t *bsp = bs;*/

		bit_set(bs, 9);
		bit_set(bs, 14);
		TEST(bit_test(bs, 9), "bit 9 set");
		TEST(!bit_test(bs, 12), "bit 12 not set");
		TEST(bit_test(bs, 14), "bit 14 set");
		/*bit_free(bsp);*/	/* triggers TEST in bit_free - OK */
	}

	note("Testing basic vixie functions");
	{
		bitstr_t *bs = bit_alloc(16), *bs2;

		/*bit_set(bs, 42);*/	/* triggers TEST in bit_set - OK */
		bit_set(bs, 9);
		bit_set(bs, 14);
		TEST(bit_test(bs, 9), "bit 9 set");
		TEST(!bit_test(bs, 12), "bit 12 not set");
		TEST(bit_test(bs, 14), "bit 14 set");

		bs2 = bit_copy(bs);
		bit_fill_gaps(bs2);
		TEST(bit_ffs(bs2) == 9, "first bit set = 9 ");
		TEST(bit_fls(bs2) == 14, "last bit set = 14");
		TEST(bit_set_count(bs2) == 6, "bitstring");
		TEST(bit_test(bs2, 12), "bitstring");
		TEST(bit_super_set(bs, bs2) == 1, "bitstring");
		TEST(bit_super_set(bs2, bs) == 0, "bitstring");

		bit_clear(bs, 14);
		TEST(!bit_test(bs, 14), "bitstring");

		bit_nclear(bs, 9, 14);
		TEST(!bit_test(bs, 9), "bitstring");
		TEST(!bit_test(bs, 12), "bitstring");
		TEST(!bit_test(bs, 14), "bitstring");

		bit_nset(bs, 9, 14);
		TEST(bit_test(bs, 9), "bitstring");
		TEST(bit_test(bs, 12), "bitstring");
		TEST(bit_test(bs, 14), "bitstring");

		TEST(bit_ffs(bs) == 9, "ffs");
		TEST(bit_ffc(bs) == 0, "ffc");
		bit_nset(bs, 0, 8);
		TEST(bit_ffc(bs) == 15, "ffc");

		bit_free(bs);
		/*bit_set(bs,9); */	/* triggers TEST in bit_set - OK */
	}

	note("Testing and/or/not");
	{
		bitstr_t *bs1 = bit_alloc(128);
		bitstr_t *bs2 = bit_alloc(128);

		bit_set(bs1, 100);
		bit_set(bs1, 104);
		bit_set(bs2, 100);
		bit_and(bs1, bs2);
		TEST(bit_test(bs1, 100), "and");
		TEST(!bit_test(bs1, 104), "and");

		bit_set(bs2, 110);
		bit_set(bs2, 111);
		bit_set(bs2, 112);
		bit_or(bs1, bs2);
		TEST(bit_test(bs1, 100), "or");
		TEST(bit_test(bs1, 110), "or");
		TEST(bit_test(bs1, 111), "or");
		TEST(bit_test(bs1, 112), "or");

		bit_not(bs1);
		TEST(!bit_test(bs1, 100), "not");
		TEST(bit_test(bs1, 12), "not");

		bit_free(bs1);
		bit_free(bs2);
	}

	note("testing bit selection");
	{
		bitstr_t *bs1 = bit_alloc(128), *bs2;

		bit_set(bs1, 21);
		bit_set(bs1, 100);
		bit_fill_gaps(bs1);
		bs2 = bit_pick_cnt(bs1, 20);
		if (bs2) {
			TEST(bit_set_count(bs2) == 20, "pick");
			TEST(bit_ffs(bs2) == 21, "pick");
			TEST(bit_fls(bs2) == 40, "pick");
			bit_free(bs2);
		} else
			TEST(0, "alloc fail");
		bit_free(bs1);
	}

	note("Testing realloc");
	{
		bitstr_t *bs = bit_alloc(1);

		TEST(bit_ffs(bs) == -1, "bitstring");
		bit_set(bs, 0);
		/*bit_set(bs, 1000);*/	/* triggers TEST in bit_set - OK */
		bs = bit_realloc(bs, 1048576);
		bit_set(bs, 1000);
		bit_set(bs, 1048575);
		TEST(bit_test(bs, 0), "bitstring");
		TEST(bit_test(bs, 1000), "bitstring");
		TEST(bit_test(bs, 1048575), "bitstring");
		TEST(bit_set_count(bs) == 3, "bitstring");
		bit_clear(bs, 0);
		bit_clear(bs, 1000);
		TEST(bit_set_count(bs) == 1, "bitstring");
		TEST(bit_ffs(bs) == 1048575, "bitstring");
		bit_free(bs);
	}

	note("Testing bit_fmt");
	{
		char tmpstr[1024];
		bitstr_t *bs = bit_alloc(1024);

		TEST(!strcmp(bit_fmt(tmpstr, sizeof(tmpstr), bs), ""),
		     "bitstring");
		bit_set(bs, 42);
		TEST(!strcmp(bit_fmt(tmpstr, sizeof(tmpstr), bs), "42"),
		     "bitstring");
		bit_set(bs, 102);
		TEST(!strcmp(bit_fmt(tmpstr, sizeof(tmpstr), bs), "42,102"),
		     "bitstring");
		bit_nset(bs, 9, 14);
		TEST(!strcmp(bit_fmt(tmpstr, sizeof(tmpstr), bs),
			     "9-14,42,102"), "bitstring");
	}

	note("Testing bit_nffc/bit_nffs");
	{
		bitstr_t *bs = bit_alloc(1024);

		bit_set(bs, 2);
		bit_set(bs, 6);
		bit_set(bs, 7);
		bit_nset(bs, 12, 1018);

		TEST(bit_nffc(bs, 2) == 0, "bitstring");
		TEST(bit_nffc(bs, 3) == 3, "bitstring");
		TEST(bit_nffc(bs, 4) == 8, "bitstring");
		TEST(bit_nffc(bs, 5) == 1019, "bitstring");
		TEST(bit_nffc(bs, 6) == -1, "bitstring");

		TEST(bit_nffs(bs, 1) == 2, "bitstring");
		TEST(bit_nffs(bs, 2) == 6, "bitstring");
		TEST(bit_nffs(bs, 100) == 12, "bitstring");
		TEST(bit_nffs(bs, 1023) == -1, "bitstring");

		bit_free(bs);
	}

	note("Testing bit_unfmt");
	{
		bitstr_t *bs = bit_alloc(1024);
		bitstr_t *bs2 = bit_alloc(1024);
		char tmpstr[4096];

		bit_set(bs, 1);
		bit_set(bs, 3);
		bit_set(bs, 30);
		bit_nset(bs, 42, 64);
		bit_nset(bs, 97, 1000);

		bit_fmt(tmpstr, sizeof(tmpstr), bs);
		TEST(bit_unfmt(bs2, tmpstr) != -1, "bitstring");
		TEST(bit_equal(bs, bs2), "bitstring");
	}

	totals();
	return failed;
}
extern bg_record_t *create_small_record(bg_record_t *bg_record,
					bitstr_t *ionodes, int size)
{
	bg_record_t *found_record = NULL;
	ba_mp_t *new_ba_mp = NULL;
	ba_mp_t *ba_mp = NULL;
	char bitstring[BITSIZE];

	found_record = (bg_record_t *) xmalloc(sizeof(bg_record_t));
	found_record->magic = BLOCK_MAGIC;
	found_record->job_running = NO_JOB_RUNNING;
	found_record->user_name = xstrdup(bg_record->user_name);
	found_record->user_uid = bg_record->user_uid;
	found_record->ba_mp_list = list_create(destroy_ba_mp);

	if (bg_record->ba_mp_list)
		ba_mp = list_peek(bg_record->ba_mp_list);
	if (!ba_mp) {
		if (bg_record->mp_str) {
			hostlist_t hl = hostlist_create(bg_record->mp_str);
			char *host = hostlist_shift(hl);
			hostlist_destroy(hl);
			found_record->mp_str = xstrdup(host);
			free(host);
			error("you gave me a list with no ba_mps using %s",
			      found_record->mp_str);
		} else {
			char tmp_char[SYSTEM_DIMENSIONS+1];
			int dim;
			for (dim = 0; dim < SYSTEM_DIMENSIONS; dim++)
				tmp_char[dim] =
					alpha_num[found_record->start[dim]];
			tmp_char[dim] = '\0';
			found_record->mp_str = xstrdup_printf(
				"%s%s",
				bg_conf->slurm_node_prefix, tmp_char);
			error("you gave me a record with no ba_mps "
			      "and no nodes either using %s",
			      found_record->mp_str);
		}
	} else {
		new_ba_mp = ba_copy_mp(ba_mp);
		/* We need to have this node wrapped in Q to handle
		   wires correctly when creating around the midplane. */
		ba_setup_mp(new_ba_mp, false, true);

		new_ba_mp->used = BA_MP_USED_TRUE;
		list_append(found_record->ba_mp_list, new_ba_mp);
		found_record->mp_count = 1;
		found_record->mp_str = xstrdup_printf(
			"%s%s",
			bg_conf->slurm_node_prefix, new_ba_mp->coord_str);
	}

#ifdef HAVE_BGL
	found_record->node_use = SELECT_COPROCESSOR_MODE;
	found_record->blrtsimage = xstrdup(bg_record->blrtsimage);
#endif
#ifdef HAVE_BG_L_P
	found_record->linuximage = xstrdup(bg_record->linuximage);
	found_record->ramdiskimage = xstrdup(bg_record->ramdiskimage);
#endif
	found_record->mloaderimage = xstrdup(bg_record->mloaderimage);

	process_nodes(found_record, false);

	found_record->conn_type[0] = SELECT_SMALL;

	xassert(bg_conf->cpu_ratio);
	found_record->cpu_cnt = bg_conf->cpu_ratio * size;
	found_record->cnode_cnt = size;

	found_record->ionode_bitmap = bit_copy(ionodes);
	bit_fmt(bitstring, BITSIZE, found_record->ionode_bitmap);
	found_record->ionode_str = xstrdup(bitstring);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		info("made small block of %s[%s]",
		     found_record->mp_str, found_record->ionode_str);

	return found_record;
}
/* Initialize power_save module parameters.
 * Return 0 on valid configuration to run power saving,
 * otherwise log the problem and return -1 */
static int _init_power_config(void)
{
    slurm_ctl_conf_t *conf = slurm_conf_lock();

    last_config     = slurmctld_conf.last_update;
    idle_time       = conf->suspend_time - 1;
    suspend_rate    = conf->suspend_rate;
    resume_timeout  = conf->resume_timeout;
    resume_rate     = conf->resume_rate;
    slurmd_timeout  = conf->slurmd_timeout;
    suspend_timeout = conf->suspend_timeout;
    _clear_power_config();
    if (conf->suspend_program)
        suspend_prog = xstrdup(conf->suspend_program);
    if (conf->resume_program)
        resume_prog = xstrdup(conf->resume_program);
    if (conf->suspend_exc_nodes)
        exc_nodes = xstrdup(conf->suspend_exc_nodes);
    if (conf->suspend_exc_parts)
        exc_parts = xstrdup(conf->suspend_exc_parts);
    slurm_conf_unlock();

    if (idle_time < 0) {    /* not an error */
        debug("power_save module disabled, SuspendTime < 0");
        return -1;
    }
    if (suspend_rate < 0) {
        error("power_save module disabled, SuspendRate < 0");
        return -1;
    }
    if (resume_rate < 0) {
        error("power_save module disabled, ResumeRate < 0");
        return -1;
    }
    if (suspend_prog == NULL) {
        error("power_save module disabled, NULL SuspendProgram");
        return -1;
    } else if (!_valid_prog(suspend_prog)) {
        error("power_save module disabled, invalid SuspendProgram %s",
              suspend_prog);
        return -1;
    }
    if (resume_prog == NULL) {
        error("power_save module disabled, NULL ResumeProgram");
        return -1;
    } else if (!_valid_prog(resume_prog)) {
        error("power_save module disabled, invalid ResumeProgram %s",
              resume_prog);
        return -1;
    }

    if (exc_nodes &&
        (node_name2bitmap(exc_nodes, false, &exc_node_bitmap))) {
        error("power_save module disabled, invalid SuspendExcNodes %s",
              exc_nodes);
        return -1;
    }

    if (exc_parts) {
        char *tmp = NULL, *one_part = NULL, *part_list = NULL;
        struct part_record *part_ptr = NULL;
        int rc = 0;

        part_list = xstrdup(exc_parts);
        one_part = strtok_r(part_list, ",", &tmp);
        while (one_part != NULL) {
            part_ptr = find_part_record(one_part);
            if (!part_ptr) {
                error("power_save module disabled, "
                      "invalid SuspendExcPart %s",
                      one_part);
                rc = -1;
                break;
            }
            if (exc_node_bitmap)
                bit_or(exc_node_bitmap, part_ptr->node_bitmap);
            else
                exc_node_bitmap = bit_copy(part_ptr->node_bitmap);
            one_part = strtok_r(NULL, ",", &tmp);
        }
        xfree(part_list);
        if (rc)
            return rc;
    }

    if (exc_node_bitmap) {
        char *tmp = bitmap2node_name(exc_node_bitmap);
        debug("power_save module, excluded nodes %s", tmp);
        xfree(tmp);
    }

    return 0;
}
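/*
 * The values read above correspond to slurm.conf power-saving parameters
 * (the error paths above name them). An illustrative configuration, with
 * invented values and hypothetical program paths:
 *
 *    SuspendTime=600
 *    SuspendRate=60
 *    ResumeRate=300
 *    ResumeTimeout=120
 *    SuspendProgram=/usr/sbin/slurm_suspend    (hypothetical path)
 *    ResumeProgram=/usr/sbin/slurm_resume      (hypothetical path)
 *    SuspendExcNodes=tux[0-3]
 *    SuspendExcParts=debug
 *
 * A negative SuspendTime disables the module without logging an error,
 * as the idle_time check above shows.
 */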
static char *_will_run_test(uint32_t jobid, time_t start_time,
                            char *node_list,
                            int *err_code, char **err_msg)
{
    struct job_record *job_ptr = NULL;
    struct part_record *part_ptr;
    bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
    bitstr_t *exc_core_bitmap = NULL;
    char *hostlist, *reply_msg = NULL;
    uint32_t min_nodes, max_nodes, req_nodes;
    int rc;
    time_t start_res, orig_start_time;
    List preemptee_candidates;
    bool resv_overlap = false;

    debug2("wiki2: will_run job_id=%u start_time=%u node_list=%s",
           jobid, (uint32_t)start_time, node_list);

    job_ptr = find_job_record(jobid);
    if (job_ptr == NULL) {
        *err_code = -700;
        *err_msg = "No such job";
        error("wiki: Failed to find job %u", jobid);
        return NULL;
    }
    if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) {
        *err_code = -700;
        *err_msg = "WillRun not applicable to non-pending job";
        error("wiki: WillRun on non-pending job %u", jobid);
        return NULL;
    }

    part_ptr = job_ptr->part_ptr;
    if (part_ptr == NULL) {
        *err_code = -700;
        *err_msg = "Job lacks a partition";
        error("wiki: Job %u lacks a partition", jobid);
        return NULL;
    }

    if ((node_list == NULL) || (node_list[0] == '\0')) {
        /* assume all nodes available to job for testing */
        avail_bitmap = bit_copy(avail_node_bitmap);
    } else if (node_name2bitmap(node_list, false, &avail_bitmap) != 0) {
        *err_code = -700;
        *err_msg = "Invalid available nodes value";
        error("wiki: Attempt to set invalid available node "
              "list for job %u, %s", jobid, node_list);
        return NULL;
    }

    /* Enforce reservation: access control, time and nodes */
    start_res = start_time;
    rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap,
                       &exc_core_bitmap, &resv_overlap);
    if (rc != SLURM_SUCCESS) {
        *err_code = -730;
        *err_msg = "Job denied access to reservation";
        error("wiki: reservation access denied for job %u", jobid);
        FREE_NULL_BITMAP(avail_bitmap);
        FREE_NULL_BITMAP(exc_core_bitmap);
        return NULL;
    }
    bit_and(avail_bitmap, resv_bitmap);
    FREE_NULL_BITMAP(resv_bitmap);

    /* Only consider nodes that are not DOWN or DRAINED */
    bit_and(avail_bitmap, avail_node_bitmap);

    /* Consider only nodes in this job's partition */
    if (part_ptr->node_bitmap)
        bit_and(avail_bitmap, part_ptr->node_bitmap);
    else {
        *err_code = -730;
        *err_msg = "Job's partition has no nodes";
        error("wiki: no nodes in partition %s for job %u",
              part_ptr->name, jobid);
        FREE_NULL_BITMAP(avail_bitmap);
        FREE_NULL_BITMAP(exc_core_bitmap);
        return NULL;
    }

    if (job_req_node_filter(job_ptr, avail_bitmap) != SLURM_SUCCESS) {
        /* Job probably has invalid feature list */
        *err_code = -730;
        *err_msg = "Job's required features not available "
                   "on selected nodes";
        error("wiki: job %u not runnable on hosts=%s",
              jobid, node_list);
        FREE_NULL_BITMAP(avail_bitmap);
        FREE_NULL_BITMAP(exc_core_bitmap);
        return NULL;
    }
    if (job_ptr->details->exc_node_bitmap) {
        bit_not(job_ptr->details->exc_node_bitmap);
        bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap);
        bit_not(job_ptr->details->exc_node_bitmap);
    }
    if ((job_ptr->details->req_node_bitmap) &&
        (!bit_super_set(job_ptr->details->req_node_bitmap,
                        avail_bitmap))) {
        *err_code = -730;
        *err_msg = "Job's required nodes not available";
        error("wiki: job %u not runnable on hosts=%s",
              jobid, node_list);
        FREE_NULL_BITMAP(avail_bitmap);
        FREE_NULL_BITMAP(exc_core_bitmap);
        return NULL;
    }

    min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes);
    if (job_ptr->details->max_nodes == 0)
        max_nodes = part_ptr->max_nodes;
    else
        max_nodes = MIN(job_ptr->details->max_nodes,
                        part_ptr->max_nodes);
    max_nodes = MIN(max_nodes, 500000);    /* prevent overflows */
    if (job_ptr->details->max_nodes)
        req_nodes = max_nodes;
    else
        req_nodes = min_nodes;
    if (min_nodes > max_nodes) {
        /* job's min_nodes exceeds the partition's max_nodes */
        *err_code = -730;
        *err_msg = "Job's min_nodes > max_nodes";
        error("wiki: job %u not runnable on hosts=%s",
              jobid, node_list);
        FREE_NULL_BITMAP(avail_bitmap);
        FREE_NULL_BITMAP(exc_core_bitmap);
        return NULL;
    }

    preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);
    orig_start_time = job_ptr->start_time;
    rc = select_g_job_test(job_ptr, avail_bitmap,
                           min_nodes, max_nodes, req_nodes,
                           SELECT_MODE_WILL_RUN,
                           preemptee_candidates, NULL, exc_core_bitmap);
    FREE_NULL_LIST(preemptee_candidates);

    if (rc == SLURM_SUCCESS) {
        char tmp_str[128];
        uint32_t proc_cnt = 0;

        *err_code = 0;
        xstrcat(reply_msg, "STARTINFO=");
#ifdef HAVE_BG
        select_g_select_jobinfo_get(job_ptr->select_jobinfo,
                                    SELECT_JOBDATA_NODE_CNT,
                                    &proc_cnt);
#else
        proc_cnt = job_ptr->total_cpus;
#endif
        snprintf(tmp_str, sizeof(tmp_str), "%u:%u@%u,", jobid,
                 proc_cnt, (uint32_t) job_ptr->start_time);
        xstrcat(reply_msg, tmp_str);
        hostlist = bitmap2node_name(avail_bitmap);
        xstrcat(reply_msg, hostlist);
        xfree(hostlist);
    } else {
        xstrcat(reply_msg, "Jobs not runnable on selected nodes");
        error("wiki: jobs not runnable on nodes");
    }

    /* Restore pending job's expected start time */
    job_ptr->start_time = orig_start_time;
    FREE_NULL_BITMAP(avail_bitmap);
    FREE_NULL_BITMAP(exc_core_bitmap);
    return reply_msg;
}
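/*
 * On success the reply built above has the shape
 * "STARTINFO=<jobid>:<proc_cnt>@<start_time>,<hostlist>" (on BlueGene
 * proc_cnt is a node count rather than a CPU count). A minimal sketch of
 * that formatting with invented values, for illustration only:
 */
#include <stdio.h>

static void show_startinfo_example(void)
{
    char reply[128];

    /* invented values: job 1234, 16 processors, start at epoch 1393000000 */
    snprintf(reply, sizeof(reply), "STARTINFO=%u:%u@%u,%s",
             1234u, 16u, 1393000000u, "tux[0-3]");
    puts(reply);    /* prints STARTINFO=1234:16@1393000000,tux[0-3] */
}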
uint32_t powercap_get_node_bitmap_maxwatts_dvfs(bitstr_t *idle_bitmap,
            bitstr_t *select_bitmap, uint32_t *max_watts_dvfs,
            int *allowed_freqs, uint32_t num_cpus)
{
    uint32_t max_watts = 0, tmp_max_watts = 0, val = 0;
    uint32_t *tmp_max_watts_dvfs = NULL;
    struct node_record *node_ptr;
    int i, p;
    char ename[128], keyname[128];
    bitstr_t *tmp_bitmap = NULL;
    uint32_t data[5], core_data[4];

    if (!_powercap_enabled())
        return 0;
    if (max_watts_dvfs != NULL) {
        tmp_max_watts_dvfs =
            xmalloc(sizeof(uint32_t) * (allowed_freqs[0] + 1));
    }

    /* if no input bitmap, consider the current idle nodes
     * bitmap as the input bitmap tagging nodes to consider
     * as idle while computing the max watts of the cluster */
    if (idle_bitmap == NULL && select_bitmap == NULL) {
        tmp_bitmap = bit_copy(idle_node_bitmap);
        idle_bitmap = tmp_bitmap;
        select_bitmap = tmp_bitmap;
    }

    for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
         i++, node_ptr++) {
        if (bit_test(idle_bitmap, i)) {
            /* idle nodes, 2 cases: power save or not */
            if (bit_test(power_node_bitmap, i)) {
                layouts_entity_pullget_kv(L_NAME,
                        node_ptr->name, L_NODE_SAVE,
                        &val, L_T_UINT32);
            } else {
                layouts_entity_pullget_kv(L_NAME,
                        node_ptr->name, L_NODE_IDLE,
                        &val, L_T_UINT32);
            }
        } else if (bit_test(select_bitmap, i)) {
            layouts_entity_get_mkv(L_NAME, node_ptr->name,
                "IdleWatts,MaxWatts,CoresCount,LastCore,CurrentPower",
                data, (sizeof(uint32_t) * 5), L_T_UINT32);

            /* tmp_max_watts = IdleWatts - cpus*IdleCoreWatts
             *                 + cpus*MaxCoreWatts */
            sprintf(ename, "virtualcore%u", data[3]);
            if (num_cpus == 0)
                num_cpus = data[2];
            layouts_entity_get_mkv(L_NAME, ename,
                "IdleCoreWatts,MaxCoreWatts", core_data,
                (sizeof(uint32_t) * 2), L_T_UINT32);
            if (data[4] == 0) {
                tmp_max_watts += data[0] -
                    num_cpus * core_data[0] +
                    num_cpus * core_data[1];
            } else if (data[4] > 0) {
                tmp_max_watts += data[4] -
                    num_cpus * core_data[0] +
                    num_cpus * core_data[1];
            } else if (num_cpus == data[2])
                tmp_max_watts += data[1];

            if (!tmp_max_watts_dvfs)
                goto skip_dvfs;
            for (p = 1; p < (allowed_freqs[0] + 1); p++) {
                sprintf(keyname,
                        "IdleCoreWatts,MaxCoreWatts,"
                        "Cpufreq%dWatts,CurrentCorePower",
                        allowed_freqs[p]);
                layouts_entity_get_mkv(L_NAME, ename, keyname,
                        core_data, (sizeof(uint32_t) * 4),
                        L_T_UINT32);
                if (num_cpus == data[2]) {
                    tmp_max_watts_dvfs[p] +=
                        num_cpus * core_data[2];
                } else {
                    if (data[4] == 0) {
                        tmp_max_watts_dvfs[p] +=
                            data[0] -
                            num_cpus * core_data[0] +
                            num_cpus * core_data[2];
                    } else {
                        tmp_max_watts_dvfs[p] +=
                            data[4] -
                            num_cpus * core_data[0] +
                            num_cpus * core_data[2];
                    }
                }
            }
skip_dvfs:  ;
        } else {
            /* non-idle nodes, 2 cases: down or not */
            if (!bit_test(up_node_bitmap, i)) {
                layouts_entity_pullget_kv(L_NAME,
                        node_ptr->name, L_NODE_DOWN,
                        &val, L_T_UINT32);
            } else {
                layouts_entity_pullget_kv(L_NAME,
                        node_ptr->name, L_NODE_CUR,
                        &val, L_T_UINT32);
            }
        }
        max_watts += val;
        val = 0;
    }

    if (max_watts_dvfs) {
        for (p = 1; p < allowed_freqs[0] + 1; p++) {
            max_watts_dvfs[p] = max_watts + tmp_max_watts_dvfs[p];
        }
        xfree(tmp_max_watts_dvfs);
    }
    max_watts += tmp_max_watts;

    if (tmp_bitmap)
        bit_free(tmp_bitmap);

    return max_watts;
}
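/*
 * The per-node estimate above reduces to: start from the node's idle (or
 * currently measured) consumption, remove the idle draw of the CPUs the
 * job would use, and add their full-power draw. A standalone sketch of
 * that arithmetic, with illustrative names rather than the layouts API,
 * and assuming the inputs are consistent (idle core draw not exceeding
 * the base figure, as the layouts data is expected to guarantee):
 */
#include <stdint.h>

/* base - ncpus*idle_core_watts + ncpus*max_core_watts */
static uint32_t node_max_watts_est(uint32_t base_watts, uint32_t ncpus,
                                   uint32_t idle_core_watts,
                                   uint32_t max_core_watts)
{
    return base_watts - ncpus * idle_core_watts + ncpus * max_core_watts;
}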
static void jlink_execute_scan(struct jtag_command *cmd)
{
    DEBUG_JTAG_IO("%s type:%d",
        cmd->cmd.scan->ir_scan ? "IRSCAN" : "DRSCAN",
        jtag_scan_type(cmd->cmd.scan));

    /* Make sure there are no trailing fields with num_bits == 0,
     * or the logic below will fail. */
    while (cmd->cmd.scan->num_fields > 0
            && cmd->cmd.scan->fields[cmd->cmd.scan->num_fields - 1].num_bits == 0) {
        cmd->cmd.scan->num_fields--;
        LOG_DEBUG("discarding trailing empty field");
    }

    if (cmd->cmd.scan->num_fields == 0) {
        LOG_DEBUG("empty scan, doing nothing");
        return;
    }

    if (cmd->cmd.scan->ir_scan) {
        if (tap_get_state() != TAP_IRSHIFT) {
            jlink_end_state(TAP_IRSHIFT);
            jlink_state_move();
        }
    } else {
        if (tap_get_state() != TAP_DRSHIFT) {
            jlink_end_state(TAP_DRSHIFT);
            jlink_state_move();
        }
    }

    jlink_end_state(cmd->cmd.scan->end_state);

    struct scan_field *field = cmd->cmd.scan->fields;
    unsigned scan_size = 0;

    for (int i = 0; i < cmd->cmd.scan->num_fields; i++, field++) {
        scan_size += field->num_bits;
        DEBUG_JTAG_IO("%s%s field %d/%d %d bits",
            field->in_value ? "in" : "",
            field->out_value ? "out" : "",
            i, cmd->cmd.scan->num_fields, field->num_bits);

        if (i == cmd->cmd.scan->num_fields - 1 &&
                tap_get_state() != tap_get_end_state()) {
            /* Last field, and we're leaving IRSHIFT/DRSHIFT. Clock last
             * bit during tap movement. This last field can't have length
             * zero, it was checked above. */
            jlink_clock_data(field->out_value, 0, NULL, 0,
                             field->in_value, 0, field->num_bits - 1);
            uint8_t last_bit = 0;
            if (field->out_value)
                bit_copy(&last_bit, 0, field->out_value,
                         field->num_bits - 1, 1);
            uint8_t tms_bits = 0x01;
            jlink_clock_data(&last_bit, 0, &tms_bits, 0,
                             field->in_value, field->num_bits - 1, 1);
            tap_set_state(tap_state_transition(tap_get_state(), 1));
            jlink_clock_data(&last_bit, 0, &tms_bits, 1, NULL, 0, 1);
            tap_set_state(tap_state_transition(tap_get_state(), 0));
        } else
            jlink_clock_data(field->out_value, 0, NULL, 0,
                             field->in_value, 0, field->num_bits);
    }

    if (tap_get_state() != tap_get_end_state()) {
        jlink_end_state(tap_get_end_state());
        jlink_state_move();
    }

    DEBUG_JTAG_IO("%s scan, %i bits, end in %s",
        (cmd->cmd.scan->ir_scan) ? "IR" : "DR", scan_size,
        tap_state_name(tap_get_end_state()));
}
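/*
 * Worked example of the tail handling above, for a final 5-bit field:
 * bits 0..3 are clocked with TMS held low, so the TAP stays in
 * DRSHIFT/IRSHIFT; bit 4 is then clocked with TMS high, so the same edge
 * that shifts the last bit also moves the TAP to Exit1; one further cycle
 * with TMS low parks the TAP in Pause before the closing state move to
 * end_state.
 */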
extern job_resources_t *copy_job_resources(job_resources_t *job_resrcs_ptr)
{
    int i, sock_inx = 0;
    job_resources_t *new_layout = xmalloc(sizeof(struct job_resources));

    xassert(job_resrcs_ptr);
    new_layout->nhosts = job_resrcs_ptr->nhosts;
    new_layout->ncpus = job_resrcs_ptr->ncpus;
    new_layout->node_req = job_resrcs_ptr->node_req;
    if (job_resrcs_ptr->core_bitmap) {
        new_layout->core_bitmap =
            bit_copy(job_resrcs_ptr->core_bitmap);
    }
    if (job_resrcs_ptr->core_bitmap_used) {
        new_layout->core_bitmap_used =
            bit_copy(job_resrcs_ptr->core_bitmap_used);
    }
    if (job_resrcs_ptr->node_bitmap) {
        new_layout->node_bitmap =
            bit_copy(job_resrcs_ptr->node_bitmap);
    }

    new_layout->cpu_array_cnt = job_resrcs_ptr->cpu_array_cnt;
    if (job_resrcs_ptr->cpu_array_reps &&
        job_resrcs_ptr->cpu_array_cnt) {
        new_layout->cpu_array_reps =
            xmalloc(sizeof(uint32_t) * job_resrcs_ptr->cpu_array_cnt);
        memcpy(new_layout->cpu_array_reps,
               job_resrcs_ptr->cpu_array_reps,
               (sizeof(uint32_t) * job_resrcs_ptr->cpu_array_cnt));
    }
    if (job_resrcs_ptr->cpu_array_value &&
        job_resrcs_ptr->cpu_array_cnt) {
        new_layout->cpu_array_value =
            xmalloc(sizeof(uint16_t) * job_resrcs_ptr->cpu_array_cnt);
        memcpy(new_layout->cpu_array_value,
               job_resrcs_ptr->cpu_array_value,
               (sizeof(uint16_t) * job_resrcs_ptr->cpu_array_cnt));
    }

    if (job_resrcs_ptr->cpus) {
        new_layout->cpus = xmalloc(sizeof(uint16_t) *
                                   job_resrcs_ptr->nhosts);
        memcpy(new_layout->cpus, job_resrcs_ptr->cpus,
               (sizeof(uint16_t) * job_resrcs_ptr->nhosts));
    }
    if (job_resrcs_ptr->cpus_used) {
        new_layout->cpus_used = xmalloc(sizeof(uint16_t) *
                                        job_resrcs_ptr->nhosts);
        memcpy(new_layout->cpus_used, job_resrcs_ptr->cpus_used,
               (sizeof(uint16_t) * job_resrcs_ptr->nhosts));
    }

    if (job_resrcs_ptr->memory_allocated) {
        new_layout->memory_allocated = xmalloc(sizeof(uint32_t) *
                                               new_layout->nhosts);
        memcpy(new_layout->memory_allocated,
               job_resrcs_ptr->memory_allocated,
               (sizeof(uint32_t) * job_resrcs_ptr->nhosts));
    }
    if (job_resrcs_ptr->memory_used) {
        new_layout->memory_used = xmalloc(sizeof(uint32_t) *
                                          new_layout->nhosts);
        memcpy(new_layout->memory_used,
               job_resrcs_ptr->memory_used,
               (sizeof(uint32_t) * job_resrcs_ptr->nhosts));
    }

    /* Copy sockets_per_node, cores_per_socket and sock_core_rep_count */
    new_layout->sockets_per_node = xmalloc(sizeof(uint16_t) *
                                           new_layout->nhosts);
    new_layout->cores_per_socket = xmalloc(sizeof(uint16_t) *
                                           new_layout->nhosts);
    new_layout->sock_core_rep_count = xmalloc(sizeof(uint32_t) *
                                              new_layout->nhosts);
    for (i = 0; i < new_layout->nhosts; i++) {
        if (job_resrcs_ptr->sock_core_rep_count[i] == 0) {
            error("copy_job_resources: sock_core_rep_count=0");
            break;
        }
        sock_inx += job_resrcs_ptr->sock_core_rep_count[i];
        if (sock_inx >= job_resrcs_ptr->nhosts) {
            i++;
            break;
        }
    }
    memcpy(new_layout->sockets_per_node,
           job_resrcs_ptr->sockets_per_node, (sizeof(uint16_t) * i));
    memcpy(new_layout->cores_per_socket,
           job_resrcs_ptr->cores_per_socket, (sizeof(uint16_t) * i));
    memcpy(new_layout->sock_core_rep_count,
           job_resrcs_ptr->sock_core_rep_count,
           (sizeof(uint32_t) * i));

    return new_layout;
}
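/*
 * sockets_per_node, cores_per_socket and sock_core_rep_count form a
 * run-length encoding of per-node topology: entry k describes
 * sock_core_rep_count[k] consecutive nodes, which is why the copy loop
 * above only needs the first i entries. A hypothetical decode loop,
 * sketched to illustrate the layout (not part of the SLURM API):
 */
#include <stdint.h>
#include <stdio.h>

static void print_topology(const uint16_t *sockets_per_node,
                           const uint16_t *cores_per_socket,
                           const uint32_t *sock_core_rep_count,
                           uint32_t nhosts)
{
    uint32_t node = 0, k = 0, n;

    while (node < nhosts) {
        /* expand one run-length group over its nodes */
        for (n = 0; (n < sock_core_rep_count[k]) && (node < nhosts);
             n++, node++) {
            printf("node %u: %u sockets x %u cores\n",
                   node, sockets_per_node[k], cores_per_socket[k]);
        }
        k++;
    }
}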
/* Merge MPS records back to original list, updating and reordering
 * as needed */
static void _merge_lists(List gres_conf_list, List gpu_conf_list,
                         List mps_conf_list)
{
    ListIterator gpu_itr, mps_itr;
    gres_slurmd_conf_t *gpu_record, *mps_record;

    /*
     * If gres/mps has a Count but no File specification and there is
     * more than one gres/gpu record, then distribute the gres/mps Count
     * evenly over all gres/gpu file records
     */
    if ((list_count(mps_conf_list) == 1) &&
        (list_count(gpu_conf_list) > 1)) {
        mps_record = list_peek(mps_conf_list);
        if (!mps_record->file) {
            _distribute_count(gres_conf_list, gpu_conf_list,
                              mps_record->count);
            list_flush(mps_conf_list);
            return;
        }
    }

    /* Add MPS records, matching File ordering to that of GPU records */
    gpu_itr = list_iterator_create(gpu_conf_list);
    while ((gpu_record = list_next(gpu_itr))) {
        mps_itr = list_iterator_create(mps_conf_list);
        while ((mps_record = list_next(mps_itr))) {
            if (!xstrcmp(gpu_record->file, mps_record->file)) {
                /* Copy gres/gpu Type & CPU info to gres/mps */
                if (gpu_record->type_name) {
                    mps_record->config_flags |=
                        GRES_CONF_HAS_TYPE;
                }
                if (gpu_record->cpus) {
                    xfree(mps_record->cpus);
                    mps_record->cpus =
                        xstrdup(gpu_record->cpus);
                }
                if (gpu_record->cpus_bitmap) {
                    mps_record->cpu_cnt =
                        gpu_record->cpu_cnt;
                    FREE_NULL_BITMAP(
                        mps_record->cpus_bitmap);
                    mps_record->cpus_bitmap =
                        bit_copy(gpu_record->cpus_bitmap);
                }
                xfree(mps_record->type_name);
                mps_record->type_name =
                    xstrdup(gpu_record->type_name);
                list_append(gres_conf_list, mps_record);
                (void) list_remove(mps_itr);
                break;
            }
        }
        list_iterator_destroy(mps_itr);
        if (!mps_record) {
            /* Add a gres/mps record to match the gres/gpu record */
            mps_record = xmalloc(sizeof(gres_slurmd_conf_t));
            mps_record->config_flags = gpu_record->config_flags;
            mps_record->count = 0;
            mps_record->cpu_cnt = gpu_record->cpu_cnt;
            mps_record->cpus = xstrdup(gpu_record->cpus);
            if (gpu_record->cpus_bitmap) {
                mps_record->cpus_bitmap =
                    bit_copy(gpu_record->cpus_bitmap);
            }
            mps_record->file = xstrdup(gpu_record->file);
            mps_record->name = xstrdup("mps");
            mps_record->plugin_id = gres_plugin_build_id("mps");
            mps_record->type_name = xstrdup(gpu_record->type_name);
            list_append(gres_conf_list, mps_record);
        }
        list_append(gres_conf_list, gpu_record);
        (void) list_remove(gpu_itr);
    }
    list_iterator_destroy(gpu_itr);

    /* Remove any remaining MPS records (no matching File) */
    mps_itr = list_iterator_create(mps_conf_list);
    while ((mps_record = list_next(mps_itr))) {
        error("%s: Discarding gres/mps configuration (File=%s) without matching gres/gpu record",
              plugin_name, mps_record->file);
        (void) list_delete_item(mps_itr);
    }
    list_iterator_destroy(mps_itr);
}
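/*
 * Resulting order, sketched for two GPUs with matching MPS records (File
 * values invented for illustration): the merged list interleaves the
 * pairs as mps(/dev/nvidia0), gpu(/dev/nvidia0), mps(/dev/nvidia1),
 * gpu(/dev/nvidia1), so each gres/mps entry directly precedes the
 * gres/gpu entry whose device it shares.
 */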