/*
 * assign_front_end - pick the front end node on which a job will be started
 * job_ptr IN - job needing a front end node (access control lists are tested)
 * RET the selected front end node, or NULL if none could be selected
 *
 * Selection rules:
 *  - A job with no batch_host and batch_flag == 0 whose alloc_node names a
 *    known front end must use that front end; if it is unusable the job gets
 *    nothing (NULL) rather than some other node.
 *  - A job with batch_host set must use the front end of that name.
 *  - Otherwise the usable front end with the fewest running jobs is taken.
 * On success the node's base state becomes ALLOCATED (state flags preserved)
 * and its running job count is incremented.
 */
extern front_end_record_t *assign_front_end(struct job_record *job_ptr)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *candidate = NULL, *chosen = NULL;
	uint32_t keep_flags;
	int inx;

	/* The submit host is only looked up for jobs without a preset
	 * batch host and with batch_flag clear */
	if (!job_ptr->batch_host && (job_ptr->batch_flag == 0))
		candidate = find_front_end_record(job_ptr->alloc_node);

	if (candidate) {
		/* Use submit host for interactive job */
		if (IS_NODE_DOWN(candidate) ||
		    IS_NODE_DRAIN(candidate) ||
		    IS_NODE_NO_RESPOND(candidate) ||
		    !_front_end_access(candidate, job_ptr)) {
			info("%s: front-end node %s not available for job %u",
			     __func__, job_ptr->alloc_node, job_ptr->job_id);
			return NULL;
		}
		chosen = candidate;
	} else {
		for (inx = 0; inx < front_end_node_cnt; inx++) {
			candidate = front_end_nodes + inx;
			if (job_ptr->batch_host) {
				/* Only the named front end is acceptable */
				if (xstrcmp(job_ptr->batch_host,
					    candidate->name) != 0)
					continue;
				/* Named node found but access denied:
				 * stop searching */
				if (!_front_end_access(candidate, job_ptr))
					break;
			} else if (IS_NODE_DOWN(candidate) ||
				   IS_NODE_DRAIN(candidate) ||
				   IS_NODE_NO_RESPOND(candidate) ||
				   !_front_end_access(candidate, job_ptr)) {
				/* Not usable by this job */
				continue;
			}
			/* Keep whichever acceptable node runs fewest jobs */
			if (!chosen ||
			    (candidate->job_cnt_run < chosen->job_cnt_run))
				chosen = candidate;
		}
	}

	if (!chosen) {
		if (job_ptr->batch_host) {
			/* A specific front end was required */
			error("assign_front_end: front end node %s not found",
			      job_ptr->batch_host);
		} else {
			/* Any usable front end would have done */
			error("assign_front_end: no available front end nodes found");
		}
		return NULL;
	}

	keep_flags = chosen->node_state & NODE_STATE_FLAGS;
	chosen->node_state = NODE_STATE_ALLOCATED | keep_flags;
	chosen->job_cnt_run++;
	return chosen;
#else
	return NULL;
#endif
}
/*
 * _get_node_state - map a node record's state to a short display string
 * node_ptr IN - node whose state is to be described
 * RET pointer to a string literal (must not be freed or modified):
 *     "Draining", "Busy", "Down", "Running", "Idle" or "Unknown"
 *
 * The configured select plugin type is fetched once, on the first call, and
 * cached in function-static storage. With "select/linear" (compared case
 * insensitively) an allocated node is reported as "Busy", otherwise as
 * "Running".
 */
static char *	_get_node_state(struct node_record *node_ptr)
{
	static bool select_type_known = false;
	static bool whole_node_alloc;

	if (!select_type_known) {
		char *plugin_name = slurm_get_select_type();

		whole_node_alloc = plugin_name &&
				   (strcasecmp(plugin_name,
					       "select/linear") == 0);
		xfree(plugin_name);
		select_type_known = true;
	}

	/* Tests are ordered: the first matching state wins */
	if (IS_NODE_DRAIN(node_ptr) || IS_NODE_FAIL(node_ptr))
		return "Draining";
	if (IS_NODE_COMPLETING(node_ptr))
		return "Busy";
	if (IS_NODE_DOWN(node_ptr))
		return "Down";
	if (IS_NODE_ALLOCATED(node_ptr))
		return whole_node_alloc ? "Busy" : "Running";
	if (IS_NODE_IDLE(node_ptr))
		return "Idle";

	return "Unknown";
}
/*
 * assign_front_end - pick the front end node on which a job will be started
 * RET the selected front end node. If no front end is usable fatal() is
 *     called; NULL is returned when built without HAVE_FRONT_END (or if
 *     fatal() returns).
 *
 * Front ends are tried in rotating order, starting with the entry after the
 * one examined last (remembered across calls in function-static storage).
 * Nodes that are DOWN, DRAIN or NO_RESPOND are skipped. The selected node's
 * base state becomes ALLOCATED (state flags preserved) and its running job
 * count is incremented.
 */
extern front_end_record_t *assign_front_end(void)
{
#ifdef HAVE_FRONT_END
	static int prev_inx = -1;
	front_end_record_t *fe_ptr;
	uint16_t keep_flags;
	int tries;

	for (tries = 0; tries < front_end_node_cnt; tries++) {
		prev_inx = (prev_inx + 1) % front_end_node_cnt;
		fe_ptr = &front_end_nodes[prev_inx];
		if (!IS_NODE_DOWN(fe_ptr) &&
		    !IS_NODE_DRAIN(fe_ptr) &&
		    !IS_NODE_NO_RESPOND(fe_ptr)) {
			keep_flags = fe_ptr->node_state & NODE_STATE_FLAGS;
			fe_ptr->node_state = NODE_STATE_ALLOCATED | keep_flags;
			fe_ptr->job_cnt_run++;
			return fe_ptr;
		}
	}
	fatal("assign_front_end: no available front end nodes found");
#endif
	return NULL;
}
/*
 * node_already_down - report whether a named node is already DRAIN or DOWN
 * node_name IN - name of the node to look up
 * RET 2 if the node has the DRAIN flag set (tested first),
 *     1 if the node is DOWN,
 *     0 otherwise, including when no node of that name is found
 */
extern int node_already_down(char *node_name)
{
	struct node_record *node_ptr = find_node_record(node_name);

	if (!node_ptr)
		return 0;
	if (IS_NODE_DRAIN(node_ptr))
		return 2;
	if (IS_NODE_DOWN(node_ptr))
		return 1;
	return 0;
}
/*
 * avail_front_end - test if any front end node is available for starting a
 *	job
 * RET true if at least one front end node is neither DOWN, DRAIN nor
 *     NO_RESPOND; always true when built without HAVE_FRONT_END
 */
extern bool avail_front_end(void)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *fe_ptr;
	int inx;

	for (inx = 0; inx < front_end_node_cnt; inx++) {
		fe_ptr = &front_end_nodes[inx];
		if (!IS_NODE_DOWN(fe_ptr) &&
		    !IS_NODE_DRAIN(fe_ptr) &&
		    !IS_NODE_NO_RESPOND(fe_ptr))
			return true;
	}
	return false;
#else
	return true;
#endif
}
/*
 * assign_front_end - pick the front end node on which a job will be started
 * job_ptr IN - job needing a front end node (access control lists are tested)
 * RET the selected front end node, or NULL if none could be selected
 *
 * If the job names a batch_host, only the front end of that name is
 * acceptable (and the job must pass its access check). Otherwise the usable
 * front end (not DOWN, DRAIN or NO_RESPOND, access permitted) running the
 * fewest jobs is taken. On success the node's base state becomes ALLOCATED
 * (state flags preserved) and its running job count is incremented.
 */
extern front_end_record_t *assign_front_end(struct job_record *job_ptr)
{
#ifdef HAVE_FRONT_END
	front_end_record_t *candidate, *chosen = NULL;
	uint32_t keep_flags;
	int inx;

	for (inx = 0; inx < front_end_node_cnt; inx++) {
		candidate = front_end_nodes + inx;
		if (job_ptr->batch_host) {
			/* Only the named front end is acceptable */
			if (strcmp(job_ptr->batch_host, candidate->name) != 0)
				continue;
			/* Named node found but access denied: stop */
			if (!_front_end_access(candidate, job_ptr))
				break;
		} else if (IS_NODE_DOWN(candidate) ||
			   IS_NODE_DRAIN(candidate) ||
			   IS_NODE_NO_RESPOND(candidate) ||
			   !_front_end_access(candidate, job_ptr)) {
			/* Not usable by this job */
			continue;
		}
		/* Keep whichever acceptable node runs the fewest jobs */
		if (!chosen || (candidate->job_cnt_run < chosen->job_cnt_run))
			chosen = candidate;
	}

	if (!chosen) {
		if (job_ptr->batch_host) {
			/* A specific front end was required */
			error("assign_front_end: front end node %s not found",
			      job_ptr->batch_host);
		} else {
			/* Any usable front end would have done */
			error("assign_front_end: no available front end nodes found");
		}
		return NULL;
	}

	keep_flags = chosen->node_state & NODE_STATE_FLAGS;
	chosen->node_state = NODE_STATE_ALLOCATED | keep_flags;
	chosen->job_cnt_run++;
	return chosen;
#else
	return NULL;
#endif
}
/*
 * assign_front_end - pick the front end node on which a job will be started
 * job_ptr IN - job needing a front end node (access control lists are tested)
 * RET the selected front end node, or NULL if none could be selected
 *
 * Front ends are tried in rotating order, starting with the entry after the
 * one examined last (remembered across calls in function-static storage).
 * If the job names a batch_host, only the front end of that name is
 * acceptable; otherwise the first node that is not DOWN, DRAIN or NO_RESPOND
 * and that the job may access is taken. On success the node's base state
 * becomes ALLOCATED (state flags preserved) and its running job count is
 * incremented.
 */
extern front_end_record_t *assign_front_end(struct job_record *job_ptr)
{
#ifdef HAVE_FRONT_END
	static int prev_inx = -1;
	front_end_record_t *fe_ptr;
	uint16_t keep_flags;
	int tries;

	for (tries = 0; tries < front_end_node_cnt; tries++) {
		prev_inx = (prev_inx + 1) % front_end_node_cnt;
		fe_ptr = &front_end_nodes[prev_inx];
		if (job_ptr->batch_host) {
			/* Only the named front end is acceptable */
			if (strcmp(job_ptr->batch_host, fe_ptr->name) != 0)
				continue;
			/* Named node found but access denied: stop */
			if (!_front_end_access(fe_ptr, job_ptr))
				break;
		} else if (IS_NODE_DOWN(fe_ptr) ||
			   IS_NODE_DRAIN(fe_ptr) ||
			   IS_NODE_NO_RESPOND(fe_ptr) ||
			   !_front_end_access(fe_ptr, job_ptr)) {
			/* Not usable by this job */
			continue;
		}

		keep_flags = fe_ptr->node_state & NODE_STATE_FLAGS;
		fe_ptr->node_state = NODE_STATE_ALLOCATED | keep_flags;
		fe_ptr->job_cnt_run++;
		return fe_ptr;
	}

	if (job_ptr->batch_host) {
		/* A specific front end was required */
		error("assign_front_end: front end node %s not found",
		      job_ptr->batch_host);
	} else {
		/* Any usable front end would have done */
		error("assign_front_end: no available front end nodes found");
	}
#endif
	return NULL;
}
/*
 * _do_power_work - perform any power state change work for nodes
 * now IN - current time, used for all interval and timeout comparisons
 *
 * One pass over the node table decides which powered-down nodes must be
 * resumed and which idle/down nodes may be suspended, updates each node's
 * state flags and the shared bitmaps accordingly, then hands the resulting
 * node name lists to _do_resume() / _do_suspend().
 *
 * Relies on variables declared outside this function (suspend_cnt,
 * resume_cnt and their _f counterparts, suspend_rate, resume_rate,
 * last_suspend, suspend_timeout, resume_timeout, idle_time and the
 * suspend/resume/exc/power/avail node bitmaps).
 */
static void _do_power_work(time_t now)
{
	static time_t last_log = 0, last_work_scan = 0;
	/* NOTE(review): wake_cnt and sleep_cnt are incremented below but
	 * never read in this function */
	int i, wake_cnt = 0, sleep_cnt = 0, susp_total = 0;
	time_t delta_t;
	uint32_t susp_state;
	bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL;
	struct node_record *node_ptr;
	bool run_suspend = false;

	/* Set limit on counts of nodes to have state changed.
	 * The running counts decay linearly with the time since the last
	 * scan and are zeroed once 60 or more seconds have passed. */
	delta_t = now - last_work_scan;
	if (delta_t >= 60) {
		suspend_cnt_f = 0.0;
		resume_cnt_f = 0.0;
	} else {
		float rate = (60 - delta_t) / 60.0;
		suspend_cnt_f *= rate;
		resume_cnt_f *= rate;
	}
	/* Adding 0.5 rounds to nearest, presuming suspend_cnt/resume_cnt
	 * are integer typed (declared elsewhere) */
	suspend_cnt = (suspend_cnt_f + 0.5);
	resume_cnt = (resume_cnt_f + 0.5);

	if (now > (last_suspend + suspend_timeout)) {
		/* ready to start another round of node suspends */
		run_suspend = true;
		if (last_suspend) {
			/* Forget which nodes were suspended/resumed in the
			 * previous round */
			bit_nclear(suspend_node_bitmap, 0,
				   (node_record_count - 1));
			bit_nclear(resume_node_bitmap, 0,
				   (node_record_count - 1));
			last_suspend = (time_t) 0;
		}
	}

	last_work_scan = now;

	/* Build bitmaps identifying each node which should change state */
	for (i = 0, node_ptr = node_record_table_ptr;
	     i < node_record_count; i++, node_ptr++) {
		susp_state = IS_NODE_POWER_SAVE(node_ptr);

		if (susp_state)
			susp_total++;

		/* Resume nodes as appropriate: the node is in power save,
		 * the resume rate limit (0 means unlimited) is not reached,
		 * the node was not suspended in the current round, and it is
		 * either allocated or was idle more recently than
		 * idle_time ago */
		if (susp_state &&
		    ((resume_rate == 0) || (resume_cnt < resume_rate)) &&
		    (bit_test(suspend_node_bitmap, i) == 0) &&
		    (IS_NODE_ALLOCATED(node_ptr) ||
		     (node_ptr->last_idle > (now - idle_time)))) {
			if (wake_node_bitmap == NULL) {
				wake_node_bitmap =
					bit_alloc(node_record_count);
			}
			wake_cnt++;
			resume_cnt++;
			resume_cnt_f++;
			node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
			node_ptr->node_state |= NODE_STATE_POWER_UP;
			/* Flagged not responding and removed from the
			 * available bitmap while powering up */
			node_ptr->node_state |= NODE_STATE_NO_RESPOND;
			bit_clear(power_node_bitmap, i);
			bit_clear(avail_node_bitmap, i);
			/* Push last_response resume_timeout seconds into
			 * the future */
			node_ptr->last_response = now + resume_timeout;
			bit_set(wake_node_bitmap, i);
			bit_set(resume_node_bitmap, i);
		}

		/* Suspend nodes as appropriate: only during a suspend round,
		 * for nodes not already in power save, below the suspend
		 * rate limit (0 means unlimited), idle or down, with no
		 * suspended jobs, not completing or powering up, idle for
		 * longer than idle_time, and not in the excluded bitmap */
		if (run_suspend &&
		    (susp_state == 0) &&
		    ((suspend_rate == 0) || (suspend_cnt < suspend_rate)) 
		    && (IS_NODE_IDLE(node_ptr) || IS_NODE_DOWN(node_ptr)) &&
		    (node_ptr->sus_job_cnt == 0) &&
		    (!IS_NODE_COMPLETING(node_ptr)) &&
		    (!IS_NODE_POWER_UP(node_ptr)) &&
		    (node_ptr->last_idle != 0) &&
		    (node_ptr->last_idle < (now - idle_time)) &&
		    ((exc_node_bitmap == NULL) ||
		     (bit_test(exc_node_bitmap, i) == 0))) {
			if (sleep_node_bitmap == NULL) {
				sleep_node_bitmap =
					bit_alloc(node_record_count);
			}
			sleep_cnt++;
			suspend_cnt++;
			suspend_cnt_f++;
			node_ptr->node_state |= NODE_STATE_POWER_SAVE;
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			/* A powered-down node stays available unless it is
			 * down or draining */
			if (!IS_NODE_DOWN(node_ptr) &&
			    !IS_NODE_DRAIN(node_ptr))
				bit_set(avail_node_bitmap, i);
			bit_set(power_node_bitmap, i);
			bit_set(sleep_node_bitmap, i);
			bit_set(suspend_node_bitmap, i);
			/* Marks the start of the suspend_timeout window
			 * tested at the top of this function */
			last_suspend = now;
		}
	}

	/* Log the power save node count at most once per 600 seconds */
	if (((now - last_log) > 600) && (susp_total > 0)) {
		info("Power save mode: %d nodes", susp_total);
		last_log = now;
	}

	if (sleep_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(sleep_node_bitmap);
		if (nodes)
			_do_suspend(nodes);
		else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		FREE_NULL_BITMAP(sleep_node_bitmap);
		/* last_node_update is not set to now here: it could be
		 * changed already by another thread! */
	}

	if (wake_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(wake_node_bitmap);
		if (nodes)
			_do_resume(nodes);
		else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		FREE_NULL_BITMAP(wake_node_bitmap);
		/* last_node_update is not set to now here: it could be
		 * changed already by another thread! */
	}
}
/**
 * basil_inventory  -  Periodic node-state query via ALPS XML-RPC.
 * This should be run immediately before each scheduling cycle.
 *
 * The query is skipped (returning SLURM_SUCCESS) if less than inv_interval
 * seconds have passed since the previous run.
 *
 * Side effects: updates node state flags, down_time and reason fields, the
 * avail_node_bitmap, and attempts to release ALPS reservations that match no
 * SLURM job. Mismatch tracking is kept in function-static variables across
 * calls.
 *
 * Returns non-SLURM_SUCCESS if
 * - INVENTORY method failed (error)
 * - no nodes are available (no point in scheduling)
 * - orphaned ALPS reservation exists (wait until ALPS resynchronizes)
 */
extern int basil_inventory(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	struct basil_rsvn *rsvn;
	/* Counts nodes ALPS holds but SLURM considers unallocated; also
	 * set when an orphaned reservation is found */
	int slurm_alps_mismatch = 0;
	int rc = SLURM_SUCCESS;
	int rel_rc;
	time_t now = time(NULL);
	/* Time the current run of mismatches was first seen, 0 if none */
	static time_t slurm_alps_mismatch_time = (time_t) 0;
	/* Ensures the sync timeout error is logged once per mismatch run */
	static bool logged_sync_timeout = false;
	static time_t last_inv_run = 0;

	if ((now - last_inv_run) < inv_interval)
		return SLURM_SUCCESS;

	last_inv_run = now;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/* Avoid checking for inv->batch_avail here since if we are
	   gang scheduling returning an error for a full system is
	   probably the wrong thing to do. (the schedule() function
	   in the slurmctld will never run ;)).
	*/
	if (!inv->f->node_head || !inv->batch_total)
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	for (node = inv->f->node_head; node; node = node->next) {
		int node_inx;
		struct node_record *node_ptr;
		/* Non-NULL when ALPS reports the node as unusable */
		char *reason = NULL;

		/* This will ignore interactive nodes when iterating through
		 * the apbasil inventory.  If we don't do this, SLURM is
		 * unable to resolve the ID to a nidXXX name since it's not in
		 * the slurm.conf file.  
		 (Chris North) */
		if (node->role == BNR_INTER)
			continue;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			continue;
		}
		/* Index of this node in the node table, used for bitmaps */
		node_inx = node_ptr - node_record_table_ptr;

		if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
			/*
			 * ALPS still hangs on to the node while SLURM considers
			 * it already unallocated. Possible causes are partition
			 * cleanup taking too long (can be 10sec ... minutes),
			 * and orphaned ALPS reservations (caught below).
			 *
			 * The converse case (SLURM hanging on to the node while
			 * ALPS has already freed it) happens frequently during
			 * job completion: select_g_job_fini() is called before
			 * make_node_comp(). Rely on SLURM logic for this case.
			 */
			slurm_alps_mismatch++;
		}

		/* The first matching condition supplies the reason; a node
		 * that is UP, in BATCH role and of XT arch gets none */
		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		/* Base state entirely derives from ALPS */
		if (reason) {
			/* Remember when the node was first seen unusable */
			if (node_ptr->down_time == 0)
				node_ptr->down_time = now;
			if (IS_NODE_DOWN(node_ptr)) {
				/* node still down */
			} else if ((slurmctld_conf.slurmd_timeout == 0) ||
				   ((now - node_ptr->down_time) <
				    slurmctld_conf.slurmd_timeout)) {
				/* Within the timeout (or no timeout set):
				 * only flag as not responding */
				node_ptr->node_state |= NODE_STATE_NO_RESPOND;
				bit_clear(avail_node_bitmap, node_inx);
			} else {
				xfree(node_ptr->reason);
				info("MARKING %s DOWN (%s)",
				     node_ptr->name, reason);
				/* set_node_down also kills any running jobs */
				set_node_down_ptr(node_ptr, reason);
			}
		} else if (IS_NODE_DOWN(node_ptr)) {
			/* ALPS considers the node usable again */
			xfree(node_ptr->reason);
			
			node_ptr->down_time = 0;
			info("MARKING %s UP", node_ptr->name);

			/* Reset state, make_node_idle figures out the rest */
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			node_ptr->node_state |= NODE_STATE_UNKNOWN;

			make_node_idle(node_ptr, NULL);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				xfree(node_ptr->reason);
				node_ptr->reason_time = 0;
				node_ptr->reason_uid = NO_VAL;
				clusteracct_storage_g_node_up(
					acct_db_conn, node_ptr, now);
			}
		} else if (IS_NODE_NO_RESPOND(node_ptr)) {
			/* Usable node that was flagged not responding:
			 * clear the flag and make it available again unless
			 * it is draining or failing */
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				bit_set(avail_node_bitmap, node_inx);
			}
		}
	}

	if (slurm_alps_mismatch)
		debug("ALPS: %d node(s) still held", slurm_alps_mismatch);

	/*
	 * Check that each ALPS reservation corresponds to a SLURM job.
	 * Purge orphaned reservations, which may result from stale or
	 * messed up system state, or are indicative of ALPS problems
	 * (stuck in pending cancel calls).
	 */
	for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
		ListIterator job_iter = list_iterator_create(job_list);
		struct job_record *job_ptr;
		uint32_t resv_id;

		/* Stop at the first job holding this reservation ID;
		 * job_ptr is NULL afterwards if no job matched */
		while ((job_ptr = (struct job_record *)list_next(job_iter))) {
			if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
						SELECT_JOBDATA_RESV_ID,
						&resv_id) == SLURM_SUCCESS
			    && resv_id == rsvn->rsvn_id)
				break;
		}
		list_iterator_destroy(job_iter);

		/*
		 * Changed to ignore reservations for "UNKNOWN" batch
		 * ids (e.g. 
		 the interactive region) (Chris North)
		 */
		if ((job_ptr == NULL) &&
		    (xstrcmp(rsvn->batch_id, "UNKNOWN"))) {
			error("orphaned ALPS reservation %u, trying to remove",
			      rsvn->rsvn_id);
			rel_rc = basil_safe_release(rsvn->rsvn_id, inv);
			if (rel_rc) {
				error("ALPS reservation %u removal FAILED: %s",
				      rsvn->rsvn_id, basil_strerror(rel_rc));
			} else {
				debug("ALPS reservation %u removed",
				      rsvn->rsvn_id);
			}
			slurm_alps_mismatch = true;
		}
	}
	free_inv(inv);

	if (slurm_alps_mismatch) {
		/* If SLURM and ALPS state are not in synchronization,
		 * do not schedule any more jobs until waiting at least
		 * SyncTimeout seconds. */
		/* NOTE(review): as written, only the branch inside the
		 * timeout window returns the error code. The call that first
		 * detects a mismatch, and the sync_timeout == 0 case, fall
		 * through to "return rc" - confirm this is intended. */
		if (slurm_alps_mismatch_time == 0) {
			slurm_alps_mismatch_time = now;
		} else if (cray_conf->sync_timeout == 0) {
			/* Wait indefinitely */
		} else if (difftime(now, slurm_alps_mismatch_time) <
			   cray_conf->sync_timeout) {
			return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
		} else if (!logged_sync_timeout) {
			error("Could not synchronize SLURM with ALPS for %u "
			      "seconds, proceeding with job scheduling",
			      cray_conf->sync_timeout);
			logged_sync_timeout = true;
		}
	} else {
		/* In sync again: reset the mismatch tracking */
		slurm_alps_mismatch_time = 0;
		logged_sync_timeout = false;
	}
	return rc;
}
/*
 * _update_sinfo - fold one node into an sinfo summary record
 * sinfo_ptr IN/OUT - summary record being accumulated
 * node_ptr IN - node to add
 * node_scaling IN - node count this node_ptr entry contributes to the
 *	nodes_* totals
 *
 * The first node added (nodes_total == 0) seeds the record's descriptive
 * fields and min/max values; later nodes only widen the min/max ranges. A
 * node whose name is already in sinfo_ptr->nodes is ignored. Node and CPU
 * counts are then added to the alloc/idle/other/total buckets, with separate
 * handling when params.cluster_flags has CLUSTER_FLAG_BG set.
 */
static void _update_sinfo(sinfo_data_t *sinfo_ptr, node_info_t *node_ptr,
			  uint32_t node_scaling)
{
	uint16_t base_state;
	uint16_t used_cpus = 0, error_cpus = 0;
	int total_cpus = 0, total_nodes = 0;
	/* since node_scaling could be less here, we need to use the
	 * global node scaling which should never change. */
	int single_node_cpus = (node_ptr->cpus / g_node_scaling);

	base_state = node_ptr->node_state & NODE_STATE_BASE;

	if (sinfo_ptr->nodes_total == 0) {	/* first node added */
		/* Pointer fields (features, gres, reason, version, ...) are
		 * copied as pointers, not duplicated */
		sinfo_ptr->node_state = node_ptr->node_state;
		sinfo_ptr->features = node_ptr->features;
		sinfo_ptr->gres = node_ptr->gres;
		sinfo_ptr->reason = node_ptr->reason;
		sinfo_ptr->reason_time= node_ptr->reason_time;
		sinfo_ptr->reason_uid = node_ptr->reason_uid;
		sinfo_ptr->min_cpus = node_ptr->cpus;
		sinfo_ptr->max_cpus = node_ptr->cpus;
		sinfo_ptr->min_sockets = node_ptr->sockets;
		sinfo_ptr->max_sockets = node_ptr->sockets;
		sinfo_ptr->min_cores = node_ptr->cores;
		sinfo_ptr->max_cores = node_ptr->cores;
		sinfo_ptr->min_threads = node_ptr->threads;
		sinfo_ptr->max_threads = node_ptr->threads;
		sinfo_ptr->min_disk = node_ptr->tmp_disk;
		sinfo_ptr->max_disk = node_ptr->tmp_disk;
		sinfo_ptr->min_mem = node_ptr->real_memory;
		sinfo_ptr->max_mem = node_ptr->real_memory;
		sinfo_ptr->min_weight = node_ptr->weight;
		sinfo_ptr->max_weight = node_ptr->weight;
		sinfo_ptr->min_cpu_load = node_ptr->cpu_load;
		sinfo_ptr->max_cpu_load = node_ptr->cpu_load;
		sinfo_ptr->max_cpus_per_node = sinfo_ptr->part_info->
					       max_cpus_per_node;
		sinfo_ptr->version = node_ptr->version;
	} else if (hostlist_find(sinfo_ptr->nodes, node_ptr->name) != -1) {
		/* we already have this node in this record,
		 * just return, don't duplicate */
		return;
	} else {
		/* Widen each min/max range to include this node */
		if (sinfo_ptr->min_cpus > node_ptr->cpus)
			sinfo_ptr->min_cpus = node_ptr->cpus;
		if (sinfo_ptr->max_cpus < node_ptr->cpus)
			sinfo_ptr->max_cpus = node_ptr->cpus;

		if (sinfo_ptr->min_sockets > node_ptr->sockets)
			sinfo_ptr->min_sockets = node_ptr->sockets;
		if (sinfo_ptr->max_sockets < node_ptr->sockets)
			
			sinfo_ptr->max_sockets = node_ptr->sockets;

		if (sinfo_ptr->min_cores > node_ptr->cores)
			sinfo_ptr->min_cores = node_ptr->cores;
		if (sinfo_ptr->max_cores < node_ptr->cores)
			sinfo_ptr->max_cores = node_ptr->cores;

		if (sinfo_ptr->min_threads > node_ptr->threads)
			sinfo_ptr->min_threads = node_ptr->threads;
		if (sinfo_ptr->max_threads < node_ptr->threads)
			sinfo_ptr->max_threads = node_ptr->threads;

		if (sinfo_ptr->min_disk > node_ptr->tmp_disk)
			sinfo_ptr->min_disk = node_ptr->tmp_disk;
		if (sinfo_ptr->max_disk < node_ptr->tmp_disk)
			sinfo_ptr->max_disk = node_ptr->tmp_disk;

		if (sinfo_ptr->min_mem > node_ptr->real_memory)
			sinfo_ptr->min_mem = node_ptr->real_memory;
		if (sinfo_ptr->max_mem < node_ptr->real_memory)
			sinfo_ptr->max_mem = node_ptr->real_memory;

		if (sinfo_ptr->min_weight> node_ptr->weight)
			sinfo_ptr->min_weight = node_ptr->weight;
		if (sinfo_ptr->max_weight < node_ptr->weight)
			sinfo_ptr->max_weight = node_ptr->weight;

		if (sinfo_ptr->min_cpu_load > node_ptr->cpu_load)
			sinfo_ptr->min_cpu_load = node_ptr->cpu_load;
		if (sinfo_ptr->max_cpu_load < node_ptr->cpu_load)
			sinfo_ptr->max_cpu_load = node_ptr->cpu_load;
	}

	hostlist_push_host(sinfo_ptr->nodes, node_ptr->name);
	if (params.match_flags.node_addr_flag)
		hostlist_push_host(sinfo_ptr->node_addr, node_ptr->node_addr);
	if (params.match_flags.hostnames_flag)
		hostlist_push_host(sinfo_ptr->hostnames,
				   node_ptr->node_hostname);

	total_cpus = node_ptr->cpus;
	total_nodes = node_scaling;

	/* Fetch the allocated and error sub-counts from the select plugin
	 * data; the variables stay 0 if the call leaves them untouched */
	select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
				     SELECT_NODEDATA_SUBCNT,
				     NODE_STATE_ALLOCATED,
				     &used_cpus);
	select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
				     SELECT_NODEDATA_SUBCNT,
				     NODE_STATE_ERROR,
				     &error_cpus);

	if (params.cluster_flags & CLUSTER_FLAG_BG) {
		if (!params.match_flags.state_flag &&
		    (used_cpus || error_cpus)) {
			/* We only get one shot at this (because all states
			 * are combined together), so we need to make
			 * sure we get all the subgrps accounted. 
			 (So use
			 * g_node_scaling for safe measure) */
			total_nodes = g_node_scaling;

			/* Here the sub-counts are added to the node buckets
			 * first, then scaled into CPU counts for the common
			 * accounting at the end of the function */
			sinfo_ptr->nodes_alloc += used_cpus;
			sinfo_ptr->nodes_other += error_cpus;
			sinfo_ptr->nodes_idle +=
				(total_nodes - (used_cpus + error_cpus));
			used_cpus *= single_node_cpus;
			error_cpus *= single_node_cpus;
		} else {
			/* process only for this subgrp and then return */
			total_cpus = total_nodes * single_node_cpus;

			if ((base_state == NODE_STATE_ALLOCATED) ||
			    (base_state == NODE_STATE_MIXED) ||
			    (node_ptr->node_state & NODE_STATE_COMPLETING)) {
				sinfo_ptr->nodes_alloc += total_nodes;
				sinfo_ptr->cpus_alloc += total_cpus;
			} else if (IS_NODE_DRAIN(node_ptr) ||
				   (base_state == NODE_STATE_DOWN)) {
				sinfo_ptr->nodes_other += total_nodes;
				sinfo_ptr->cpus_other += total_cpus;
			} else {
				sinfo_ptr->nodes_idle += total_nodes;
				sinfo_ptr->cpus_idle += total_cpus;
			}

			sinfo_ptr->nodes_total += total_nodes;
			sinfo_ptr->cpus_total += total_cpus;

			return;
		}
	} else {
		/* Without CLUSTER_FLAG_BG: classify the whole node by state */
		if ((base_state == NODE_STATE_ALLOCATED) ||
		    (base_state == NODE_STATE_MIXED) ||
		    IS_NODE_COMPLETING(node_ptr))
			sinfo_ptr->nodes_alloc += total_nodes;
		else if (IS_NODE_DRAIN(node_ptr) ||
			 (base_state == NODE_STATE_DOWN))
			sinfo_ptr->nodes_other += total_nodes;
		else
			sinfo_ptr->nodes_idle += total_nodes;
	}

	sinfo_ptr->nodes_total += total_nodes;

	/* CPU accounting: allocated CPUs first, the remainder goes to idle
	 * or other depending on error count and node state */
	sinfo_ptr->cpus_alloc += used_cpus;
	sinfo_ptr->cpus_total += total_cpus;
	total_cpus -= used_cpus + error_cpus;

	if (error_cpus) {
		sinfo_ptr->cpus_idle += total_cpus;
		sinfo_ptr->cpus_other += error_cpus;
	} else if (IS_NODE_DRAIN(node_ptr) ||
		   (base_state == NODE_STATE_DOWN)) {
		sinfo_ptr->cpus_other += total_cpus;
	} else
		sinfo_ptr->cpus_idle += total_cpus;
}
/*
 * _filter_out - decide whether a node is to be excluded from the report
 * node_ptr IN - node under consideration
 * RET - true if the node should not be reported, false otherwise
 *
 * A node is excluded if it fails any active filter in params: the node name
 * list, the dead/responding selectors, or the requested state list (the node
 * must match at least one listed state). The host list built from
 * params.nodes is created on first use and kept in function-static storage.
 */
static bool _filter_out(node_info_t *node_ptr)
{
	static hostlist_t host_list = NULL;
	/* Scratch record: only node_state is set, so the state macros can
	 * be applied to a requested state value */
	node_info_t probe, *probe_ptr = &probe;
	ListIterator state_iter;
	int *wanted;
	uint16_t masked_state;
	uint16_t sub_cpus = 0;
	bool found = false;

	if (params.nodes) {
		if (!host_list)
			host_list = hostlist_create(params.nodes);
		if (hostlist_find(host_list, node_ptr->name) == -1)
			return true;
	}

	if (params.dead_nodes && !IS_NODE_NO_RESPOND(node_ptr))
		return true;

	if (params.responding_nodes && IS_NODE_NO_RESPOND(node_ptr))
		return true;

	if (!params.state_list)
		return false;

	/* Walk the requested states until one matches this node */
	state_iter = list_iterator_create(params.state_list);
	while (!found && (wanted = list_next(state_iter))) {
		probe_ptr->node_state = *wanted;
		if (*wanted == NODE_STATE_DRAIN) {
			/* Anything with the drain flag set */
			if (IS_NODE_DRAIN(node_ptr))
				found = true;
		} else if (IS_NODE_DRAINING(probe_ptr)) {
			/* Anything node_state_string maps to DRAINING */
			if (IS_NODE_DRAINING(node_ptr))
				found = true;
		} else if (IS_NODE_DRAINED(probe_ptr)) {
			/* Anything node_state_string maps to DRAINED */
			if (IS_NODE_DRAINED(node_ptr))
				found = true;
		} else if (*wanted & NODE_STATE_FLAGS) {
			/* Requested value carries state flags: any shared
			 * bit counts as a match */
			if (*wanted & node_ptr->node_state)
				found = true;
		} else if (*wanted == NODE_STATE_ERROR) {
			slurm_get_select_nodeinfo(node_ptr->select_nodeinfo,
						  SELECT_NODEDATA_SUBCNT,
						  NODE_STATE_ERROR,
						  &sub_cpus);
			if (sub_cpus)
				found = true;
		} else if (*wanted == NODE_STATE_ALLOCATED) {
			slurm_get_select_nodeinfo(node_ptr->select_nodeinfo,
						  SELECT_NODEDATA_SUBCNT,
						  NODE_STATE_ALLOCATED,
						  &sub_cpus);
			/* With CLUSTER_FLAG_BG, an allocated or completing
			 * node with no sub-count uses its full CPU count */
			if ((params.cluster_flags & CLUSTER_FLAG_BG) &&
			    !sub_cpus &&
			    (IS_NODE_ALLOCATED(node_ptr) ||
			     IS_NODE_COMPLETING(node_ptr)))
				sub_cpus = node_ptr->cpus;
			if (sub_cpus)
				found = true;
		} else if (*wanted == NODE_STATE_IDLE) {
			/* Idle ignoring only the NO_RESPOND flag */
			masked_state = node_ptr->node_state &
				       (~NODE_STATE_NO_RESPOND);
			if (masked_state == NODE_STATE_IDLE)
				found = true;
		} else {
			/* Plain base state comparison */
			masked_state = node_ptr->node_state & NODE_STATE_BASE;
			if (masked_state == *wanted)
				found = true;
		}
	}
	list_iterator_destroy(state_iter);

	return !found;
}
/*
 * _query_server - download the current server state
 * part_pptr IN/OUT - partition information message
 * node_pptr IN/OUT - node information message
 * block_pptr IN/OUT - BlueGene block data
 * reserv_pptr IN/OUT - reservation information message
 * clear_old IN - If set, then always replace old data, needed when going
 *		  between clusters.
 * RET zero or error code
 *
 * Each message type is cached in function-static storage between calls. When
 * a cached copy exists, its last_update time is sent with the request; a
 * SLURM_NO_CHANGE_IN_DATA reply keeps the cached copy, a successful reply
 * frees it and caches the new message. The returned messages therefore
 * remain owned by this function and must not be freed by the caller.
 *
 * After loading nodes, every non-draining node whose CPUs are only partly
 * allocated (or partly in error) has its base state rewritten to
 * NODE_STATE_MIXED.
 *
 * Block data is only loaded if params.bg_flag is set; *block_pptr is left
 * untouched otherwise.
 *
 * An update time of (time_t) 0 requests a full load. This is written as an
 * integer zero rather than a cast of NULL, since converting a null pointer
 * constant to an arithmetic type is implementation-defined.
 */
static int _query_server(partition_info_msg_t ** part_pptr,
			 node_info_msg_t ** node_pptr,
			 block_info_msg_t ** block_pptr,
			 reserve_info_msg_t ** reserv_pptr,
			 bool clear_old)
{
	static partition_info_msg_t *old_part_ptr = NULL, *new_part_ptr;
	static node_info_msg_t *old_node_ptr = NULL, *new_node_ptr;
	static block_info_msg_t *old_bg_ptr = NULL, *new_bg_ptr;
	static reserve_info_msg_t *old_resv_ptr = NULL, *new_resv_ptr;
	int error_code;
	uint16_t show_flags = 0;
	int cc;
	node_info_t *node_ptr;

	if (params.all_flag)
		show_flags |= SHOW_ALL;

	/* Partitions */
	if (old_part_ptr) {
		if (clear_old)
			old_part_ptr->last_update = 0;
		error_code = slurm_load_partitions(old_part_ptr->last_update,
						   &new_part_ptr, show_flags);
		if (error_code == SLURM_SUCCESS)
			slurm_free_partition_info_msg(old_part_ptr);
		else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) {
			/* Nothing newer on the server: reuse the cache */
			error_code = SLURM_SUCCESS;
			new_part_ptr = old_part_ptr;
		}
	} else {
		error_code = slurm_load_partitions((time_t) 0, &new_part_ptr,
						   show_flags);
	}
	if (error_code) {
		slurm_perror("slurm_load_partitions");
		return error_code;
	}

	old_part_ptr = new_part_ptr;
	*part_pptr = new_part_ptr;

	/* Nodes: a single named node is always reloaded in full since
	 * slurm_load_node_single() takes no update time */
	if (old_node_ptr) {
		if (clear_old)
			old_node_ptr->last_update = 0;
		if (params.node_name_single) {
			error_code = slurm_load_node_single(&new_node_ptr,
							    params.nodes,
							    show_flags);
		} else {
			error_code = slurm_load_node(old_node_ptr->last_update,
						     &new_node_ptr,
						     show_flags);
		}
		if (error_code == SLURM_SUCCESS)
			slurm_free_node_info_msg(old_node_ptr);
		else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) {
			error_code = SLURM_SUCCESS;
			new_node_ptr = old_node_ptr;
		}
	} else if (params.node_name_single) {
		error_code = slurm_load_node_single(&new_node_ptr,
						    params.nodes, show_flags);
	} else {
		error_code = slurm_load_node((time_t) 0, &new_node_ptr,
					     show_flags);
	}

	if (error_code) {
		slurm_perror("slurm_load_node");
		return error_code;
	}
	old_node_ptr = new_node_ptr;
	*node_pptr = new_node_ptr;

	/* Set the node state as NODE_STATE_MIXED. */
	for (cc = 0; cc < new_node_ptr->record_count; cc++) {
		node_ptr = &(new_node_ptr->node_array[cc]);
		if (IS_NODE_DRAIN(node_ptr)) {
			/* don't worry about mixed since the
			 * whole node is being drained. */
		} else {
			uint16_t alloc_cpus = 0, err_cpus = 0, idle_cpus;
			int single_node_cpus =
				(node_ptr->cpus / g_node_scaling);

			select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
						     SELECT_NODEDATA_SUBCNT,
						     NODE_STATE_ALLOCATED,
						     &alloc_cpus);
			if (params.cluster_flags & CLUSTER_FLAG_BG) {
				/* No sub-count on an allocated/completing
				 * node means the whole node is in use;
				 * otherwise scale the sub-count to CPUs */
				if (!alloc_cpus &&
				    (IS_NODE_ALLOCATED(node_ptr) ||
				     IS_NODE_COMPLETING(node_ptr)))
					alloc_cpus = node_ptr->cpus;
				else
					alloc_cpus *= single_node_cpus;
			}
			idle_cpus = node_ptr->cpus - alloc_cpus;

			select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
						     SELECT_NODEDATA_SUBCNT,
						     NODE_STATE_ERROR,
						     &err_cpus);
			if (params.cluster_flags & CLUSTER_FLAG_BG)
				err_cpus *= single_node_cpus;
			idle_cpus -= err_cpus;

			/* Mixed: both allocated and error CPUs, or some
			 * but not all CPUs idle */
			if ((alloc_cpus && err_cpus) ||
			    (idle_cpus && (idle_cpus != node_ptr->cpus))) {
				node_ptr->node_state &= NODE_STATE_FLAGS;
				node_ptr->node_state |= NODE_STATE_MIXED;
			}
		}
	}

	/* Reservations */
	if (old_resv_ptr) {
		if (clear_old)
			old_resv_ptr->last_update = 0;
		error_code = slurm_load_reservations(old_resv_ptr->last_update,
						     &new_resv_ptr);
		if (error_code == SLURM_SUCCESS)
			slurm_free_reservation_info_msg(old_resv_ptr);
		else if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA) {
			error_code = SLURM_SUCCESS;
			new_resv_ptr = old_resv_ptr;
		}
	} else {
		error_code = slurm_load_reservations((time_t) 0,
						     &new_resv_ptr);
	}

	if (error_code) {
		slurm_perror("slurm_load_reservations");
		return error_code;
	}
	old_resv_ptr = new_resv_ptr;
	*reserv_pptr = new_resv_ptr;

	if (!params.bg_flag)
		return SLURM_SUCCESS;

	/* Block data is only requested when the cluster has
	 * CLUSTER_FLAG_BG set; otherwise error_code is still the
	 * (successful) reservation result and the cached block pointer is
	 * passed through unchanged */
	if (params.cluster_flags & CLUSTER_FLAG_BG) {
		if (old_bg_ptr) {
			if (clear_old)
				old_bg_ptr->last_update = 0;
			error_code = slurm_load_block_info(
				old_bg_ptr->last_update,
				&new_bg_ptr, show_flags);
			if (error_code == SLURM_SUCCESS)
				slurm_free_block_info_msg(old_bg_ptr);
			else if (slurm_get_errno() ==
				 SLURM_NO_CHANGE_IN_DATA) {
				error_code = SLURM_SUCCESS;
				new_bg_ptr = old_bg_ptr;
			}
		} else {
			error_code = slurm_load_block_info((time_t) 0,
							   &new_bg_ptr,
							   show_flags);
		}
	}

	if (error_code) {
		slurm_perror("slurm_load_block");
		return error_code;
	}
	old_bg_ptr = new_bg_ptr;
	*block_pptr = new_bg_ptr;
	return SLURM_SUCCESS;
}
/** * basil_geometry - Check node attributes, resolve (X,Y,Z) coordinates. * * Checks both SDB database and ALPS inventory for consistency. The inventory * part is identical to basil_inventory(), with the difference of being called * before valid bitmaps exist, from select_g_node_init(). * Its dependencies are: * - it needs reset_job_bitmaps() in order to rebuild node_bitmap fields, * - it relies on _sync_nodes_to_jobs() to * o kill active jobs on nodes now marked DOWN, * o reset node state to ALLOCATED if it has been marked IDLE here (which is * an error case, since there is no longer an ALPS reservation for the job, * this is caught by the subsequent basil_inventory()). */ extern int basil_geometry(struct node_record *node_ptr_array, int node_cnt) { struct node_record *node_ptr, *end = node_ptr_array + node_cnt; enum basil_version version = get_basil_version(); struct basil_inventory *inv; /* General mySQL */ MYSQL *handle; MYSQL_STMT *stmt = NULL; /* Input parameters */ unsigned int node_id; /* * Use a left outer join here since the attributes table may not be * populated for a given nodeid (e.g. when the node has been disabled * on the SMW via 'xtcli disable'). * The processor table has more authoritative information, if a nodeid * is not listed there, it does not exist. */ const char query[] = "SELECT x_coord, y_coord, z_coord," " cab_position, cab_row, cage, slot, cpu," " LOG2(coremask+1), availmem, " " processor_type " "FROM processor LEFT JOIN attributes " "ON processor_id = nodeid " "WHERE processor_id = ? 
"; const int PARAM_COUNT = 1; /* node id */ MYSQL_BIND params[PARAM_COUNT]; int x_coord, y_coord, z_coord; int cab, row, cage, slot, cpu; unsigned int node_cpus, node_mem; char proc_type[BASIL_STRING_SHORT]; MYSQL_BIND bind_cols[COLUMN_COUNT]; my_bool is_null[COLUMN_COUNT]; my_bool is_error[COLUMN_COUNT]; int is_gemini, i; time_t now = time(NULL); memset(params, 0, sizeof(params)); params[0].buffer_type = MYSQL_TYPE_LONG; params[0].is_unsigned = true; params[0].is_null = (my_bool *)0; params[0].buffer = (char *)&node_id; memset(bind_cols, 0, sizeof(bind_cols)); for (i = 0; i < COLUMN_COUNT; i ++) { bind_cols[i].is_null = &is_null[i]; bind_cols[i].error = &is_error[i]; if (i == COL_TYPE) { bind_cols[i].buffer_type = MYSQL_TYPE_STRING; bind_cols[i].buffer_length = sizeof(proc_type); bind_cols[i].buffer = proc_type; } else { bind_cols[i].buffer_type = MYSQL_TYPE_LONG; bind_cols[i].is_unsigned = (i >= COL_CORES); } } bind_cols[COL_X].buffer = (char *)&x_coord; bind_cols[COL_Y].buffer = (char *)&y_coord; bind_cols[COL_Z].buffer = (char *)&z_coord; bind_cols[COL_CAB].buffer = (char *)&cab; bind_cols[COL_ROW].buffer = (char *)&row; bind_cols[COL_CAGE].buffer = (char *)&cage; bind_cols[COL_SLOT].buffer = (char *)&slot; bind_cols[COL_CPU].buffer = (char *)&cpu; bind_cols[COL_CORES].buffer = (char *)&node_cpus; bind_cols[COL_MEMORY].buffer = (char *)&node_mem; inv = get_full_inventory(version); if (inv == NULL) fatal("failed to get initial BASIL inventory"); info("BASIL %s initial INVENTORY: %d/%d batch nodes available", bv_names_long[version], inv->batch_avail, inv->batch_total); handle = cray_connect_sdb(); if (handle == NULL) fatal("can not connect to XTAdmin database on the SDB"); is_gemini = cray_is_gemini_system(handle); if (is_gemini < 0) fatal("can not determine Cray XT/XE system type"); stmt = prepare_stmt(handle, query, params, PARAM_COUNT, bind_cols, COLUMN_COUNT); if (stmt == NULL) fatal("can not prepare statement to resolve Cray coordinates"); for (node_ptr = 
node_record_table_ptr; node_ptr < end; node_ptr++) { struct basil_node *node; char *reason = NULL; if ((node_ptr->name == NULL) || (sscanf(node_ptr->name, "nid%05u", &node_id) != 1)) { error("can not read basil_node_id from %s", node_ptr->name); continue; } if (exec_stmt(stmt, query, bind_cols, COLUMN_COUNT) < 0) fatal("can not resolve %s coordinates", node_ptr->name); if (fetch_stmt(stmt) == 0) { #if _DEBUG info("proc_type:%s cpus:%u memory:%u", proc_type, node_cpus, node_mem); info("row:%u cage:%u slot:%u cpu:%u xyz:%u:%u:%u", row, cage, slot, cpu, x_coord, y_coord, z_coord); #endif if (strcmp(proc_type, "compute") != 0) { /* * Switching a compute node to be a service node * can not happen at runtime: requires a reboot. */ fatal("Node '%s' is a %s node. " "Only compute nodes can appear in slurm.conf.", node_ptr->name, proc_type); } else if (is_null[COL_CORES] || is_null[COL_MEMORY]) { /* * This can happen if a node has been disabled * on the SMW (using 'xtcli disable <nid>'). The * node will still be listed in the 'processor' * table, but have no 'attributes' entry (NULL * values for CPUs/memory). Also, the node will * be invisible to ALPS, which is why we need to * set it down here already. */ node_cpus = node_mem = 0; reason = "node data unknown - disabled on SMW?"; } else if (is_null[COL_X] || is_null[COL_Y] || is_null[COL_Z]) { /* * Similar case to the one above, observed when * a blade has been removed. Node will not * likely show up in ALPS. */ x_coord = y_coord = z_coord = 0; reason = "unknown coordinates - hardware failure?"; } else if (node_cpus < node_ptr->config_ptr->cpus) { /* * FIXME: Might reconsider this policy. * * FastSchedule is ignored here, it requires the * slurm.conf to be consistent with hardware. * * Assumption is that CPU/Memory do not change * at runtime (Cray has no hot-swappable parts). * * Hence checking it in basil_inventory() would * mean a lot of runtime overhead. 
*/ fatal("slurm.conf: node %s has only Procs=%d", node_ptr->name, node_cpus); } else if (node_mem < node_ptr->config_ptr->real_memory) { fatal("slurm.conf: node %s has RealMemory=%d", node_ptr->name, node_mem); } } else if (is_gemini) { fatal("Non-existing Gemini node '%s' in slurm.conf", node_ptr->name); } else { fatal("Non-existing SeaStar node '%s' in slurm.conf", node_ptr->name); } if (!is_gemini) { /* * SeaStar: each node has unique coordinates */ if (node_ptr->arch == NULL) node_ptr->arch = xstrdup("XT"); } else { /* * Gemini: each 2 nodes share the same network * interface (i.e., nodes 0/1 and 2/3 each have * the same coordinates). */ if (node_ptr->arch == NULL) node_ptr->arch = xstrdup("XE"); } xfree(node_ptr->node_hostname); xfree(node_ptr->comm_name); /* * Convention: since we are using SLURM in frontend-mode, * we use Node{Addr,HostName} as follows. * * NodeAddr: <X><Y><Z> coordinates in base-36 encoding * * NodeHostName: c#-#c#s#n# using the NID convention * <cabinet>-<row><chassis><slot><node> * - each cabinet can accommodate 3 chassis (c1..c3) * - each chassis has 8 slots (s0..s7) * - each slot contains 2 or 4 nodes (n0..n3) * o either 2 service nodes (n0/n3) * o or 4 compute nodes (n0..n3) * o or 2 gemini chips (g0/g1 serving n0..n3) * * Example: c0-0c1s0n1 * - c0- = cabinet 0 * - 0 = row 0 * - c1 = chassis 1 * - s0 = slot 0 * - n1 = node 1 */ node_ptr->node_hostname = xstrdup_printf("c%u-%uc%us%un%u", cab, row, cage, slot, cpu); node_ptr->comm_name = xstrdup_printf("%c%c%c", _enc_coord(x_coord), _enc_coord(y_coord), _enc_coord(z_coord)); dim_size[0] = MAX(dim_size[0], (x_coord - 1)); dim_size[1] = MAX(dim_size[1], (y_coord - 1)); dim_size[2] = MAX(dim_size[2], (z_coord - 1)); #if _DEBUG info("%s %s %s cpus=%u, mem=%u reason=%s", node_ptr->name, node_ptr->node_hostname, node_ptr->comm_name, node_cpus, node_mem, reason); #endif /* * Check the current state reported by ALPS inventory, unless it * is already evident that the node has some other problem. 
*/ if (reason == NULL) { for (node = inv->f->node_head; node; node = node->next) if (node->node_id == node_id) break; if (node == NULL) { reason = "not visible to ALPS - check hardware"; } else if (node->state == BNS_DOWN) { reason = "ALPS marked it DOWN"; } else if (node->state == BNS_UNAVAIL) { reason = "node is UNAVAILABLE"; } else if (node->state == BNS_ROUTE) { reason = "node does ROUTING"; } else if (node->state == BNS_SUSPECT) { reason = "entered SUSPECT mode"; } else if (node->state == BNS_ADMINDOWN) { reason = "node is ADMINDOWN"; } else if (node->state != BNS_UP) { reason = "state not UP"; } else if (node->role != BNR_BATCH) { reason = "mode not BATCH"; } else if (node->arch != BNA_XT) { reason = "arch not XT/XE"; } } /* Base state entirely derives from ALPS * NOTE: The node bitmaps are not defined when this code is * initially executed. */ node_ptr->node_state &= NODE_STATE_FLAGS; if (reason) { if (node_ptr->down_time == 0) node_ptr->down_time = now; if (IS_NODE_DOWN(node_ptr)) { /* node still down */ debug("Initial DOWN node %s - %s", node_ptr->name, node_ptr->reason); } else if (slurmctld_conf.slurmd_timeout && ((now - node_ptr->down_time) < slurmctld_conf.slurmd_timeout)) { node_ptr->node_state |= NODE_STATE_NO_RESPOND; } else { info("Initial DOWN node %s - %s", node_ptr->name, reason); node_ptr->reason = xstrdup(reason); /* Node state flags preserved above */ node_ptr->node_state |= NODE_STATE_DOWN; clusteracct_storage_g_node_down(acct_db_conn, node_ptr, now, NULL, slurm_get_slurm_user_id()); } } else { bool node_up_flag = IS_NODE_DOWN(node_ptr) && !IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr); node_ptr->down_time = 0; if (node_is_allocated(node)) node_ptr->node_state |= NODE_STATE_ALLOCATED; else node_ptr->node_state |= NODE_STATE_IDLE; node_ptr->node_state &= (~NODE_STATE_NO_RESPOND); xfree(node_ptr->reason); if (node_up_flag) { info("ALPS returned node %s to service", node_ptr->name); clusteracct_storage_g_node_up(acct_db_conn, node_ptr, 
now); } } free_stmt_result(stmt); } if (stmt_close(stmt)) error("error closing statement: %s", mysql_stmt_error(stmt)); cray_close_sdb(handle); free_inv(inv); return SLURM_SUCCESS; }