extern int basil_node_ranking(struct node_record *node_array, int node_cnt) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; int rank_count = 0, i; hostlist_t hl = hostlist_create(NULL); bool bad_node = 0; inv = get_full_inventory(version); if (inv == NULL) /* FIXME: should retry here if the condition is transient */ fatal("failed to get BASIL %s ranking", bv_names_long[version]); else if (!inv->batch_total) fatal("system has no usable batch compute nodes"); debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes", bv_names_long[version], inv->batch_avail, inv->batch_total); /* * Node ranking is based on a subset of the inventory: only nodes in * batch allocation mode which are up and not allocated. Assign a * 'NO_VAL' rank to all other nodes, which will translate as a very * high value, (unsigned)-2, to put those nodes last in the ranking. * The rest of the code must ensure that those nodes are never chosen. */ for (i = 0; i < node_cnt; i++) node_array[i].node_rank = NO_VAL; for (node = inv->f->node_head; node; node = node->next) { struct node_record *node_ptr; char tmp[50]; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) { error("nid%05u (%s node in state %s) not in slurm.conf", node->node_id, nam_noderole[node->role], nam_nodestate[node->state]); bad_node = 1; } else node_ptr->node_rank = inv->nodes_total - rank_count++; sprintf(tmp, "nid%05u", node->node_id); hostlist_push(hl, tmp); } free_inv(inv); if (bad_node) { hostlist_sort(hl); char *name = hostlist_ranged_string_xmalloc(hl); info("It appears your slurm.conf nodelist doesn't " "match the alps system. Here are the nodes alps knows " "about\n%s", name); } hostlist_destroy(hl); return SLURM_SUCCESS; }
/** * basil_get_initial_state - set SLURM initial node state from ALPS. * * The logic is identical to basil_inventory(), with the difference that this * is called before valid bitmaps exist, from select_g_node_init(). It relies * on the following other parts: * - it needs reset_job_bitmaps() in order to rebuild node_bitmap fields, * - it relies on _sync_nodes_to_jobs() to * o kill active jobs on nodes now marked DOWN, * o reset node state to ALLOCATED if it has been marked IDLE here (which is * an error case, since there is no longer an ALPS reservation for the job, * this is caught by the subsequent basil_inventory()). * Return: SLURM_SUCCESS if ok, non-zero on error. */ static int basil_get_initial_state(void) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; inv = get_full_inventory(version); if (inv == NULL) { error("BASIL %s INVENTORY failed", bv_names_long[version]); return SLURM_ERROR; } debug("BASIL %s INITIAL INVENTORY: %d/%d batch nodes available", bv_names_long[version], inv->batch_avail, inv->batch_total); for (node = inv->f->node_head; node; node = node->next) { struct node_record *node_ptr; char *reason = NULL; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) continue; if (node->state == BNS_DOWN) { reason = "ALPS marked it DOWN"; } else if (node->state == BNS_UNAVAIL) { reason = "node is UNAVAILABLE"; } else if (node->state == BNS_ROUTE) { reason = "node does ROUTING"; } else if (node->state == BNS_SUSPECT) { reason = "entered SUSPECT mode"; } else if (node->state == BNS_ADMINDOWN) { reason = "node is ADMINDOWN"; } else if (node->state != BNS_UP) { reason = "state not UP"; } else if (node->role != BNR_BATCH) { reason = "mode not BATCH"; } else if (node->arch != BNA_XT) { reason = "arch not XT/XE"; } /* Base state entirely derives from ALPS */ node_ptr->node_state &= NODE_STATE_FLAGS; if (reason) { if (node_ptr->reason) { debug3("Initial DOWN node %s - %s", node_ptr->name, 
node_ptr->reason); } else { debug("Initial DOWN node %s - %s", node_ptr->name, reason); node_ptr->reason = xstrdup(reason); } node_ptr->node_state |= NODE_STATE_DOWN; } else { if (node_is_allocated(node)) node_ptr->node_state |= NODE_STATE_ALLOCATED; else node_ptr->node_state |= NODE_STATE_IDLE; xfree(node_ptr->reason); } } free_inv(inv); return SLURM_SUCCESS; }
/** * basil_inventory - Periodic node-state query via ALPS XML-RPC. * This should be run immediately before each scheduling cycle. * Returns non-SLURM_SUCCESS if * - INVENTORY method failed (error) * - no nodes are available (no point in scheduling) * - orphaned ALPS reservation exists (wait until ALPS resynchronizes) */ extern int basil_inventory(void) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; struct basil_rsvn *rsvn; int slurm_alps_mismatch = 0; int rc = SLURM_SUCCESS; inv = get_full_inventory(version); if (inv == NULL) { error("BASIL %s INVENTORY failed", bv_names_long[version]); return SLURM_ERROR; } debug("BASIL %s INVENTORY: %d/%d batch nodes available", bv_names_long[version], inv->batch_avail, inv->batch_total); if (!inv->f->node_head || !inv->batch_avail || !inv->batch_total) rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; for (node = inv->f->node_head; node; node = node->next) { struct node_record *node_ptr; char *reason = NULL; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) { error("nid%05u (%s node in state %s) not in slurm.conf", node->node_id, nam_noderole[node->role], nam_nodestate[node->state]); continue; } if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) { /* * ALPS still hangs on to the node while SLURM considers * it already unallocated. Possible causes are partition * cleanup taking too long (can be 10sec ... minutes), * and orphaned ALPS reservations (caught below). * * The converse case (SLURM hanging on to the node while * ALPS has already freed it) happens frequently during * job completion: select_g_job_fini() is called before * make_node_comp(). Rely on SLURM logic for this case. 
*/ slurm_alps_mismatch++; } if (node->state == BNS_DOWN) { reason = "ALPS marked it DOWN"; } else if (node->state == BNS_UNAVAIL) { reason = "node is UNAVAILABLE"; } else if (node->state == BNS_ROUTE) { reason = "node does ROUTING"; } else if (node->state == BNS_SUSPECT) { reason = "entered SUSPECT mode"; } else if (node->state == BNS_ADMINDOWN) { reason = "node is ADMINDOWN"; } else if (node->state != BNS_UP) { reason = "state not UP"; } else if (node->role != BNR_BATCH) { reason = "mode not BATCH"; } else if (node->arch != BNA_XT) { reason = "arch not XT/XE"; } if (reason) { if (!IS_NODE_DOWN(node_ptr)) { xfree(node_ptr->reason); debug("MARKING %s DOWN (%s)", node_ptr->name, reason); /* set_node_down also kills any running jobs */ set_node_down(node_ptr->name, reason); } } else if (IS_NODE_DOWN(node_ptr)) { xfree(node_ptr->reason); /* Reset state, make_node_idle figures out the rest */ node_ptr->node_state &= NODE_STATE_FLAGS; node_ptr->node_state |= NODE_STATE_UNKNOWN; make_node_idle(node_ptr, NULL); } } if (slurm_alps_mismatch) debug("ALPS: %d node(s) still held", slurm_alps_mismatch); /* * Check that each ALPS reservation corresponds to a SLURM job. * Purge orphaned reservations, which may result from stale or * messed up system state, or are indicative of ALPS problems * (stuck in pending cancel calls). 
*/ for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) { ListIterator job_iter = list_iterator_create(job_list); struct job_record *job_ptr; uint32_t resv_id; if (job_iter == NULL) fatal("list_iterator_create: malloc failure"); while ((job_ptr = (struct job_record *)list_next(job_iter))) { if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) == SLURM_SUCCESS && resv_id == rsvn->rsvn_id) break; } list_iterator_destroy(job_iter); if (job_ptr == NULL) { error("orphaned ALPS reservation %u, trying to remove", rsvn->rsvn_id); basil_safe_release(rsvn->rsvn_id, inv); slurm_alps_mismatch = true; } } free_inv(inv); if (slurm_alps_mismatch) /* ALPS will take some time, do not schedule now. */ return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; return rc; }
extern int basil_node_ranking(struct node_record *node_array, int node_cnt) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; int rank_count = 0, i; hostlist_t hl = hostlist_create(NULL); bool bad_node = 0; node_rank_inv = 1; /* * When obtaining the initial configuration, we can not allow ALPS to * fail. If there is a problem at this stage it is better to restart * SLURM completely, after investigating (and/or fixing) the cause. */ inv = get_full_inventory(version); if (inv == NULL) fatal("failed to get BASIL %s ranking", bv_names_long[version]); else if (!inv->batch_total) fatal("system has no usable batch compute nodes"); else if (inv->batch_total < node_cnt) info("Warning: ALPS sees only %d/%d slurm.conf nodes, " "check DownNodes", inv->batch_total, node_cnt); debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes", bv_names_long[version], inv->batch_avail, inv->batch_total); /* * Node ranking is based on a subset of the inventory: only nodes in * batch allocation mode which are up and not allocated. Assign a * 'NO_VAL' rank to all other nodes, which will translate as a very * high value, (unsigned)-2, to put those nodes last in the ranking. * The rest of the code must ensure that those nodes are never chosen. */ for (i = 0; i < node_cnt; i++) node_array[i].node_rank = NO_VAL; for (node = inv->f->node_head; node; node = node->next) { struct node_record *node_ptr; char tmp[50]; /* This will ignore interactive nodes when iterating through * the apbasil inventory. If we don't do this, SLURM is * unable to resolve the ID to a nidXXX name since it's not in * the slurm.conf file. 
(Chris North) */ if (node->role == BNR_INTER) continue; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) { error("nid%05u (%s node in state %s) not in slurm.conf", node->node_id, nam_noderole[node->role], nam_nodestate[node->state]); bad_node = 1; } else if ((slurmctld_conf.fast_schedule != 2) && (node->cpu_count != node_ptr->config_ptr->cpus)) { fatal("slurm.conf: node %s has %u cpus " "but configured as CPUs=%u in your slurm.conf", node_ptr->name, node->cpu_count, node_ptr->config_ptr->cpus); } else if ((slurmctld_conf.fast_schedule != 2) && (node->mem_size != node_ptr->config_ptr->real_memory)) { fatal("slurm.conf: node %s has RealMemory=%u " "but configured as RealMemory=%u in your " "slurm.conf", node_ptr->name, node->mem_size, node_ptr->config_ptr->real_memory); } else { node_ptr->node_rank = inv->nodes_total - rank_count++; /* * Convention: since we are using SLURM in * frontend-mode, we use * NodeHostName as follows. * * NodeHostName: c#-#c#s#n# using the NID convention * <cabinet>-<row><chassis><slot><node> * - each cabinet can accommodate 3 chassis (c1..c3) * - each chassis has 8 slots (s0..s7) * - each slot contains 2 or 4 nodes (n0..n3) * o either 2 service nodes (n0/n3) * o or 4 compute nodes (n0..n3) * o or 2 gemini chips (g0/g1 serving n0..n3) * * Example: c0-0c1s0n1 * - c0- = cabinet 0 * - 0 = row 0 * - c1 = chassis 1 * - s0 = slot 0 * - n1 = node 1 */ xfree(node_ptr->node_hostname); node_ptr->node_hostname = xstrdup(node->name); } sprintf(tmp, "nid%05u", node->node_id); hostlist_push_host(hl, tmp); } free_inv(inv); if (bad_node) { hostlist_sort(hl); char *name = hostlist_ranged_string_xmalloc(hl); info("It appears your slurm.conf nodelist doesn't " "match the alps system. Here are the nodes alps knows " "about\n%s", name); } hostlist_destroy(hl); node_rank_inv = 0; return SLURM_SUCCESS; }
/**
 * basil_inventory - Periodic node-state query via ALPS XML-RPC.
 * This should be run immediately before each scheduling cycle.
 *
 * The query is rate limited: if less than inv_interval seconds have passed
 * since the last run, the function returns SLURM_SUCCESS without contacting
 * ALPS.
 *
 * Returns non-SLURM_SUCCESS if
 * - INVENTORY method failed (SLURM_ERROR)
 * - the inventory has no nodes or no batch nodes at all
 *   (ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE, no point in scheduling)
 * - SLURM and ALPS disagree (node still held by ALPS, or orphaned ALPS
 *   reservation) and the mismatch has been known for less than
 *   cray_conf->sync_timeout seconds
 *   (ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE, wait until ALPS resynchronizes)
 */
extern int basil_inventory(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	struct basil_rsvn *rsvn;
	int slurm_alps_mismatch = 0;
	int rc = SLURM_SUCCESS;
	int rel_rc;
	time_t now = time(NULL);
	/* State kept across calls: */
	/* time at which the current SLURM/ALPS mismatch was first seen */
	static time_t slurm_alps_mismatch_time = (time_t) 0;
	/* the "could not synchronize" error is logged once per mismatch */
	static bool logged_sync_timeout = false;
	/* time of the last inventory that was actually performed */
	static time_t last_inv_run = 0;

	/* Rate limit: skip the query if the last one is recent enough */
	if ((now - last_inv_run) < inv_interval)
		return SLURM_SUCCESS;

	last_inv_run = now;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/* Avoid checking for inv->batch_avail here since if we are
	   gang scheduling returning an error for a full system is
	   probably the wrong thing to do. (the schedule() function
	   in the slurmctld will never run ;)).
	*/
	if (!inv->f->node_head || !inv->batch_total)
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	for (node = inv->f->node_head; node; node = node->next) {
		int node_inx;
		struct node_record *node_ptr;
		char *reason = NULL;

		/* This will ignore interactive nodes when iterating through
		 * the apbasil inventory. If we don't do this, SLURM is
		 * unable to resolve the ID to a nidXXX name since it's not in
		 * the slurm.conf file. (Chris North)
		 */
		if (node->role == BNR_INTER)
			continue;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			continue;
		}
		/* Index of this record in the node table, used for bitmaps */
		node_inx = node_ptr - node_record_table_ptr;

		if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
			/*
			 * ALPS still hangs on to the node while SLURM considers
			 * it already unallocated. Possible causes are partition
			 * cleanup taking too long (can be 10sec ... minutes),
			 * and orphaned ALPS reservations (caught below).
			 *
			 * The converse case (SLURM hanging on to the node while
			 * ALPS has already freed it) happens frequently during
			 * job completion: select_g_job_fini() is called before
			 * make_node_comp(). Rely on SLURM logic for this case.
			 */
			slurm_alps_mismatch++;
		}

		/* A non-NULL reason means ALPS considers the node unusable */
		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		/* Base state entirely derives from ALPS */
		if (reason) {
			/* Remember when the node was first seen unusable */
			if (node_ptr->down_time == 0)
				node_ptr->down_time = now;
			if (IS_NODE_DOWN(node_ptr)) {
				/* node still down */
			} else if ((slurmctld_conf.slurmd_timeout == 0) ||
				   ((now - node_ptr->down_time) <
				    slurmctld_conf.slurmd_timeout)) {
				/*
				 * Grace period (never ending if slurmd_timeout
				 * is 0): only flag the node as not responding
				 * and take it out of the available bitmap.
				 */
				node_ptr->node_state |= NODE_STATE_NO_RESPOND;
				bit_clear(avail_node_bitmap, node_inx);
			} else {
				/* Grace period is over: set the node DOWN */
				xfree(node_ptr->reason);
				info("MARKING %s DOWN (%s)",
				     node_ptr->name, reason);
				/* set_node_down also kills any running jobs */
				set_node_down_ptr(node_ptr, reason);
			}
		} else if (IS_NODE_DOWN(node_ptr)) {
			/* ALPS reports the node usable, SLURM has it DOWN */
			xfree(node_ptr->reason);
			node_ptr->down_time = 0;
			info("MARKING %s UP", node_ptr->name);

			/* Reset state, make_node_idle figures out the rest */
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			node_ptr->node_state |= NODE_STATE_UNKNOWN;

			make_node_idle(node_ptr, NULL);
			/* Unless draining/failing, clear reason data and
			 * record the node as up in accounting */
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				xfree(node_ptr->reason);
				node_ptr->reason_time = 0;
				node_ptr->reason_uid = NO_VAL;
				clusteracct_storage_g_node_up(
					acct_db_conn, node_ptr, now);
			}
		} else if (IS_NODE_NO_RESPOND(node_ptr)) {
			/* Node recovered within the grace period: drop the
			 * flag and, unless draining/failing, make it
			 * available again */
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				bit_set(avail_node_bitmap, node_inx);
			}
		}
	}

	if (slurm_alps_mismatch)
		debug("ALPS: %d node(s) still held", slurm_alps_mismatch);

	/*
	 * Check that each ALPS reservation corresponds to a SLURM job.
	 * Purge orphaned reservations, which may result from stale or
	 * messed up system state, or are indicative of ALPS problems
	 * (stuck in pending cancel calls).
	 */
	for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
		ListIterator job_iter = list_iterator_create(job_list);
		struct job_record *job_ptr;
		uint32_t resv_id;

		/* job_ptr ends up NULL if no job owns this reservation */
		while ((job_ptr = (struct job_record *)list_next(job_iter))) {
			if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
						SELECT_JOBDATA_RESV_ID,
						&resv_id) == SLURM_SUCCESS
			    && resv_id == rsvn->rsvn_id)
				break;
		}
		list_iterator_destroy(job_iter);

		/*
		 * Changed to ignore reservations for "UNKNOWN" batch
		 * ids (e.g. the interactive region) (Chris North)
		 */
		if ((job_ptr == NULL) && (xstrcmp(rsvn->batch_id, "UNKNOWN"))) {
			error("orphaned ALPS reservation %u, trying to remove",
			      rsvn->rsvn_id);
			rel_rc = basil_safe_release(rsvn->rsvn_id, inv);
			if (rel_rc) {
				error("ALPS reservation %u removal FAILED: %s",
				      rsvn->rsvn_id, basil_strerror(rel_rc));
			} else {
				debug("ALPS reservation %u removed",
				      rsvn->rsvn_id);
			}
			/* Counts as a mismatch whether or not removal worked */
			slurm_alps_mismatch = true;
		}
	}
	free_inv(inv);

	if (slurm_alps_mismatch) {
		/* If SLURM and ALPS state are not in synchronization,
		 * do not schedule any more jobs until waiting at least
		 * SyncTimeout seconds. */
		if (slurm_alps_mismatch_time == 0) {
			/* First cycle with a mismatch: start the clock.
			 * Falls through to "return rc" below. */
			slurm_alps_mismatch_time = now;
		} else if (cray_conf->sync_timeout == 0) {
			/* Wait indefinitely */
			/* NOTE(review): this branch falls through to
			 * "return rc" and does not itself return an error;
			 * confirm this matches the intended meaning of a
			 * zero SyncTimeout. */
		} else if (difftime(now, slurm_alps_mismatch_time) <
			   cray_conf->sync_timeout) {
			return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
		} else if (!logged_sync_timeout) {
			error("Could not synchronize SLURM with ALPS for %u "
			      "seconds, proceeding with job scheduling",
			      cray_conf->sync_timeout);
			logged_sync_timeout = true;
		}
	} else {
		/* Back in sync: reset the mismatch tracking state */
		slurm_alps_mismatch_time = 0;
		logged_sync_timeout = false;
	}
	return rc;
}
extern int basil_node_ranking(struct node_record *node_array, int node_cnt) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; int rank_count = 0, i; hostlist_t hl = hostlist_create(NULL); bool bad_node = 0; /* * When obtaining the initial configuration, we can not allow ALPS to * fail. If there is a problem at this stage it is better to restart * SLURM completely, after investigating (and/or fixing) the cause. */ inv = get_full_inventory(version); if (inv == NULL) fatal("failed to get BASIL %s ranking", bv_names_long[version]); else if (!inv->batch_total) fatal("system has no usable batch compute nodes"); else if (inv->batch_total < node_cnt) info("Warning: ALPS sees only %d/%d slurm.conf nodes, " "check DownNodes", inv->batch_total, node_cnt); debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes", bv_names_long[version], inv->batch_avail, inv->batch_total); /* * Node ranking is based on a subset of the inventory: only nodes in * batch allocation mode which are up and not allocated. Assign a * 'NO_VAL' rank to all other nodes, which will translate as a very * high value, (unsigned)-2, to put those nodes last in the ranking. * The rest of the code must ensure that those nodes are never chosen. */ for (i = 0; i < node_cnt; i++) node_array[i].node_rank = NO_VAL; for (node = inv->f->node_head; node; node = node->next) { struct node_record *node_ptr; char tmp[50]; /* This will ignore interactive nodes when iterating through * the apbasil inventory. If we don't do this, SLURM is * unable to resolve the ID to a nidXXX name since it's not in * the slurm.conf file. 
(Chris North) */ if (node->role == BNR_INTER) continue; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) { error("nid%05u (%s node in state %s) not in slurm.conf", node->node_id, nam_noderole[node->role], nam_nodestate[node->state]); bad_node = 1; } else node_ptr->node_rank = inv->nodes_total - rank_count++; sprintf(tmp, "nid%05u", node->node_id); hostlist_push(hl, tmp); } free_inv(inv); if (bad_node) { hostlist_sort(hl); char *name = hostlist_ranged_string_xmalloc(hl); info("It appears your slurm.conf nodelist doesn't " "match the alps system. Here are the nodes alps knows " "about\n%s", name); } hostlist_destroy(hl); return SLURM_SUCCESS; }