/**
 * basil_inventory - Periodic node-state query via ALPS XML-RPC.
 * This should be run immediately before each scheduling cycle.
 * Returns non-SLURM_SUCCESS if
 * - the INVENTORY method failed (error)
 * - no nodes are available (no point in scheduling)
 * - an orphaned ALPS reservation exists (wait until ALPS resynchronizes)
 * (An illustrative caller sketch follows this function.)
 */
extern int basil_inventory(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	struct basil_rsvn *rsvn;
	int slurm_alps_mismatch = 0;
	int rc = SLURM_SUCCESS;
	int rel_rc;
	time_t now = time(NULL);
	static time_t slurm_alps_mismatch_time = (time_t) 0;
	static bool logged_sync_timeout = false;
	static time_t last_inv_run = 0;

	if ((now - last_inv_run) < inv_interval)
		return SLURM_SUCCESS;

	last_inv_run = now;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/* Do not check inv->batch_avail here: when gang scheduling,
	 * returning an error for a fully allocated system would be the
	 * wrong thing to do (the schedule() function in the slurmctld
	 * would never run). */
	if (!inv->f->node_head || !inv->batch_total)
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	for (node = inv->f->node_head; node; node = node->next) {
		int node_inx;
		struct node_record *node_ptr;
		char *reason = NULL;

		/* Ignore interactive nodes when iterating through the
		 * apbasil inventory. Otherwise SLURM cannot resolve the
		 * node ID to a nidXXX name, since interactive nodes are
		 * not listed in the slurm.conf file. (Chris North) */
		if (node->role == BNR_INTER)
			continue;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			continue;
		}
		node_inx = node_ptr - node_record_table_ptr;

		if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
			/*
			 * ALPS still hangs on to the node while SLURM considers
			 * it already unallocated. Possible causes are partition
			 * cleanup taking too long (can be 10sec ... minutes),
			 * and orphaned ALPS reservations (caught below).
			 *
			 * The converse case (SLURM hanging on to the node while
			 * ALPS has already freed it) happens frequently during
			 * job completion: select_g_job_fini() is called before
			 * make_node_comp(). Rely on SLURM logic for this case.
			 */
			slurm_alps_mismatch++;
		}

		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		/* Base state entirely derives from ALPS */
		if (reason) {
			if (node_ptr->down_time == 0)
				node_ptr->down_time = now;

			if (IS_NODE_DOWN(node_ptr)) {
				/* node still down */
			} else if ((slurmctld_conf.slurmd_timeout == 0) ||
				   ((now - node_ptr->down_time) <
				    slurmctld_conf.slurmd_timeout)) {
				node_ptr->node_state |= NODE_STATE_NO_RESPOND;
				bit_clear(avail_node_bitmap, node_inx);
			} else {
				xfree(node_ptr->reason);
				info("MARKING %s DOWN (%s)",
				     node_ptr->name, reason);
				/* set_node_down also kills any running jobs */
				set_node_down_ptr(node_ptr, reason);
			}
		} else if (IS_NODE_DOWN(node_ptr)) {
			xfree(node_ptr->reason);
			node_ptr->down_time = 0;
			info("MARKING %s UP", node_ptr->name);

			/* Reset state, make_node_idle figures out the rest */
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			node_ptr->node_state |= NODE_STATE_UNKNOWN;
			make_node_idle(node_ptr, NULL);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				xfree(node_ptr->reason);
				node_ptr->reason_time = 0;
				node_ptr->reason_uid = NO_VAL;
				clusteracct_storage_g_node_up(
					acct_db_conn, node_ptr, now);
			}
		} else if (IS_NODE_NO_RESPOND(node_ptr)) {
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				bit_set(avail_node_bitmap, node_inx);
			}
		}
	}

	if (slurm_alps_mismatch)
		debug("ALPS: %d node(s) still held", slurm_alps_mismatch);

	/*
	 * Check that each ALPS reservation corresponds to a SLURM job.
	 * Purge orphaned reservations, which may result from stale or
	 * inconsistent system state, or are indicative of ALPS problems
	 * (stuck in pending cancel calls).
	 */
	for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
		ListIterator job_iter = list_iterator_create(job_list);
		struct job_record *job_ptr;
		uint32_t resv_id;

		while ((job_ptr = (struct job_record *)list_next(job_iter))) {
			if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
						SELECT_JOBDATA_RESV_ID,
						&resv_id) == SLURM_SUCCESS
			    && resv_id == rsvn->rsvn_id)
				break;
		}
		list_iterator_destroy(job_iter);

		/*
		 * Ignore reservations for "UNKNOWN" batch ids
		 * (e.g. the interactive region). (Chris North)
		 */
		if ((job_ptr == NULL) && (xstrcmp(rsvn->batch_id, "UNKNOWN"))) {
			error("orphaned ALPS reservation %u, trying to remove",
			      rsvn->rsvn_id);
			rel_rc = basil_safe_release(rsvn->rsvn_id, inv);
			if (rel_rc) {
				error("ALPS reservation %u removal FAILED: %s",
				      rsvn->rsvn_id, basil_strerror(rel_rc));
			} else {
				debug("ALPS reservation %u removed",
				      rsvn->rsvn_id);
			}
			slurm_alps_mismatch = true;
		}
	}
	free_inv(inv);

	if (slurm_alps_mismatch) {
		/* If SLURM and ALPS state are not in synchronization,
		 * do not schedule any more jobs until at least
		 * SyncTimeout seconds have passed. */
		if (slurm_alps_mismatch_time == 0) {
			slurm_alps_mismatch_time = now;
		} else if (cray_conf->sync_timeout == 0) {
			/* Wait indefinitely */
		} else if (difftime(now, slurm_alps_mismatch_time) <
			   cray_conf->sync_timeout) {
			return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
		} else if (!logged_sync_timeout) {
			error("Could not synchronize SLURM with ALPS for %u "
			      "seconds, proceeding with job scheduling",
			      cray_conf->sync_timeout);
			logged_sync_timeout = true;
		}
	} else {
		slurm_alps_mismatch_time = 0;
		logged_sync_timeout = false;
	}
	return rc;
}
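
/*
 * Illustrative sketch only, not part of this plugin: one way a caller
 * could gate a scheduling cycle on basil_inventory(), following the
 * return-code contract documented above. The function name
 * run_scheduling_cycle() is a hypothetical placeholder; schedule() is
 * the slurmctld scheduler referenced in the comments above, shown here
 * with an assumed zero job-limit argument. Kept out of the build.
 */
#if 0
static void run_scheduling_cycle(void)
{
	/* Query ALPS immediately before attempting to schedule */
	int rc = basil_inventory();

	if (rc == SLURM_ERROR)
		return;		/* INVENTORY call failed; retry next cycle */
	if (rc == ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE)
		return;		/* no batch nodes, or SLURM/ALPS still out
				 * of sync (held for up to SyncTimeout) */

	schedule(0);		/* proceed with the actual scheduling pass */
}
#endif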
/*
 * ping_nodes - check that all nodes and daemons are alive,
 *	get nodes in UNKNOWN state to register
 */
void ping_nodes (void)
{
	static bool restart_flag = true;	/* system just restarted */
	static int offset = 0;	/* mutex via node table write lock on entry */
	static int max_reg_threads = 0;	/* max node registration threads;
					 * this can include DOWN nodes, so
					 * limit the number to avoid huge
					 * communication delays */
	int i;
	time_t now, still_live_time, node_dead_time;
	static time_t last_ping_time = (time_t) 0;
	hostlist_t down_hostlist = NULL;
	char *host_str = NULL;
	agent_arg_t *ping_agent_args = NULL;
	agent_arg_t *reg_agent_args = NULL;
#ifdef HAVE_FRONT_END
	front_end_record_t *front_end_ptr = NULL;
#else
	struct node_record *node_ptr = NULL;
#endif

	now = time (NULL);

	ping_agent_args = xmalloc (sizeof (agent_arg_t));
	ping_agent_args->msg_type = REQUEST_PING;
	ping_agent_args->retry = 0;
	ping_agent_args->hostlist = hostlist_create("");

	reg_agent_args = xmalloc (sizeof (agent_arg_t));
	reg_agent_args->msg_type = REQUEST_NODE_REGISTRATION_STATUS;
	reg_agent_args->retry = 0;
	reg_agent_args->hostlist = hostlist_create("");

	/*
	 * If there are a large number of down nodes, the node ping
	 * can take a long time to complete:
	 *  ping_time = down_nodes * agent_timeout / agent_parallelism
	 *  ping_time = down_nodes * 10_seconds / 10
	 *  ping_time = down_nodes (seconds)
	 * Because of this, we extend the SlurmdTimeout by the
	 * time needed to complete a ping of all nodes.
	 */
	if ((slurmctld_conf.slurmd_timeout == 0) ||
	    (last_ping_time == (time_t) 0)) {
		node_dead_time = (time_t) 0;
	} else {
		node_dead_time = last_ping_time -
				 slurmctld_conf.slurmd_timeout;
	}
	still_live_time = now - (slurmctld_conf.slurmd_timeout / 3);
	last_ping_time  = now;

	if (max_reg_threads == 0) {
		max_reg_threads = MAX(slurm_get_tree_width(), 1);
	}
	offset += max_reg_threads;
	if ((offset > node_record_count) &&
	    (offset >= (max_reg_threads * MAX_REG_FREQUENCY)))
		offset = 0;

#ifdef HAVE_FRONT_END
	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		if ((slurmctld_conf.slurmd_timeout == 0)	&&
		    (!restart_flag)				&&
		    (!IS_NODE_UNKNOWN(front_end_ptr))		&&
		    (!IS_NODE_NO_RESPOND(front_end_ptr)))
			continue;

		if ((front_end_ptr->last_response != (time_t) 0)     &&
		    (front_end_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(front_end_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
					front_end_ptr->name);
			else {
				down_hostlist =
					hostlist_create(front_end_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_front_end_down(front_end_ptr, "Not responding");
			front_end_ptr->not_responding = false;
			continue;
		}

		if (restart_flag) {
			front_end_ptr->last_response =
				slurmctld_conf.last_update;
		}

		/* Request a node registration if its state is UNKNOWN, or
		 * on a periodic basis (about every MAX_REG_FREQUENCY pings).
		 * This mechanism avoids an additional per-node timer or
		 * counter and refreshes configuration information once in a
		 * while. We limit these requests since they can generate a
		 * flood of incoming RPCs. (An illustrative sketch of this
		 * sliding window follows the function.) */
		if (IS_NODE_UNKNOWN(front_end_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      front_end_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		if ((!IS_NODE_NO_RESPOND(front_end_ptr)) &&
		    (front_end_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(front_end_ptr) &&
		    IS_NODE_DOWN(front_end_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist, front_end_ptr->name);
		ping_agent_args->node_count++;
	}
#else
	for (i = 0, node_ptr = node_record_table_ptr;
	     i < node_record_count; i++, node_ptr++) {
		if (IS_NODE_FUTURE(node_ptr) || IS_NODE_POWER_SAVE(node_ptr))
			continue;
		if ((slurmctld_conf.slurmd_timeout == 0)	&&
		    (!restart_flag)				&&
		    (!IS_NODE_UNKNOWN(node_ptr))		&&
		    (!IS_NODE_NO_RESPOND(node_ptr)))
			continue;

		if ((node_ptr->last_response != (time_t) 0)     &&
		    (node_ptr->last_response <= node_dead_time) &&
		    (!IS_NODE_DOWN(node_ptr))) {
			if (down_hostlist)
				(void) hostlist_push_host(down_hostlist,
					node_ptr->name);
			else {
				down_hostlist =
					hostlist_create(node_ptr->name);
				if (down_hostlist == NULL)
					fatal("hostlist_create: malloc error");
			}
			set_node_down_ptr(node_ptr, "Not responding");
			node_ptr->not_responding = false;  /* logged below */
			continue;
		}

		if (restart_flag)
			node_ptr->last_response = slurmctld_conf.last_update;

		/* Request a node registration if its state is UNKNOWN, or
		 * on a periodic basis (about every MAX_REG_FREQUENCY pings).
		 * This mechanism avoids an additional per-node timer or
		 * counter and refreshes configuration information once in a
		 * while. We limit these requests since they can generate a
		 * flood of incoming RPCs. */
		if (IS_NODE_UNKNOWN(node_ptr) || restart_flag ||
		    ((i >= offset) && (i < (offset + max_reg_threads)))) {
			hostlist_push(reg_agent_args->hostlist,
				      node_ptr->name);
			reg_agent_args->node_count++;
			continue;
		}

		if ((!IS_NODE_NO_RESPOND(node_ptr)) &&
		    (node_ptr->last_response >= still_live_time))
			continue;

		/* Do not keep pinging down nodes since this can induce
		 * huge delays in hierarchical communication fail-over */
		if (IS_NODE_NO_RESPOND(node_ptr) && IS_NODE_DOWN(node_ptr))
			continue;

		hostlist_push(ping_agent_args->hostlist, node_ptr->name);
		ping_agent_args->node_count++;
	}
#endif

	restart_flag = false;
	if (ping_agent_args->node_count == 0) {
		hostlist_destroy(ping_agent_args->hostlist);
		xfree (ping_agent_args);
	} else {
		hostlist_uniq(ping_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				ping_agent_args->hostlist);
		debug("Spawning ping agent for %s", host_str);
		xfree(host_str);
		ping_begin();
		agent_queue_request(ping_agent_args);
	}

	if (reg_agent_args->node_count == 0) {
		hostlist_destroy(reg_agent_args->hostlist);
		xfree (reg_agent_args);
	} else {
		hostlist_uniq(reg_agent_args->hostlist);
		host_str = hostlist_ranged_string_xmalloc(
				reg_agent_args->hostlist);
		debug("Spawning registration agent for %s %d hosts",
		      host_str, reg_agent_args->node_count);
		xfree(host_str);
		ping_begin();
		agent_queue_request(reg_agent_args);
	}

	if (down_hostlist) {
		hostlist_uniq(down_hostlist);
		host_str = hostlist_ranged_string_xmalloc(down_hostlist);
		error("Nodes %s not responding, setting DOWN", host_str);
		xfree(host_str);
		hostlist_destroy(down_hostlist);
	}
}
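
/*
 * Illustrative sketch only, not part of slurmctld: the sliding
 * registration window used in ping_nodes() above, isolated from the node
 * loop. Each sweep advances the window by max_reg_threads nodes, so every
 * node receives a registration request roughly once every
 * MAX_REG_FREQUENCY pings without a per-node timer or counter. The helper
 * name in_registration_window() and the idea of advancing the window when
 * node_inx == 0 are hypothetical; the wrap-around arithmetic mirrors the
 * offset handling above. Kept out of the build.
 */
#if 0
static bool in_registration_window(int node_inx, int node_cnt)
{
	static int offset = 0;			/* persists across sweeps */
	int max_reg_threads = MAX(slurm_get_tree_width(), 1);

	if (node_inx == 0) {			/* advance once per sweep */
		offset += max_reg_threads;
		if ((offset > node_cnt) &&
		    (offset >= (max_reg_threads * MAX_REG_FREQUENCY)))
			offset = 0;
	}
	/* True for the max_reg_threads nodes covered by this sweep's window */
	return (node_inx >= offset) &&
	       (node_inx < (offset + max_reg_threads));
}
#endif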