/** * do_basil_confirm - confirm an existing BASIL reservation. * This requires the alloc_sid to equal the session ID (getsid()) of the process * executing the aprun/mpirun commands * Returns: SLURM_SUCCESS if ok, READY_JOB_ERROR/FATAL on transient/fatal error. */ extern int do_basil_confirm(struct job_record *job_ptr) { uint32_t resv_id; uint64_t pagg_id; if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { error("can not read resId for JobId=%u", job_ptr->job_id); } else if (resv_id == 0) { /* On Cray XT/XE, a reservation ID of 0 is always invalid. */ error("JobId=%u has invalid (ZERO) resId", job_ptr->job_id); } else if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_PAGG_ID, &pagg_id) != SLURM_SUCCESS) { error("can not read pagg ID for JobId=%u", job_ptr->job_id); } else { int rc; if (pagg_id == 0) { #ifdef HAVE_REAL_CRAY /* This fallback case is for interactive jobs only */ error("JobId %u has no pagg ID, falling back to SID", job_ptr->job_id); #endif pagg_id = job_ptr->alloc_sid; } rc = basil_confirm(resv_id, job_ptr->job_id, pagg_id); if (rc == 0) { debug2("confirmed ALPS resId %u for JobId %u, pagg " "%"PRIu64"", resv_id, job_ptr->job_id, pagg_id); return SLURM_SUCCESS; } else if (rc == -BE_NO_RESID) { /* * If ALPS can not find the reservation ID we are trying * to confirm, it may be that the job has already been * canceled, or that the reservation has timed out after * waiting for the confirmation. * It is more likely that this error occurs on a per-job * basis, hence in this case do not drain frontend node. */ error("JobId %u has invalid ALPS resId %u - job " "already canceled?", job_ptr->job_id, resv_id); return SLURM_SUCCESS; } else { error("confirming ALPS resId %u of JobId %u FAILED: %s", resv_id, job_ptr->job_id, basil_strerror(rc)); if (is_transient_error(rc)) return READY_JOB_ERROR; } } return READY_JOB_FATAL; }
/** * do_basil_confirm - confirm an existing BASIL reservation. * This requires the alloc_sid to equal the session ID (getsid()) of the process * executing the aprun/mpirun commands * Returns: SLURM_SUCCESS if ok, READY_JOB_ERROR/FATAL on transient/fatal error. */ extern int do_basil_confirm(struct job_record *job_ptr) { uint32_t resv_id; uint64_t pagg_id; if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { error("can not read resId for JobId=%u", job_ptr->job_id); } else if (resv_id == 0) { /* On Cray XT/XE, a reservation ID of 0 is always invalid. */ error("JobId=%u has invalid (ZERO) resId", job_ptr->job_id); } else if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_PAGG_ID, &pagg_id) != SLURM_SUCCESS) { error("can not read pagg ID for JobId=%u", job_ptr->job_id); } else { int rc; if (pagg_id == 0) { /* This fallback case is for interactive jobs only */ error("JobId %u has no pagg ID, falling back to SID", job_ptr->job_id); pagg_id = job_ptr->alloc_sid; } rc = basil_confirm(resv_id, job_ptr->job_id, pagg_id); if (rc == 0) { debug2("confirmed ALPS resId %u for JobId %u, " "pagg %"PRIu64"", resv_id, job_ptr->job_id, pagg_id); return SLURM_SUCCESS; } else { error("confirming ALPS resId %u of JobId %u FAILED: %s", resv_id, job_ptr->job_id, basil_strerror(rc)); if (is_transient_error(rc)) return READY_JOB_ERROR; } } return READY_JOB_FATAL; }
/** * do_basil_switch - suspend/resume BASIL reservation * IN job_ptr - pointer to job which has just been deallocated resources * IN suspend - to suspend or not to suspend * RET see below */ extern int do_basil_switch(struct job_record *job_ptr, bool suspend) { uint32_t resv_id; if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { error("can not read resId for JobId=%u", job_ptr->job_id); } else if (resv_id && basil_switch(resv_id, suspend) == 0) { /* The resv_id is non-zero only if the job is or was running. */ debug("%s ALPS resId %u for JobId %u", suspend ? "Suspended" : "Resumed", resv_id, job_ptr->job_id); } return SLURM_SUCCESS; }
/** * do_basil_signal - pass job signal on to any APIDs * IN job_ptr - job to be signalled * IN signal - signal(7) number * Only signal job if an ALPS reservation exists (non-0 reservation ID). */ extern int do_basil_signal(struct job_record *job_ptr, int signal) { uint32_t resv_id; if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { error("can not read resId for JobId=%u", job_ptr->job_id); } else if (resv_id != 0) { int rc = basil_signal_apids(resv_id, signal, NULL); if (rc) error("could not signal APIDs of resId %u: %s", resv_id, basil_strerror(rc)); } return SLURM_SUCCESS; }
/** * queue_basil_signal - queue job signal on to any APIDs * IN job_ptr - job to be signalled * IN signal - signal(7) number * IN delay - how long to delay the signal, in seconds * Only signal job if an ALPS reservation exists (non-0 reservation ID). */ extern void queue_basil_signal(struct job_record *job_ptr, int signal, uint16_t delay) { args_sig_basil_t *args_sig_basil; pthread_attr_t attr_sig_basil; pthread_t thread_sig_basil; uint32_t resv_id; if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { error("can not read resId for JobId=%u", job_ptr->job_id); return; } if (resv_id == 0) return; if ((delay == 0) || (delay == (uint16_t) NO_VAL)) { /* Send the signal now */ int rc = basil_signal_apids(resv_id, signal, NULL); if (rc) error("could not signal APIDs of resId %u: %s", resv_id, basil_strerror(rc)); return; } /* Create a thread to send the signal later */ slurm_attr_init(&attr_sig_basil); if (pthread_attr_setdetachstate(&attr_sig_basil, PTHREAD_CREATE_DETACHED)) { error("pthread_attr_setdetachstate error %m"); slurm_attr_destroy(&attr_sig_basil); return; } args_sig_basil = xmalloc(sizeof(args_sig_basil_t)); args_sig_basil->resv_id = resv_id; args_sig_basil->signal = signal; args_sig_basil->delay = delay; if (pthread_create(&thread_sig_basil, &attr_sig_basil, _sig_basil, (void *) args_sig_basil)) { error("pthread_create error %m"); slurm_attr_destroy(&attr_sig_basil); xfree(args_sig_basil); return; } slurm_attr_destroy(&attr_sig_basil); }
/** * do_basil_release - release an (unconfirmed) BASIL reservation * IN job_ptr - pointer to job which has just been deallocated resources * RET see below */ extern int do_basil_release(struct job_record *job_ptr) { uint32_t resv_id; if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { error("can not read resId for JobId=%u", job_ptr->job_id); } else if (resv_id && basil_release(resv_id) == 0) { /* The resv_id is non-zero only if the job is or was running. */ debug("released ALPS resId %u for JobId %u", resv_id, job_ptr->job_id); } /* * Error handling: we only print out the errors (basil_release does this * internally), but do not signal error to select_g_job_fini(). Calling * contexts of this function (deallocate_nodes, batch_finish) only print * additional error text: no further action is taken at this stage. */ return SLURM_SUCCESS; }
/** * basil_inventory - Periodic node-state query via ALPS XML-RPC. * This should be run immediately before each scheduling cycle. * Returns non-SLURM_SUCCESS if * - INVENTORY method failed (error) * - no nodes are available (no point in scheduling) * - orphaned ALPS reservation exists (wait until ALPS resynchronizes) */ extern int basil_inventory(void) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; struct basil_rsvn *rsvn; int slurm_alps_mismatch = 0; int rc = SLURM_SUCCESS; inv = get_full_inventory(version); if (inv == NULL) { error("BASIL %s INVENTORY failed", bv_names_long[version]); return SLURM_ERROR; } debug("BASIL %s INVENTORY: %d/%d batch nodes available", bv_names_long[version], inv->batch_avail, inv->batch_total); if (!inv->f->node_head || !inv->batch_avail || !inv->batch_total) rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; for (node = inv->f->node_head; node; node = node->next) { struct node_record *node_ptr; char *reason = NULL; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) { error("nid%05u (%s node in state %s) not in slurm.conf", node->node_id, nam_noderole[node->role], nam_nodestate[node->state]); continue; } if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) { /* * ALPS still hangs on to the node while SLURM considers * it already unallocated. Possible causes are partition * cleanup taking too long (can be 10sec ... minutes), * and orphaned ALPS reservations (caught below). * * The converse case (SLURM hanging on to the node while * ALPS has already freed it) happens frequently during * job completion: select_g_job_fini() is called before * make_node_comp(). Rely on SLURM logic for this case. */ slurm_alps_mismatch++; } if (node->state == BNS_DOWN) { reason = "ALPS marked it DOWN"; } else if (node->state == BNS_UNAVAIL) { reason = "node is UNAVAILABLE"; } else if (node->state == BNS_ROUTE) { reason = "node does ROUTING"; } else if (node->state == BNS_SUSPECT) { reason = "entered SUSPECT mode"; } else if (node->state == BNS_ADMINDOWN) { reason = "node is ADMINDOWN"; } else if (node->state != BNS_UP) { reason = "state not UP"; } else if (node->role != BNR_BATCH) { reason = "mode not BATCH"; } else if (node->arch != BNA_XT) { reason = "arch not XT/XE"; } if (reason) { if (!IS_NODE_DOWN(node_ptr)) { xfree(node_ptr->reason); debug("MARKING %s DOWN (%s)", node_ptr->name, reason); /* set_node_down also kills any running jobs */ set_node_down(node_ptr->name, reason); } } else if (IS_NODE_DOWN(node_ptr)) { xfree(node_ptr->reason); /* Reset state, make_node_idle figures out the rest */ node_ptr->node_state &= NODE_STATE_FLAGS; node_ptr->node_state |= NODE_STATE_UNKNOWN; make_node_idle(node_ptr, NULL); } } if (slurm_alps_mismatch) debug("ALPS: %d node(s) still held", slurm_alps_mismatch); /* * Check that each ALPS reservation corresponds to a SLURM job. * Purge orphaned reservations, which may result from stale or * messed up system state, or are indicative of ALPS problems * (stuck in pending cancel calls). */ for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) { ListIterator job_iter = list_iterator_create(job_list); struct job_record *job_ptr; uint32_t resv_id; if (job_iter == NULL) fatal("list_iterator_create: malloc failure"); while ((job_ptr = (struct job_record *)list_next(job_iter))) { if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) == SLURM_SUCCESS && resv_id == rsvn->rsvn_id) break; } list_iterator_destroy(job_iter); if (job_ptr == NULL) { error("orphaned ALPS reservation %u, trying to remove", rsvn->rsvn_id); basil_safe_release(rsvn->rsvn_id, inv); slurm_alps_mismatch = true; } } free_inv(inv); if (slurm_alps_mismatch) /* ALPS will take some time, do not schedule now. */ return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; return rc; }
/** * basil_inventory - Periodic node-state query via ALPS XML-RPC. * This should be run immediately before each scheduling cycle. * Returns non-SLURM_SUCCESS if * - INVENTORY method failed (error) * - no nodes are available (no point in scheduling) * - orphaned ALPS reservation exists (wait until ALPS resynchronizes) */ extern int basil_inventory(void) { enum basil_version version = get_basil_version(); struct basil_inventory *inv; struct basil_node *node; struct basil_rsvn *rsvn; int slurm_alps_mismatch = 0; int rc = SLURM_SUCCESS; int rel_rc; time_t now = time(NULL); static time_t slurm_alps_mismatch_time = (time_t) 0; static bool logged_sync_timeout = false; static time_t last_inv_run = 0; if ((now - last_inv_run) < inv_interval) return SLURM_SUCCESS; last_inv_run = now; inv = get_full_inventory(version); if (inv == NULL) { error("BASIL %s INVENTORY failed", bv_names_long[version]); return SLURM_ERROR; } debug("BASIL %s INVENTORY: %d/%d batch nodes available", bv_names_long[version], inv->batch_avail, inv->batch_total); /* Avoid checking for inv->batch_avail here since if we are gang scheduling returning an error for a full system is probably the wrong thing to do. (the schedule() function in the slurmctld will never run ;)). */ if (!inv->f->node_head || !inv->batch_total) rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; for (node = inv->f->node_head; node; node = node->next) { int node_inx; struct node_record *node_ptr; char *reason = NULL; /* This will ignore interactive nodes when iterating through * the apbasil inventory. If we don't do this, SLURM is * unable to resolve the ID to a nidXXX name since it's not in * the slurm.conf file. (Chris North) */ if (node->role == BNR_INTER) continue; node_ptr = _find_node_by_basil_id(node->node_id); if (node_ptr == NULL) { error("nid%05u (%s node in state %s) not in slurm.conf", node->node_id, nam_noderole[node->role], nam_nodestate[node->state]); continue; } node_inx = node_ptr - node_record_table_ptr; if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) { /* * ALPS still hangs on to the node while SLURM considers * it already unallocated. Possible causes are partition * cleanup taking too long (can be 10sec ... minutes), * and orphaned ALPS reservations (caught below). * * The converse case (SLURM hanging on to the node while * ALPS has already freed it) happens frequently during * job completion: select_g_job_fini() is called before * make_node_comp(). Rely on SLURM logic for this case. */ slurm_alps_mismatch++; } if (node->state == BNS_DOWN) { reason = "ALPS marked it DOWN"; } else if (node->state == BNS_UNAVAIL) { reason = "node is UNAVAILABLE"; } else if (node->state == BNS_ROUTE) { reason = "node does ROUTING"; } else if (node->state == BNS_SUSPECT) { reason = "entered SUSPECT mode"; } else if (node->state == BNS_ADMINDOWN) { reason = "node is ADMINDOWN"; } else if (node->state != BNS_UP) { reason = "state not UP"; } else if (node->role != BNR_BATCH) { reason = "mode not BATCH"; } else if (node->arch != BNA_XT) { reason = "arch not XT/XE"; } /* Base state entirely derives from ALPS */ if (reason) { if (node_ptr->down_time == 0) node_ptr->down_time = now; if (IS_NODE_DOWN(node_ptr)) { /* node still down */ } else if ((slurmctld_conf.slurmd_timeout == 0) || ((now - node_ptr->down_time) < slurmctld_conf.slurmd_timeout)) { node_ptr->node_state |= NODE_STATE_NO_RESPOND; bit_clear(avail_node_bitmap, node_inx); } else { xfree(node_ptr->reason); info("MARKING %s DOWN (%s)", node_ptr->name, reason); /* set_node_down also kills any running jobs */ set_node_down_ptr(node_ptr, reason); } } else if (IS_NODE_DOWN(node_ptr)) { xfree(node_ptr->reason); node_ptr->down_time = 0; info("MARKING %s UP", node_ptr->name); /* Reset state, make_node_idle figures out the rest */ node_ptr->node_state &= NODE_STATE_FLAGS; node_ptr->node_state &= (~NODE_STATE_NO_RESPOND); node_ptr->node_state |= NODE_STATE_UNKNOWN; make_node_idle(node_ptr, NULL); if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) { xfree(node_ptr->reason); node_ptr->reason_time = 0; node_ptr->reason_uid = NO_VAL; clusteracct_storage_g_node_up( acct_db_conn, node_ptr, now); } } else if (IS_NODE_NO_RESPOND(node_ptr)) { node_ptr->node_state &= (~NODE_STATE_NO_RESPOND); if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) { bit_set(avail_node_bitmap, node_inx); } } } if (slurm_alps_mismatch) debug("ALPS: %d node(s) still held", slurm_alps_mismatch); /* * Check that each ALPS reservation corresponds to a SLURM job. * Purge orphaned reservations, which may result from stale or * messed up system state, or are indicative of ALPS problems * (stuck in pending cancel calls). */ for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) { ListIterator job_iter = list_iterator_create(job_list); struct job_record *job_ptr; uint32_t resv_id; while ((job_ptr = (struct job_record *)list_next(job_iter))) { if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) == SLURM_SUCCESS && resv_id == rsvn->rsvn_id) break; } list_iterator_destroy(job_iter); /* * Changed to ignore reservations for "UNKNOWN" batch * ids (e.g. the interactive region) (Chris North) */ if ((job_ptr == NULL) && (xstrcmp(rsvn->batch_id, "UNKNOWN"))) { error("orphaned ALPS reservation %u, trying to remove", rsvn->rsvn_id); rel_rc = basil_safe_release(rsvn->rsvn_id, inv); if (rel_rc) { error("ALPS reservation %u removal FAILED: %s", rsvn->rsvn_id, basil_strerror(rel_rc)); } else { debug("ALPS reservation %u removed", rsvn->rsvn_id); } slurm_alps_mismatch = true; } } free_inv(inv); if (slurm_alps_mismatch) { /* If SLURM and ALPS state are not in synchronization, * do not schedule any more jobs until waiting at least * SyncTimeout seconds. */ if (slurm_alps_mismatch_time == 0) { slurm_alps_mismatch_time = now; } else if (cray_conf->sync_timeout == 0) { /* Wait indefinitely */ } else if (difftime(now, slurm_alps_mismatch_time) < cray_conf->sync_timeout) { return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE; } else if (!logged_sync_timeout) { error("Could not synchronize SLURM with ALPS for %u " "seconds, proceeding with job scheduling", cray_conf->sync_timeout); logged_sync_timeout = true; } } else { slurm_alps_mismatch_time = 0; logged_sync_timeout = false; } return rc; }