/** * do_basil_confirm - confirm an existing BASIL reservation. * This requires the alloc_sid to equal the session ID (getsid()) of the process * executing the aprun/mpirun commands * Returns: SLURM_SUCCESS if ok, READY_JOB_ERROR/FATAL on transient/fatal error. */ extern int do_basil_confirm(struct job_record *job_ptr) { uint32_t resv_id; uint64_t pagg_id; if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { error("can not read resId for JobId=%u", job_ptr->job_id); } else if (resv_id == 0) { /* On Cray XT/XE, a reservation ID of 0 is always invalid. */ error("JobId=%u has invalid (ZERO) resId", job_ptr->job_id); } else if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_PAGG_ID, &pagg_id) != SLURM_SUCCESS) { error("can not read pagg ID for JobId=%u", job_ptr->job_id); } else { int rc; if (pagg_id == 0) { #ifdef HAVE_REAL_CRAY /* This fallback case is for interactive jobs only */ error("JobId %u has no pagg ID, falling back to SID", job_ptr->job_id); #endif pagg_id = job_ptr->alloc_sid; } rc = basil_confirm(resv_id, job_ptr->job_id, pagg_id); if (rc == 0) { debug2("confirmed ALPS resId %u for JobId %u, pagg " "%"PRIu64"", resv_id, job_ptr->job_id, pagg_id); return SLURM_SUCCESS; } else if (rc == -BE_NO_RESID) { /* * If ALPS can not find the reservation ID we are trying * to confirm, it may be that the job has already been * canceled, or that the reservation has timed out after * waiting for the confirmation. * It is more likely that this error occurs on a per-job * basis, hence in this case do not drain frontend node. */ error("JobId %u has invalid ALPS resId %u - job " "already canceled?", job_ptr->job_id, resv_id); return SLURM_SUCCESS; } else { error("confirming ALPS resId %u of JobId %u FAILED: %s", resv_id, job_ptr->job_id, basil_strerror(rc)); if (is_transient_error(rc)) return READY_JOB_ERROR; } } return READY_JOB_FATAL; }
/** * do_basil_confirm - confirm an existing BASIL reservation. * This requires the alloc_sid to equal the session ID (getsid()) of the process * executing the aprun/mpirun commands * Returns: SLURM_SUCCESS if ok, READY_JOB_ERROR/FATAL on transient/fatal error. */ extern int do_basil_confirm(struct job_record *job_ptr) { uint32_t resv_id; uint64_t pagg_id; if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) { error("can not read resId for JobId=%u", job_ptr->job_id); } else if (resv_id == 0) { /* On Cray XT/XE, a reservation ID of 0 is always invalid. */ error("JobId=%u has invalid (ZERO) resId", job_ptr->job_id); } else if (_get_select_jobinfo(job_ptr->select_jobinfo->data, SELECT_JOBDATA_PAGG_ID, &pagg_id) != SLURM_SUCCESS) { error("can not read pagg ID for JobId=%u", job_ptr->job_id); } else { int rc; if (pagg_id == 0) { /* This fallback case is for interactive jobs only */ error("JobId %u has no pagg ID, falling back to SID", job_ptr->job_id); pagg_id = job_ptr->alloc_sid; } rc = basil_confirm(resv_id, job_ptr->job_id, pagg_id); if (rc == 0) { debug2("confirmed ALPS resId %u for JobId %u, " "pagg %"PRIu64"", resv_id, job_ptr->job_id, pagg_id); return SLURM_SUCCESS; } else { error("confirming ALPS resId %u of JobId %u FAILED: %s", resv_id, job_ptr->job_id, basil_strerror(rc)); if (is_transient_error(rc)) return READY_JOB_ERROR; } } return READY_JOB_FATAL; }