Exemplo n.º 1
0
/**
 * do_basil_confirm - confirm the ALPS/BASIL reservation held by a job.
 * @job_ptr: job whose select_jobinfo supplies the reservation and pagg IDs
 *
 * The reservation ID and the pagg ID are both read from the job's
 * select_jobinfo data. A pagg ID of 0 is replaced by job_ptr->alloc_sid; for
 * that fallback to work, alloc_sid has to equal the session ID (getsid()) of
 * the process executing the aprun/mpirun commands.
 *
 * Returns:
 *   SLURM_SUCCESS   - basil_confirm() returned 0, or it returned -BE_NO_RESID
 *                     (treated as a per-job condition, see below)
 *   READY_JOB_ERROR - basil_confirm() failed and is_transient_error() is true
 *   READY_JOB_FATAL - an ID could not be read, the reservation ID is 0, or
 *                     basil_confirm() failed with a non-transient error
 */
extern int do_basil_confirm(struct job_record *job_ptr)
{
    uint32_t resv;
    uint64_t pagg;
    int confirm_rc;

    /* Guard 1: the reservation ID must be retrievable ... */
    if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
                            SELECT_JOBDATA_RESV_ID, &resv) != SLURM_SUCCESS) {
        error("can not read resId for JobId=%u", job_ptr->job_id);
        return READY_JOB_FATAL;
    }

    /* Guard 2: ... and non-zero (0 is always invalid on Cray XT/XE). */
    if (resv == 0) {
        error("JobId=%u has invalid (ZERO) resId", job_ptr->job_id);
        return READY_JOB_FATAL;
    }

    /* Guard 3: the pagg ID must be retrievable as well. */
    if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
                            SELECT_JOBDATA_PAGG_ID, &pagg) != SLURM_SUCCESS) {
        error("can not read pagg ID for JobId=%u", job_ptr->job_id);
        return READY_JOB_FATAL;
    }

    if (pagg == 0) {
        /* Fallback meant for interactive jobs only: use the session ID. */
#ifdef HAVE_REAL_CRAY
        error("JobId %u has no pagg ID, falling back to SID",
              job_ptr->job_id);
#endif
        pagg = job_ptr->alloc_sid;
    }

    confirm_rc = basil_confirm(resv, job_ptr->job_id, pagg);

    if (confirm_rc == 0) {
        debug2("confirmed ALPS resId %u for JobId %u, pagg "
               "%"PRIu64"", resv, job_ptr->job_id, pagg);
        return SLURM_SUCCESS;
    }

    if (confirm_rc == -BE_NO_RESID) {
        /*
         * ALPS does not know this reservation ID. The job may already
         * have been canceled, or the reservation may have timed out
         * while waiting to be confirmed. Since this most likely affects
         * just this one job, report success so that the frontend node
         * is not drained.
         */
        error("JobId %u has invalid ALPS resId %u - job "
              "already canceled?", job_ptr->job_id, resv);
        return SLURM_SUCCESS;
    }

    /* Any other return code is a failure; classify it for the caller. */
    error("confirming ALPS resId %u of JobId %u FAILED: %s",
          resv, job_ptr->job_id, basil_strerror(confirm_rc));

    return is_transient_error(confirm_rc) ? READY_JOB_ERROR
                                          : READY_JOB_FATAL;
}
Exemplo n.º 2
0
/**
 * do_basil_confirm - confirm the ALPS/BASIL reservation held by a job.
 * @job_ptr: job whose select_jobinfo supplies the reservation and pagg IDs
 *
 * The reservation ID and the pagg ID are both read from the job's
 * select_jobinfo data. A pagg ID of 0 is replaced by job_ptr->alloc_sid; for
 * that fallback to work, alloc_sid has to equal the session ID (getsid()) of
 * the process executing the aprun/mpirun commands.
 *
 * Returns:
 *   SLURM_SUCCESS   - basil_confirm() returned 0
 *   READY_JOB_ERROR - basil_confirm() failed and is_transient_error() is true
 *   READY_JOB_FATAL - an ID could not be read, the reservation ID is 0, or
 *                     basil_confirm() failed with a non-transient error
 */
extern int do_basil_confirm(struct job_record *job_ptr)
{
	uint32_t resv;
	uint64_t pagg;
	int confirm_rc;

	/* Guard 1: the reservation ID must be retrievable ... */
	if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
			SELECT_JOBDATA_RESV_ID, &resv) != SLURM_SUCCESS) {
		error("can not read resId for JobId=%u", job_ptr->job_id);
		return READY_JOB_FATAL;
	}

	/* Guard 2: ... and non-zero (0 is always invalid on Cray XT/XE). */
	if (resv == 0) {
		error("JobId=%u has invalid (ZERO) resId", job_ptr->job_id);
		return READY_JOB_FATAL;
	}

	/* Guard 3: the pagg ID must be retrievable as well. */
	if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
			SELECT_JOBDATA_PAGG_ID, &pagg) != SLURM_SUCCESS) {
		error("can not read pagg ID for JobId=%u", job_ptr->job_id);
		return READY_JOB_FATAL;
	}

	if (pagg == 0) {
		/* Fallback meant for interactive jobs only: use the session ID. */
		error("JobId %u has no pagg ID, falling back to SID",
			job_ptr->job_id);
		pagg = job_ptr->alloc_sid;
	}

	confirm_rc = basil_confirm(resv, job_ptr->job_id, pagg);

	if (confirm_rc == 0) {
		debug2("confirmed ALPS resId %u for JobId %u, "
			"pagg %"PRIu64"",
			resv, job_ptr->job_id, pagg);
		return SLURM_SUCCESS;
	}

	/* Any non-zero return code is a failure; classify it for the caller. */
	error("confirming ALPS resId %u of JobId %u FAILED: %s",
		resv, job_ptr->job_id, basil_strerror(confirm_rc));

	return is_transient_error(confirm_rc) ? READY_JOB_ERROR
					      : READY_JOB_FATAL;
}