Beispiel #1
0
/**
 * do_basil_confirm - confirm the BASIL reservation that belongs to a job.
 * IN job_ptr - job whose ALPS reservation is to be confirmed
 *
 * The reservation is confirmed against the job's pagg ID. If that ID is 0,
 * alloc_sid is used instead; this requires the alloc_sid to equal the session
 * ID (getsid()) of the process executing the aprun/mpirun commands.
 *
 * RET SLURM_SUCCESS   - reservation confirmed, or ALPS reported -BE_NO_RESID
 *                       (treated as a per-job condition, see below)
 *     READY_JOB_ERROR - confirmation failed with a transient error
 *     READY_JOB_FATAL - IDs could not be read, resId is 0, or the
 *                       confirmation failed with a non-transient error
 */
extern int do_basil_confirm(struct job_record *job_ptr)
{
    uint32_t resv_id;
    uint64_t pagg_id;
    int rc;

    if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
                            SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
        error("can not read resId for JobId=%u", job_ptr->job_id);
        return READY_JOB_FATAL;
    }

    /* On Cray XT/XE, a reservation ID of 0 is always invalid. */
    if (resv_id == 0) {
        error("JobId=%u has invalid (ZERO) resId", job_ptr->job_id);
        return READY_JOB_FATAL;
    }

    if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
                            SELECT_JOBDATA_PAGG_ID, &pagg_id) != SLURM_SUCCESS) {
        error("can not read pagg ID for JobId=%u", job_ptr->job_id);
        return READY_JOB_FATAL;
    }

    if (pagg_id == 0) {
        /* No pagg ID recorded: substitute the allocating session ID. */
#ifdef HAVE_REAL_CRAY
        /* This fallback case is for interactive jobs only */
        error("JobId %u has no pagg ID, falling back to SID",
              job_ptr->job_id);
#endif
        pagg_id = job_ptr->alloc_sid;
    }

    rc = basil_confirm(resv_id, job_ptr->job_id, pagg_id);

    if (rc == 0) {
        debug2("confirmed ALPS resId %u for JobId %u, pagg "
               "%"PRIu64"", resv_id, job_ptr->job_id, pagg_id);
        return SLURM_SUCCESS;
    }

    if (rc == -BE_NO_RESID) {
        /*
         * ALPS does not know the reservation ID. The job may already
         * have been canceled, or the reservation may have timed out
         * while waiting for confirmation. This is more likely a
         * per-job problem than a node problem, so report success to
         * the caller rather than an error that would drain the
         * frontend node.
         */
        error("JobId %u has invalid ALPS resId %u - job "
              "already canceled?", job_ptr->job_id, resv_id);
        return SLURM_SUCCESS;
    }

    error("confirming ALPS resId %u of JobId %u FAILED: %s",
          resv_id, job_ptr->job_id, basil_strerror(rc));

    return is_transient_error(rc) ? READY_JOB_ERROR : READY_JOB_FATAL;
}
Beispiel #2
0
/**
 * do_basil_confirm - confirm the BASIL reservation that belongs to a job.
 * IN job_ptr - job whose ALPS reservation is to be confirmed
 *
 * The reservation is confirmed against the job's pagg ID. If that ID is 0,
 * alloc_sid is used instead; this requires the alloc_sid to equal the session
 * ID (getsid()) of the process executing the aprun/mpirun commands.
 *
 * RET SLURM_SUCCESS   - reservation confirmed
 *     READY_JOB_ERROR - confirmation failed with a transient error
 *     READY_JOB_FATAL - IDs could not be read, resId is 0, or the
 *                       confirmation failed with a non-transient error
 */
extern int do_basil_confirm(struct job_record *job_ptr)
{
	uint32_t resv_id;
	uint64_t pagg_id;
	int rc;

	if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
			SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
		error("can not read resId for JobId=%u", job_ptr->job_id);
		return READY_JOB_FATAL;
	}

	/* On Cray XT/XE, a reservation ID of 0 is always invalid. */
	if (resv_id == 0) {
		error("JobId=%u has invalid (ZERO) resId", job_ptr->job_id);
		return READY_JOB_FATAL;
	}

	if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
			SELECT_JOBDATA_PAGG_ID, &pagg_id) != SLURM_SUCCESS) {
		error("can not read pagg ID for JobId=%u", job_ptr->job_id);
		return READY_JOB_FATAL;
	}

	if (pagg_id == 0) {
		/* This fallback case is for interactive jobs only */
		error("JobId %u has no pagg ID, falling back to SID",
			job_ptr->job_id);
		pagg_id = job_ptr->alloc_sid;
	}

	rc = basil_confirm(resv_id, job_ptr->job_id, pagg_id);
	if (rc == 0) {
		debug2("confirmed ALPS resId %u for JobId %u, "
			"pagg %"PRIu64"",
			resv_id, job_ptr->job_id, pagg_id);
		return SLURM_SUCCESS;
	}

	error("confirming ALPS resId %u of JobId %u FAILED: %s",
		resv_id, job_ptr->job_id, basil_strerror(rc));

	return is_transient_error(rc) ? READY_JOB_ERROR : READY_JOB_FATAL;
}
Beispiel #3
0
/**
 * do_basil_switch - suspend or resume the BASIL reservation of a job
 * IN job_ptr - job whose ALPS reservation is to be switched
 * IN suspend - true to suspend the reservation, false to resume it
 * RET always SLURM_SUCCESS; a failure to read the reservation ID is only
 *     logged, and a non-zero basil_switch() result is not reported here.
 */
extern int do_basil_switch(struct job_record *job_ptr, bool suspend)
{
    uint32_t resv_id;
    const char *action;

    if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
                            SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
        error("can not read resId for JobId=%u", job_ptr->job_id);
        return SLURM_SUCCESS;
    }

    /* The resv_id is non-zero only if the job is or was running. */
    if (resv_id == 0)
        return SLURM_SUCCESS;

    if (basil_switch(resv_id, suspend) != 0)
        return SLURM_SUCCESS;

    action = suspend ? "Suspended" : "Resumed";
    debug("%s ALPS resId %u for JobId %u",
          action, resv_id, job_ptr->job_id);

    return SLURM_SUCCESS;
}
Beispiel #4
0
/**
 * do_basil_signal  -  forward a job signal to the APIDs of its reservation
 * IN job_ptr - job to be signalled
 * IN signal  - signal(7) number
 * RET always SLURM_SUCCESS; failures are only logged.
 *
 * Nothing is sent unless an ALPS reservation exists (non-0 reservation ID).
 */
extern int do_basil_signal(struct job_record *job_ptr, int signal)
{
	uint32_t resv_id;
	int rc;

	if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
			SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
		error("can not read resId for JobId=%u", job_ptr->job_id);
		return SLURM_SUCCESS;
	}

	/* No reservation, nothing to signal. */
	if (resv_id == 0)
		return SLURM_SUCCESS;

	rc = basil_signal_apids(resv_id, signal, NULL);
	if (rc != 0) {
		error("could not signal APIDs of resId %u: %s", resv_id,
			basil_strerror(rc));
	}

	return SLURM_SUCCESS;
}
Beispiel #5
0
/**
 * queue_basil_signal  -  queue job signal on to any APIDs
 * IN job_ptr - job to be signalled
 * IN signal  - signal(7) number
 * IN delay   - how long to delay the signal, in seconds
 * Only signal job if an ALPS reservation exists (non-0 reservation ID).
 */
extern void queue_basil_signal(struct job_record *job_ptr, int signal,
                               uint16_t delay)
{
    args_sig_basil_t *args_sig_basil;
    pthread_attr_t attr_sig_basil;
    pthread_t thread_sig_basil;
    uint32_t resv_id;

    if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
                            SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
        error("can not read resId for JobId=%u", job_ptr->job_id);
        return;
    }
    if (resv_id == 0)
        return;
    if ((delay == 0) || (delay == (uint16_t) NO_VAL)) {
        /* Send the signal now */
        int rc = basil_signal_apids(resv_id, signal, NULL);

        if (rc)
            error("could not signal APIDs of resId %u: %s", resv_id,
                  basil_strerror(rc));
        return;
    }

    /* Create a thread to send the signal later */
    slurm_attr_init(&attr_sig_basil);
    if (pthread_attr_setdetachstate(&attr_sig_basil,
                                    PTHREAD_CREATE_DETACHED)) {
        error("pthread_attr_setdetachstate error %m");
        slurm_attr_destroy(&attr_sig_basil);
        return;
    }
    args_sig_basil = xmalloc(sizeof(args_sig_basil_t));
    args_sig_basil->resv_id = resv_id;
    args_sig_basil->signal  = signal;
    args_sig_basil->delay   = delay;
    if (pthread_create(&thread_sig_basil, &attr_sig_basil,
                       _sig_basil, (void *) args_sig_basil)) {
        error("pthread_create error %m");
        slurm_attr_destroy(&attr_sig_basil);
        xfree(args_sig_basil);
        return;
    }
    slurm_attr_destroy(&attr_sig_basil);
}
Beispiel #6
0
/**
 * do_basil_release - release an (unconfirmed) BASIL reservation
 * IN job_ptr - pointer to job which has just been deallocated resources
 * RET always SLURM_SUCCESS
 *
 * Error handling: errors are only printed (basil_release does this
 * internally) and are not signalled to select_g_job_fini(). The calling
 * contexts of this function (deallocate_nodes, batch_finish) only print
 * additional error text: no further action is taken at this stage.
 */
extern int do_basil_release(struct job_record *job_ptr)
{
	uint32_t resv_id;

	if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
			SELECT_JOBDATA_RESV_ID, &resv_id) != SLURM_SUCCESS) {
		error("can not read resId for JobId=%u", job_ptr->job_id);
		return SLURM_SUCCESS;
	}

	/* The resv_id is non-zero only if the job is or was running. */
	if (resv_id == 0)
		return SLURM_SUCCESS;

	if (basil_release(resv_id) != 0)
		return SLURM_SUCCESS;

	debug("released ALPS resId %u for JobId %u",
	      resv_id, job_ptr->job_id);

	return SLURM_SUCCESS;
}
Beispiel #7
0
/**
 * basil_inventory - Periodic node-state query via ALPS XML-RPC.
 * This should be run immediately before each scheduling cycle.
 * Returns non-SLURM_SUCCESS if
 * - INVENTORY method failed (error)
 * - no nodes are available (no point in scheduling)
 * - orphaned ALPS reservation exists (wait until ALPS resynchronizes)
 */
extern int basil_inventory(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	struct basil_rsvn *rsvn;
	int slurm_alps_mismatch = 0;
	int rc = SLURM_SUCCESS;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	if (!inv->f->node_head || !inv->batch_avail || !inv->batch_total)
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char *reason = NULL;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			continue;
		}

		if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
			/*
			 * ALPS still hangs on to the node while SLURM considers
			 * it already unallocated. Possible causes are partition
			 * cleanup taking too long (can be 10sec ... minutes),
			 * and orphaned ALPS reservations (caught below).
			 *
			 * The converse case (SLURM hanging on to the node while
			 * ALPS has already freed it) happens frequently during
			 * job completion: select_g_job_fini() is called before
			 * make_node_comp(). Rely on SLURM logic for this case.
			 */
			slurm_alps_mismatch++;
		}

		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		if (reason) {
			if (!IS_NODE_DOWN(node_ptr)) {
				xfree(node_ptr->reason);
				debug("MARKING %s DOWN (%s)",
				      node_ptr->name, reason);
				/* set_node_down also kills any running jobs */
				set_node_down(node_ptr->name, reason);
			}
		} else if (IS_NODE_DOWN(node_ptr)) {
			xfree(node_ptr->reason);

			/* Reset state, make_node_idle figures out the rest */
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state |= NODE_STATE_UNKNOWN;

			make_node_idle(node_ptr, NULL);
		}
	}

	if (slurm_alps_mismatch)
		debug("ALPS: %d node(s) still held", slurm_alps_mismatch);

	/*
	 * Check that each ALPS reservation corresponds to a SLURM job.
	 * Purge orphaned reservations, which may result from stale or
	 * messed up system state, or are indicative of ALPS problems
	 * (stuck in pending cancel calls).
	 */
	for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
		ListIterator job_iter = list_iterator_create(job_list);
		struct job_record *job_ptr;
		uint32_t resv_id;

		if (job_iter == NULL)
			fatal("list_iterator_create: malloc failure");

		while ((job_ptr = (struct job_record *)list_next(job_iter))) {

			if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
						SELECT_JOBDATA_RESV_ID,
						&resv_id) == SLURM_SUCCESS
			    && resv_id == rsvn->rsvn_id)
				break;
		}
		list_iterator_destroy(job_iter);

		if (job_ptr == NULL) {
			error("orphaned ALPS reservation %u, trying to remove",
			      rsvn->rsvn_id);
			basil_safe_release(rsvn->rsvn_id, inv);
			slurm_alps_mismatch = true;
		}
	}
	free_inv(inv);

	if (slurm_alps_mismatch)
		/* ALPS will take some time, do not schedule now. */
		return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
	return rc;
}
Beispiel #8
0
/**
 * basil_inventory - Periodic node-state query via ALPS XML-RPC.
 * This should be run immediately before each scheduling cycle.
 *
 * Calls arriving less than inv_interval seconds after the previous inventory
 * run are answered with SLURM_SUCCESS without querying ALPS. State is kept
 * across calls in function-local statics (last run time, time at which a
 * SLURM/ALPS mismatch was first noted, and a "timeout already logged" flag).
 *
 * Returns non-SLURM_SUCCESS if
 * - INVENTORY method failed (SLURM_ERROR)
 * - the inventory has no node list or reports zero batch nodes in total
 *   (ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE)
 * - a SLURM/ALPS mismatch (held node or orphaned reservation) is seen again
 *   after having been noted earlier, a non-zero SyncTimeout is configured
 *   and it has not yet elapsed (ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE)
 */
extern int basil_inventory(void)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	struct basil_rsvn *rsvn;
	/* Counts held nodes in the node loop; later also set for orphans */
	int slurm_alps_mismatch = 0;
	int rc = SLURM_SUCCESS;
	int rel_rc;
	time_t now = time(NULL);
	/* Time at which the current mismatch was first noted, 0 if none */
	static time_t slurm_alps_mismatch_time = (time_t) 0;
	/* Ensures the "could not synchronize" error is logged only once */
	static bool logged_sync_timeout = false;
	/* Time of the last inventory that was actually performed */
	static time_t last_inv_run = 0;

	/* Rate limit: skip the ALPS query if the last one is recent enough */
	if ((now - last_inv_run) < inv_interval)
		return SLURM_SUCCESS;

	last_inv_run = now;

	inv = get_full_inventory(version);
	if (inv == NULL) {
		error("BASIL %s INVENTORY failed", bv_names_long[version]);
		return SLURM_ERROR;
	}

	debug("BASIL %s INVENTORY: %d/%d batch nodes available",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/* Avoid checking for inv->batch_avail here since if we are
	   gang scheduling returning an error for a full system is
	   probably the wrong thing to do. (the schedule() function
	   in the slurmctld will never run ;)).
	*/
	if (!inv->f->node_head || !inv->batch_total)
		rc = ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;

	/* Pass 1: reconcile SLURM's view of every ALPS node */
	for (node = inv->f->node_head; node; node = node->next) {
		int node_inx;
		struct node_record *node_ptr;
		/* Non-NULL when ALPS reports the node as unusable for batch */
		char *reason = NULL;

		/* This will ignore interactive nodes when iterating through
		 * the apbasil inventory.  If we don't do this, SLURM is
		 * unable to resolve the ID to a nidXXX name since it's not in
		 * the slurm.conf file.  (Chris North)
		 */
		if (node->role == BNR_INTER)
			continue;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			continue;
		}
		/* Index into node_record_table_ptr, used for the bitmap below */
		node_inx = node_ptr - node_record_table_ptr;

		if (node_is_allocated(node) && !IS_NODE_ALLOCATED(node_ptr)) {
			/*
			 * ALPS still hangs on to the node while SLURM considers
			 * it already unallocated. Possible causes are partition
			 * cleanup taking too long (can be 10sec ... minutes),
			 * and orphaned ALPS reservations (caught below).
			 *
			 * The converse case (SLURM hanging on to the node while
			 * ALPS has already freed it) happens frequently during
			 * job completion: select_g_job_fini() is called before
			 * make_node_comp(). Rely on SLURM logic for this case.
			 */
			slurm_alps_mismatch++;
		}

		/*
		 * Order matters: specific non-UP states are named first, any
		 * other non-UP state gets the generic text, and only an UP
		 * node is further checked for role and architecture.
		 */
		if (node->state == BNS_DOWN) {
			reason = "ALPS marked it DOWN";
		} else if (node->state == BNS_UNAVAIL) {
			reason = "node is UNAVAILABLE";
		} else if (node->state == BNS_ROUTE) {
			reason = "node does ROUTING";
		} else if (node->state == BNS_SUSPECT) {
			reason = "entered SUSPECT mode";
		} else if (node->state == BNS_ADMINDOWN) {
			reason = "node is ADMINDOWN";
		} else if (node->state != BNS_UP) {
			reason = "state not UP";
		} else if (node->role != BNR_BATCH) {
			reason = "mode not BATCH";
		} else if (node->arch != BNA_XT) {
			reason = "arch not XT/XE";
		}

		/* Base state entirely derives from ALPS */
		if (reason) {
			/* Remember when the node was first seen unusable */
			if (node_ptr->down_time == 0)
				node_ptr->down_time = now;
			if (IS_NODE_DOWN(node_ptr)) {
				/* node still down */
			} else if ((slurmctld_conf.slurmd_timeout == 0) ||
				   ((now - node_ptr->down_time) <
				    slurmctld_conf.slurmd_timeout)) {
				/*
				 * Grace period (unlimited when slurmd_timeout
				 * is 0): only flag the node as not responding
				 * and take it out of the available set.
				 */
				node_ptr->node_state |= NODE_STATE_NO_RESPOND;
				bit_clear(avail_node_bitmap, node_inx);
			} else {
				/* Grace period expired: really mark it DOWN */
				xfree(node_ptr->reason);
				info("MARKING %s DOWN (%s)",
				     node_ptr->name, reason);
				/* set_node_down also kills any running jobs */
				set_node_down_ptr(node_ptr, reason);
			}
		} else if (IS_NODE_DOWN(node_ptr)) {
			/* ALPS says usable but SLURM has it DOWN: bring it up */
			xfree(node_ptr->reason);
			node_ptr->down_time = 0;
			info("MARKING %s UP", node_ptr->name);

			/* Reset state, make_node_idle figures out the rest */
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			node_ptr->node_state |= NODE_STATE_UNKNOWN;

			make_node_idle(node_ptr, NULL);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				/*
				 * NOTE(review): reason was already freed at
				 * the top of this branch; this second xfree
				 * looks redundant unless make_node_idle() can
				 * set it again - confirm.
				 */
				xfree(node_ptr->reason);
				node_ptr->reason_time = 0;
				node_ptr->reason_uid = NO_VAL;
				clusteracct_storage_g_node_up(
					acct_db_conn, node_ptr, now);
			}
		} else if (IS_NODE_NO_RESPOND(node_ptr)) {
			/*
			 * Node recovered within the grace period.
			 * NOTE(review): down_time is not reset to 0 on this
			 * path. Unless it is cleared elsewhere, a later
			 * outage would measure its grace period from the old
			 * timestamp - confirm whether that is intended.
			 */
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			if (!IS_NODE_DRAIN(node_ptr) &&
			    !IS_NODE_FAIL(node_ptr)) {
				bit_set(avail_node_bitmap, node_inx);
			}
		}
	}

	if (slurm_alps_mismatch)
		debug("ALPS: %d node(s) still held", slurm_alps_mismatch);

	/*
	 * Check that each ALPS reservation corresponds to a SLURM job.
	 * Purge orphaned reservations, which may result from stale or
	 * messed up system state, or are indicative of ALPS problems
	 * (stuck in pending cancel calls).
	 */
	for (rsvn = inv->f->rsvn_head; rsvn; rsvn = rsvn->next) {
		ListIterator job_iter = list_iterator_create(job_list);
		struct job_record *job_ptr;
		uint32_t resv_id;

		/* Leaves job_ptr NULL if no job holds this reservation ID */
		while ((job_ptr = (struct job_record *)list_next(job_iter))) {
			if (_get_select_jobinfo(job_ptr->select_jobinfo->data,
						SELECT_JOBDATA_RESV_ID,
						&resv_id) == SLURM_SUCCESS
			    && resv_id == rsvn->rsvn_id)
				break;
		}
		list_iterator_destroy(job_iter);

		/*
		 * Changed to ignore reservations for "UNKNOWN" batch
		 * ids (e.g. the interactive region) (Chris North)
		 */

		if ((job_ptr == NULL) && (xstrcmp(rsvn->batch_id, "UNKNOWN"))) {
			error("orphaned ALPS reservation %u, trying to remove",
			      rsvn->rsvn_id);
			rel_rc = basil_safe_release(rsvn->rsvn_id, inv);
			if (rel_rc) {
				error("ALPS reservation %u removal FAILED: %s",
				      rsvn->rsvn_id, basil_strerror(rel_rc));
			} else {
				debug("ALPS reservation %u removed",
				      rsvn->rsvn_id);
			}
			/* Counts as a mismatch whether or not removal worked */
			slurm_alps_mismatch = true;
		}
	}
	free_inv(inv);

	if (slurm_alps_mismatch) {
		/* If SLURM and ALPS state are not in synchronization,
		 * do not schedule any more jobs until waiting at least
		 * SyncTimeout seconds. */
		/*
		 * NOTE(review): as written, the run that first notes a
		 * mismatch and the sync_timeout == 0 case both fall through
		 * to "return rc" below, so scheduling is not held back in
		 * either case. That seems at odds with "Wait indefinitely" -
		 * confirm the intended behavior.
		 */
		if (slurm_alps_mismatch_time == 0) {
			slurm_alps_mismatch_time = now;
		} else if (cray_conf->sync_timeout == 0) {
			/* Wait indefinitely */
		} else if (difftime(now, slurm_alps_mismatch_time) <
			   cray_conf->sync_timeout) {
			return ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE;
		} else if (!logged_sync_timeout) {
			error("Could not synchronize SLURM with ALPS for %u "
			      "seconds, proceeding with job scheduling",
			      cray_conf->sync_timeout);
			logged_sync_timeout = true;
		}
	} else {
		/* Back in sync: re-arm the timer and the one-shot log flag */
		slurm_alps_mismatch_time = 0;
		logged_sync_timeout = false;
	}
	return rc;
}