Esempio n. 1
0
/*
 * slurm_checkpoint_tasks - send checkpoint request to tasks of
 *     specified step
 * IN job_id: job ID of step
 * IN step_id: step ID of step
 * IN image_dir: location to store ckpt images. parameter to plugin.
 * IN max_wait: seconds to wait for the operation to complete
 * IN nodelist: nodes to send the request
 * RET: 0 on success, non-zero on failure with errno set
 */
extern int
slurm_checkpoint_tasks(uint32_t job_id, uint16_t step_id, time_t begin_time,
		       char *image_dir, uint16_t max_wait, char *nodelist)
{
	return checkpoint_tasks(job_id, step_id, begin_time,
				image_dir, max_wait, nodelist);
}
Esempio n. 2
0
/* Checkpoint processing pthread
 * Never returns, but is cancelled on plugin termiantion */
static void *_ckpt_agent_thr(void *arg)
{
	struct ckpt_req *req = (struct ckpt_req *)arg;
	int rc;
	/* Locks: write job */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };
	struct job_record *job_ptr;
	struct step_record *step_ptr;
	struct check_job_info *check_ptr;

	/* only perform ckpt operation of ONE JOB */
	slurm_mutex_lock(&ckpt_agent_mutex);
	while (ckpt_agent_jobid && ckpt_agent_jobid != req->job_id) {
		pthread_cond_wait(&ckpt_agent_cond, &ckpt_agent_mutex);
	}
	ckpt_agent_jobid = req->job_id;
	ckpt_agent_count ++;
	slurm_mutex_unlock(&ckpt_agent_mutex);

	debug3("checkpoint/blcr: sending checkpoint tasks request %u to %u.%u",
	       req->op, req->job_id, req->step_id);

	rc = checkpoint_tasks(req->job_id, req->step_id, req->begin_time,
			      req->image_dir, req->wait, req->nodelist);
	if (rc != SLURM_SUCCESS) {
		error("checkpoint/blcr: error on checkpoint request %u to "
		      "%u.%u: %s", req->op, req->job_id, req->step_id,
		      slurm_strerror(rc));
	}
	if (req->op == CHECK_REQUEUE)
		_requeue_when_finished(req->job_id);

	lock_slurmctld(job_write_lock);
	job_ptr = find_job_record(req->job_id);
	if (!job_ptr) {
		error("_ckpt_agent_thr: job finished");
		goto out;
	}
	if (req->step_id == SLURM_BATCH_SCRIPT) {	/* batch job */
		check_ptr = (struct check_job_info *)job_ptr->check_job;
	} else {
		step_ptr = find_step_record(job_ptr, req->step_id);
		if (! step_ptr) {
			error("_ckpt_agent_thr: step finished");
			goto out;
		}
		check_ptr = (struct check_job_info *)step_ptr->check_job;
	}
	check_ptr->time_stamp = 0;
	check_ptr->error_code = rc;
	if (check_ptr->error_code != SLURM_SUCCESS)
		check_ptr->error_msg = xstrdup(slurm_strerror(rc));

 out:
	unlock_slurmctld(job_write_lock);

	if (req->sig_done) {
		_send_sig(req->job_id, req->step_id, req->sig_done,
			  req->nodelist);
	}

	_on_ckpt_complete(req->gid, req->uid, req->job_id, req->step_id,
			  req->image_dir, rc);

	slurm_mutex_lock(&ckpt_agent_mutex);
	ckpt_agent_count --;
	if (ckpt_agent_count == 0) {
		ckpt_agent_jobid = 0;
		pthread_cond_broadcast(&ckpt_agent_cond);
	}
	slurm_mutex_unlock(&ckpt_agent_mutex);
	_ckpt_req_free(req);
	return NULL;
}