/*
 * slurm_checkpoint_tasks - send checkpoint request to tasks of
 *	specified step
 * IN job_id: job ID of step
 * IN step_id: step ID of step
 * IN begin_time: forwarded unchanged to checkpoint_tasks()
 *	(NOTE(review): exact meaning is not visible here - confirm)
 * IN image_dir: location to store ckpt images. parameter to plugin.
 * IN max_wait: seconds to wait for the operation to complete
 * IN nodelist: nodes to send the request
 * RET: 0 on success, non-zero on failure with errno set
 */
extern int slurm_checkpoint_tasks(uint32_t job_id, uint16_t step_id,
				  time_t begin_time, char *image_dir,
				  uint16_t max_wait, char *nodelist)
{
	int rc;

	/* Pure pass-through: every argument is handed on as received. */
	rc = checkpoint_tasks(job_id, step_id, begin_time,
			      image_dir, max_wait, nodelist);

	return rc;
}
/* Checkpoint processing pthread * Never returns, but is cancelled on plugin termiantion */ static void *_ckpt_agent_thr(void *arg) { struct ckpt_req *req = (struct ckpt_req *)arg; int rc; /* Locks: write job */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; struct job_record *job_ptr; struct step_record *step_ptr; struct check_job_info *check_ptr; /* only perform ckpt operation of ONE JOB */ slurm_mutex_lock(&ckpt_agent_mutex); while (ckpt_agent_jobid && ckpt_agent_jobid != req->job_id) { pthread_cond_wait(&ckpt_agent_cond, &ckpt_agent_mutex); } ckpt_agent_jobid = req->job_id; ckpt_agent_count ++; slurm_mutex_unlock(&ckpt_agent_mutex); debug3("checkpoint/blcr: sending checkpoint tasks request %u to %u.%u", req->op, req->job_id, req->step_id); rc = checkpoint_tasks(req->job_id, req->step_id, req->begin_time, req->image_dir, req->wait, req->nodelist); if (rc != SLURM_SUCCESS) { error("checkpoint/blcr: error on checkpoint request %u to " "%u.%u: %s", req->op, req->job_id, req->step_id, slurm_strerror(rc)); } if (req->op == CHECK_REQUEUE) _requeue_when_finished(req->job_id); lock_slurmctld(job_write_lock); job_ptr = find_job_record(req->job_id); if (!job_ptr) { error("_ckpt_agent_thr: job finished"); goto out; } if (req->step_id == SLURM_BATCH_SCRIPT) { /* batch job */ check_ptr = (struct check_job_info *)job_ptr->check_job; } else { step_ptr = find_step_record(job_ptr, req->step_id); if (! 
step_ptr) { error("_ckpt_agent_thr: step finished"); goto out; } check_ptr = (struct check_job_info *)step_ptr->check_job; } check_ptr->time_stamp = 0; check_ptr->error_code = rc; if (check_ptr->error_code != SLURM_SUCCESS) check_ptr->error_msg = xstrdup(slurm_strerror(rc)); out: unlock_slurmctld(job_write_lock); if (req->sig_done) { _send_sig(req->job_id, req->step_id, req->sig_done, req->nodelist); } _on_ckpt_complete(req->gid, req->uid, req->job_id, req->step_id, req->image_dir, rc); slurm_mutex_lock(&ckpt_agent_mutex); ckpt_agent_count --; if (ckpt_agent_count == 0) { ckpt_agent_jobid = 0; pthread_cond_broadcast(&ckpt_agent_cond); } slurm_mutex_unlock(&ckpt_agent_mutex); _ckpt_req_free(req); return NULL; }