/*
 * Deliver an exception as a signal.
 *
 * signo - signal number to send
 * regs  - register state at the time of the exception
 * code  - signal code handed to _send_sig()
 * addr  - address handed to _send_sig()
 *
 * When regs does not describe user mode, die() is invoked first with the
 * message "Exception in kernel mode". _send_sig() follows in either case.
 * NOTE(review): whether die() ever returns is not visible from here.
 */
void _exception(int signo, struct pt_regs *regs, int code, unsigned long addr)
{
	if (!user_mode(regs)) {
		die("Exception in kernel mode", regs, signo);
	}

	_send_sig(signo, code, addr);
}
/* Send specified signal only to the process launched on node 0. * If the request times out, send sig_timeout. */ static int _step_sig(struct step_record * step_ptr, uint16_t wait, uint16_t signal, uint16_t sig_timeout) { struct check_job_info *check_ptr; struct job_record *job_ptr; int i; xassert(step_ptr); check_ptr = (struct check_job_info *) step_ptr->check_job; xassert(check_ptr); job_ptr = step_ptr->job_ptr; xassert(job_ptr); if (IS_JOB_FINISHED(job_ptr)) return ESLURM_ALREADY_DONE; if (check_ptr->disabled) return ESLURM_DISABLED; check_ptr->node_cnt = 0; /* re-calculate below */ for (i = 0; i < node_record_count; i++) { if (bit_test(step_ptr->step_node_bitmap, i) == 0) continue; if (check_ptr->node_cnt++ > 0) continue; _send_sig(step_ptr->job_ptr->job_id, step_ptr->step_id, signal, node_record_table_ptr[i].name, node_record_table_ptr[i].slurm_addr); _ckpt_enqueue_timeout(step_ptr->job_ptr->job_id, step_ptr->step_id, check_ptr->time_stamp, sig_timeout, wait, node_record_table_ptr[i].name, node_record_table_ptr[i].slurm_addr); } if (!check_ptr->node_cnt) { error("_step_sig: job %u.%u has no nodes", job_ptr->job_id, step_ptr->step_id); return ESLURM_INVALID_NODE_NAME; } check_ptr->time_stamp = time(NULL); check_ptr->wait_time = wait; info("checkpoint requested for job %u.%u", job_ptr->job_id, step_ptr->step_id); return SLURM_SUCCESS; }
/* Identify every XCPU process in a specific node and signal it. * Return the process count */ extern int xcpu_signal(int sig, char *nodes) { int procs = 0; hostlist_t hl; char *node, sig_msg[64], dir_path[128], ctl_path[200]; DIR *dir; struct dirent *sub_dir; /* Translate "nodes" to a hostlist */ hl = hostlist_create(nodes); if (hl == NULL) { error("hostlist_create: %m"); return 0; } /* Plan 9 only takes strings, so we map number to name */ snprintf(sig_msg, sizeof(sig_msg), "signal %s", _sig_name(sig)); /* For each node, look for processes */ while ((node = hostlist_shift(hl))) { snprintf(dir_path, sizeof(dir_path), "%s/%s/xcpu", XCPU_DIR, node); free(node); if ((dir = opendir(dir_path)) == NULL) { error("opendir(%s): %m", dir_path); continue; } while ((sub_dir = readdir(dir))) { snprintf(ctl_path, sizeof(ctl_path), "%s/%s/ctl",dir_path, sub_dir->d_name); procs += _send_sig(ctl_path, sig, sig_msg); } closedir(dir); } hostlist_destroy(hl); return procs; }
/* Checkpoint processing pthread * Never returns, but is cancelled on plugin termiantion */ static void *_ckpt_agent_thr(void *arg) { struct ckpt_req *req = (struct ckpt_req *)arg; int rc; /* Locks: write job */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; struct job_record *job_ptr; struct step_record *step_ptr; struct check_job_info *check_ptr; /* only perform ckpt operation of ONE JOB */ slurm_mutex_lock(&ckpt_agent_mutex); while (ckpt_agent_jobid && ckpt_agent_jobid != req->job_id) { pthread_cond_wait(&ckpt_agent_cond, &ckpt_agent_mutex); } ckpt_agent_jobid = req->job_id; ckpt_agent_count ++; slurm_mutex_unlock(&ckpt_agent_mutex); debug3("checkpoint/blcr: sending checkpoint tasks request %u to %u.%u", req->op, req->job_id, req->step_id); rc = checkpoint_tasks(req->job_id, req->step_id, req->begin_time, req->image_dir, req->wait, req->nodelist); if (rc != SLURM_SUCCESS) { error("checkpoint/blcr: error on checkpoint request %u to " "%u.%u: %s", req->op, req->job_id, req->step_id, slurm_strerror(rc)); } if (req->op == CHECK_REQUEUE) _requeue_when_finished(req->job_id); lock_slurmctld(job_write_lock); job_ptr = find_job_record(req->job_id); if (!job_ptr) { error("_ckpt_agent_thr: job finished"); goto out; } if (req->step_id == SLURM_BATCH_SCRIPT) { /* batch job */ check_ptr = (struct check_job_info *)job_ptr->check_job; } else { step_ptr = find_step_record(job_ptr, req->step_id); if (! 
step_ptr) { error("_ckpt_agent_thr: step finished"); goto out; } check_ptr = (struct check_job_info *)step_ptr->check_job; } check_ptr->time_stamp = 0; check_ptr->error_code = rc; if (check_ptr->error_code != SLURM_SUCCESS) check_ptr->error_msg = xstrdup(slurm_strerror(rc)); out: unlock_slurmctld(job_write_lock); if (req->sig_done) { _send_sig(req->job_id, req->step_id, req->sig_done, req->nodelist); } _on_ckpt_complete(req->gid, req->uid, req->job_id, req->step_id, req->image_dir, rc); slurm_mutex_lock(&ckpt_agent_mutex); ckpt_agent_count --; if (ckpt_agent_count == 0) { ckpt_agent_jobid = 0; pthread_cond_broadcast(&ckpt_agent_cond); } slurm_mutex_unlock(&ckpt_agent_mutex); _ckpt_req_free(req); return NULL; }
/*
 * handle_trap_3_c - C-level trap handler.
 *
 * Sends SIGILL with code ILL_ILLTRP via _send_sig(), passing the saved
 * fp->ea value as the address.
 */
asmlinkage void handle_trap_3_c(struct pt_regs *fp)
{
	_send_sig(SIGILL,
		  ILL_ILLTRP,
		  fp->ea);
}
/*
 * handle_trap_2_c - C-level trap handler.
 *
 * Sends SIGUSR2 with a code of 0 via _send_sig(), passing the saved
 * fp->ea value as the address.
 */
asmlinkage void handle_trap_2_c(struct pt_regs *fp)
{
	_send_sig(SIGUSR2,
		  0,
		  fp->ea);
}
/*
 * Send the signal described by a checkpoint timeout record.
 *
 * rec supplies the job id, step id, signal, node name and node address
 * that are forwarded unchanged to _send_sig().
 */
static void _ckpt_signal_step(struct ckpt_timeout_info *rec)
{
	/* debug("signal %u.%u %u", rec->job_id, rec->step_id, rec->signal); */
	_send_sig(rec->job_id,
		  rec->step_id,
		  rec->signal,
		  rec->node_name,
		  rec->node_addr);
}