/* * srun_user_message - Send arbitrary message to an srun job (no job steps) */ extern int srun_user_message(struct job_record *job_ptr, char *msg) { slurm_addr_t * addr; srun_user_msg_t *msg_arg; xassert(job_ptr); if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr)) return ESLURM_ALREADY_DONE; if (job_ptr->other_port && job_ptr->resp_host && job_ptr->resp_host[0]) { addr = xmalloc(sizeof(struct sockaddr_in)); slurm_set_addr(addr, job_ptr->other_port, job_ptr->resp_host); msg_arg = xmalloc(sizeof(srun_user_msg_t)); msg_arg->job_id = job_ptr->job_id; msg_arg->msg = xstrdup(msg); _srun_agent_launch(addr, job_ptr->resp_host, SRUN_USER_MSG, msg_arg); return SLURM_SUCCESS; } else if (job_ptr->batch_flag && IS_JOB_RUNNING(job_ptr)) { #ifndef HAVE_FRONT_END struct node_record *node_ptr; #endif job_notify_msg_t *notify_msg_ptr; agent_arg_t *agent_arg_ptr; #ifdef HAVE_FRONT_END if (job_ptr->batch_host == NULL) return ESLURM_DISABLED; /* no allocated nodes */ agent_arg_ptr = (agent_arg_t *) xmalloc(sizeof(agent_arg_t)); agent_arg_ptr->hostlist = hostlist_create(job_ptr->batch_host); #else node_ptr = find_first_node_record(job_ptr->node_bitmap); if (node_ptr == NULL) return ESLURM_DISABLED; /* no allocated nodes */ agent_arg_ptr = (agent_arg_t *) xmalloc(sizeof(agent_arg_t)); agent_arg_ptr->hostlist = hostlist_create(node_ptr->name); #endif if (agent_arg_ptr->hostlist == NULL) fatal("hostlist_create: malloc failure"); notify_msg_ptr = (job_notify_msg_t *) xmalloc(sizeof(job_notify_msg_t)); notify_msg_ptr->job_id = job_ptr->job_id; notify_msg_ptr->message = xstrdup(msg); agent_arg_ptr->node_count = 1; agent_arg_ptr->retry = 0; agent_arg_ptr->msg_type = REQUEST_JOB_NOTIFY; agent_arg_ptr->msg_args = (void *) notify_msg_ptr; /* Launch the RPC via agent */ agent_queue_request(agent_arg_ptr); return SLURM_SUCCESS; } return ESLURM_DISABLED; }
/* * The remainder of this file implements the standard SLURM checkpoint API. */ extern int slurm_ckpt_op (uint32_t job_id, uint32_t step_id, struct step_record *step_ptr, uint16_t op, uint16_t data, char *image_dir, time_t * event_time, uint32_t *error_code, char **error_msg ) { int rc = SLURM_SUCCESS; struct check_job_info *check_ptr; uint16_t done_sig = 0; struct job_record *job_ptr; struct node_record *node_ptr; pthread_attr_t attr; pthread_t ckpt_agent_tid = 0; char *nodelist; struct ckpt_req *req_ptr; /* job/step checked already */ job_ptr = find_job_record(job_id); if (!job_ptr) return ESLURM_INVALID_JOB_ID; if (step_id == SLURM_BATCH_SCRIPT) { check_ptr = (struct check_job_info *)job_ptr->check_job; node_ptr = find_first_node_record(job_ptr->node_bitmap); nodelist = node_ptr->name; } else { step_ptr = find_step_record(job_ptr, step_id); if (!step_ptr) return ESLURM_INVALID_JOB_ID; check_ptr = (struct check_job_info *)step_ptr->check_job; nodelist = step_ptr->step_layout->node_list; } xassert(check_ptr); switch (op) { case CHECK_ABLE: if (check_ptr->disabled) rc = ESLURM_DISABLED; else { *event_time = check_ptr->time_stamp; rc = SLURM_SUCCESS; } break; case CHECK_DISABLE: check_ptr->disabled++; break; case CHECK_ENABLE: check_ptr->disabled--; break; case CHECK_REQUEUE: if (step_id != SLURM_BATCH_SCRIPT) { rc = ESLURM_NOT_SUPPORTED; break; } /* no break */ case CHECK_VACATE: done_sig = SIGTERM; /* no break */ case CHECK_CREATE: if (check_ptr->disabled) { rc = ESLURM_DISABLED; break; } if (check_ptr->time_stamp != 0) { rc = EALREADY; break; } check_ptr->time_stamp = time(NULL); check_ptr->error_code = 0; xfree(check_ptr->error_msg); req_ptr = xmalloc(sizeof(struct ckpt_req)); if (!req_ptr) { rc = ENOMEM; break; } req_ptr->gid = job_ptr->group_id; req_ptr->uid = job_ptr->user_id; req_ptr->job_id = job_id; req_ptr->step_id = step_id; req_ptr->begin_time = check_ptr->time_stamp; req_ptr->wait = data; req_ptr->image_dir = xstrdup(image_dir); req_ptr->nodelist = xstrdup(nodelist); req_ptr->sig_done = done_sig; req_ptr->op = op; slurm_attr_init(&attr); if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED)) { error("pthread_attr_setdetachstate: %m"); rc = errno; break; } if (pthread_create(&ckpt_agent_tid, &attr, _ckpt_agent_thr, req_ptr)) { error("pthread_create: %m"); rc = errno; break; } slurm_attr_destroy(&attr); break; case CHECK_RESTART: if (step_id != SLURM_BATCH_SCRIPT) { rc = ESLURM_NOT_SUPPORTED; break; } /* create a batch job from saved desc */ rc = ESLURM_NOT_SUPPORTED; /* TODO: save job script */ break; case CHECK_ERROR: xassert(error_code); xassert(error_msg); *error_code = check_ptr->error_code; xfree(*error_msg); *error_msg = xstrdup(check_ptr->error_msg); break; default: error("Invalid checkpoint operation: %d", op); rc = EINVAL; } return rc; }