static void _send_step_complete_rpc(srun_job_t *srun_job, int step_rc)
{
	slurm_msg_t req;
	step_complete_msg_t msg;
	int rc;

	memset(&msg, 0, sizeof(step_complete_msg_t));
	msg.job_id = srun_job->jobid;
	msg.job_step_id = srun_job->stepid;
	msg.range_first = 0;
	msg.range_last = 0;
	msg.step_rc = step_rc;
	msg.jobacct = jobacctinfo_create(NULL);

	slurm_msg_t_init(&req);
	req.msg_type = REQUEST_STEP_COMPLETE;
	req.data = &msg;
	/* req.address = step_complete.parent_addr; */

	debug3("Sending step complete RPC to slurmctld");
	if (slurm_send_recv_controller_rc_msg(&req, &rc,
					      working_cluster_rec) < 0)
		error("Error sending step complete RPC to slurmctld");
	jobacctinfo_destroy(msg.jobacct);
}
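/*
 * Hedged usage sketch (hypothetical caller, not in the SLURM tree):
 * once the local tasks have exited, report the step's aggregate
 * return code straight to slurmctld via the helper above.  The
 * wrapper name _step_finished() is assumed; <sys/wait.h> provides
 * the exit-status macros.
 */
static void _step_finished(srun_job_t *srun_job, int status)
{
	/* A real caller would also handle WIFSIGNALED(status). */
	if (WIFEXITED(status))
		_send_step_complete_rpc(srun_job, WEXITSTATUS(status));
}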
/*
 * Ask a slurmstepd, over an already-connected socket, for the current
 * accounting statistics of its step.
 *
 * On success resp->jobacct is filled in and must be freed by the
 * caller with jobacctinfo_destroy(); on failure it is set to NULL.
 * Returns SLURM_SUCCESS or SLURM_ERROR.
 */
int stepd_stat_jobacct(int fd, job_step_id_msg_t *sent,
		       job_step_stat_t *resp)
{
	int req = REQUEST_STEP_STAT;
	int rc = SLURM_SUCCESS;
	int tasks = 0;

	debug("Entering stepd_stat_jobacct for job %u.%u",
	      sent->job_id, sent->step_id);

	safe_write(fd, &req, sizeof(int));

	/* Receive the jobacct struct and return */
	resp->jobacct = jobacctinfo_create(NULL);

	/* Do not attempt reading data until there is something to read.
	 * Avoid locking the jobacct_gather plugin early and creating
	 * possible deadlock. */
	if (wait_fd_readable(fd, 300))
		goto rwfail;

	rc = jobacctinfo_getinfo(resp->jobacct, JOBACCT_DATA_PIPE, &fd);

	safe_read(fd, &tasks, sizeof(int));
	resp->num_tasks = tasks;

	return rc;
rwfail:
	error("gathering job accounting: %d", rc);
	jobacctinfo_destroy(resp->jobacct);
	resp->jobacct = NULL;
	return SLURM_ERROR;	/* never report success from the failure path */
}
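/*
 * Hedged usage sketch (hypothetical caller, not in the SLURM tree):
 * a client such as sstat would connect to the stepd's domain socket,
 * issue the stat request, and free the returned record.  The helper
 * name _print_step_stats() and the already-open stepd_fd are assumed.
 */
static void _print_step_stats(int stepd_fd, job_step_id_msg_t *id)
{
	job_step_stat_t resp;

	memset(&resp, 0, sizeof(resp));
	if (stepd_stat_jobacct(stepd_fd, id, &resp) != SLURM_SUCCESS)
		return;
	debug("step %u.%u reported %d task(s)",
	      id->job_id, id->step_id, (int)resp.num_tasks);
	jobacctinfo_destroy(resp.jobacct);	/* caller owns the record */
}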
static stepd_step_rec_t *
_step_setup(slurm_addr_t *cli, slurm_addr_t *self, slurm_msg_t *msg)
{
	stepd_step_rec_t *job = NULL;

	switch (msg->msg_type) {
	case REQUEST_BATCH_JOB_LAUNCH:
		debug2("setup for a batch_job");
		job = mgr_launch_batch_job_setup(msg->data, cli);
		break;
	case REQUEST_LAUNCH_TASKS:
		debug2("setup for a launch_task");
		job = mgr_launch_tasks_setup(msg->data, cli, self,
					     msg->protocol_version);
		break;
	default:
		fatal("handle_launch_message: Unrecognized launch RPC");
		break;
	}

	if (!job) {
		error("_step_setup: no job returned");
		return NULL;
	}

	job->jmgr_pid = getpid();
	job->jobacct = jobacctinfo_create(NULL);

	/* Establish GRES environment variables */
	if (conf->debug_flags & DEBUG_FLAG_GRES) {
		gres_plugin_job_state_log(job->job_gres_list, job->jobid);
		gres_plugin_step_state_log(job->step_gres_list, job->jobid,
					   job->stepid);
	}
	if (msg->msg_type == REQUEST_BATCH_JOB_LAUNCH)
		gres_plugin_job_set_env(&job->env, job->job_gres_list, 0);
	else if (msg->msg_type == REQUEST_LAUNCH_TASKS)
		gres_plugin_step_set_env(&job->env, job->step_gres_list, 0);

	/*
	 * Add slurmd node topology information to the job env array
	 */
	env_array_overwrite(&job->env, "SLURM_TOPOLOGY_ADDR",
			    conf->node_topo_addr);
	env_array_overwrite(&job->env, "SLURM_TOPOLOGY_ADDR_PATTERN",
			    conf->node_topo_pattern);

	set_msg_node_id(job);

	return job;
}
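/*
 * Hedged usage sketch (hypothetical, not in the SLURM tree): a task
 * launched under this step can read the topology variables exported
 * above.  The example values in the comment are illustrative; getenv()
 * and printf() come from <stdlib.h> and <stdio.h>.
 */
static void _print_topology(void)
{
	const char *addr = getenv("SLURM_TOPOLOGY_ADDR");
	const char *pat = getenv("SLURM_TOPOLOGY_ADDR_PATTERN");

	/* e.g. addr = "sw0.sw1.node12", pat = "switch.switch.node" */
	if (addr && pat)
		printf("topology: %s (%s)\n", addr, pat);
}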
static int _handle_stat_jobacct(int fd, stepd_step_rec_t *job, uid_t uid)
{
	jobacctinfo_t *jobacct = NULL;
	jobacctinfo_t *temp_jobacct = NULL;
	int i = 0;
	int num_tasks = 0;

	debug("_handle_stat_jobacct for job %u.%u",
	      job->jobid, job->stepid);
	debug3(" uid = %d", uid);

	if (uid != job->uid && !_slurm_authorized_user(uid)) {
		debug("stat jobacct from uid %ld for job %u.%u "
		      "owned by uid %ld",
		      (long)uid, job->jobid, job->stepid, (long)job->uid);
		/* Send a NULL jobacct back to the unauthorized requester */
		jobacctinfo_setinfo(jobacct, JOBACCT_DATA_PIPE, &fd,
				    SLURM_PROTOCOL_VERSION);
		return SLURM_ERROR;
	}

	jobacct = jobacctinfo_create(NULL);
	debug3("num tasks = %d", job->node_tasks);

	/* Poll each live task and fold its usage into a single record */
	for (i = 0; i < job->node_tasks; i++) {
		temp_jobacct = jobacct_gather_stat_task(job->task[i]->pid);
		if (temp_jobacct) {
			jobacctinfo_aggregate(jobacct, temp_jobacct);
			jobacctinfo_destroy(temp_jobacct);
			num_tasks++;
		}
	}

	jobacctinfo_setinfo(jobacct, JOBACCT_DATA_PIPE, &fd,
			    SLURM_PROTOCOL_VERSION);
	safe_write(fd, &num_tasks, sizeof(int));
	jobacctinfo_destroy(jobacct);
	return SLURM_SUCCESS;

rwfail:
	jobacctinfo_destroy(jobacct);
	return SLURM_ERROR;
}
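/*
 * Hedged sketch (not in the SLURM tree): the aggregate-and-destroy
 * loop above is the general pattern for reducing per-task samples
 * into one step-level record.  The helper name _aggregate_tasks()
 * is assumed.
 */
static jobacctinfo_t *_aggregate_tasks(stepd_step_rec_t *job)
{
	jobacctinfo_t *total = jobacctinfo_create(NULL);
	int i;

	for (i = 0; i < job->node_tasks; i++) {
		jobacctinfo_t *one =
			jobacct_gather_stat_task(job->task[i]->pid);
		if (!one)
			continue;	/* task already exited */
		jobacctinfo_aggregate(total, one);	/* fold sums/maxima */
		jobacctinfo_destroy(one);
	}
	return total;	/* caller frees with jobacctinfo_destroy() */
}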
extern int jobacct_gather_add_task(pid_t pid, jobacct_id_t *jobacct_id,
				   int poll)
{
	struct jobacctinfo *jobacct;

	if (jobacct_gather_init() < 0)
		return SLURM_ERROR;

	if (!plugin_polling)
		return SLURM_SUCCESS;

	if (_jobacct_shutdown_test())
		return SLURM_ERROR;

	jobacct = jobacctinfo_create(jobacct_id);

	slurm_mutex_lock(&task_list_lock);
	if (pid <= 0) {
		error("invalid pid given (%d) for task acct", pid);
		goto error;
	} else if (!task_list) {
		error("no task list created!");
		goto error;
	}

	jobacct->pid = pid;
	memcpy(&jobacct->id, jobacct_id, sizeof(jobacct_id_t));
	jobacct->min_cpu = 0;
	debug2("adding task %u pid %d on node %u to jobacct",
	       jobacct_id->taskid, pid, jobacct_id->nodeid);
	list_push(task_list, jobacct);
	slurm_mutex_unlock(&task_list_lock);

	(*(ops.add_task))(pid, jobacct_id);

	if (poll == 1)
		_poll_data(1);

	return SLURM_SUCCESS;

error:
	slurm_mutex_unlock(&task_list_lock);
	jobacctinfo_destroy(jobacct);
	return SLURM_ERROR;
}
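/*
 * Hedged usage sketch (hypothetical, not in the SLURM tree): after
 * forking a task, the step manager registers the child pid so the
 * jobacct_gather plugin starts sampling it.  The helper name
 * _track_new_task() and the taskid/nodeid values are placeholders.
 */
static int _track_new_task(pid_t child, uint32_t taskid, uint32_t nodeid)
{
	jobacct_id_t id;

	memset(&id, 0, sizeof(id));
	id.taskid = taskid;
	id.nodeid = nodeid;
	/* poll=1 requests an immediate sample after registration */
	return jobacct_gather_add_task(child, &id, 1);
}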
static int _parse_comp_file(char *file, step_update_request_msg_t *step_msg)
{
	int i;
	FILE *fd = fopen(file, "r");
	char line[BUFFER_SIZE];
	char *fptr;
	int version;
	char *update[MAX_RECORD_FIELDS + 1];	/* End list with null entry and,
						 * possibly, more data than we
						 * expected */
	enum {
		UPDATE_STEP_VERSION,
		UPDATE_STEP_EXTRA,
		UPDATE_STEP_INBLOCKS,
		UPDATE_STEP_OUTBLOCKS,
		UPDATE_STEP_EXITCODE,
		UPDATE_STEP_CPU_ALLOC,
		UPDATE_STEP_START,
		UPDATE_STEP_END,
		UPDATE_STEP_USER_SEC,
		UPDATE_STEP_SYS_SEC,
		UPDATE_STEP_MAX_RSS,
		UPDATE_STEP_UID,
		UPDATE_STEP_STEPNAME,
		UPDATE_STEP_VER1_LENGTH
	};

	if (fd == NULL) {
		perror(file);
		return SLURM_ERROR;
	}

	if (!fgets(line, BUFFER_SIZE, fd)) {
		fprintf(stderr, "Empty step update completion file\n");
		fclose(fd);	/* do not leak the stream on error */
		return SLURM_ERROR;
	}
	fclose(fd);	/* single record read; release the file */

	fptr = line;	/* break the record into NULL-terminated strings */
	for (i = 0; i < MAX_RECORD_FIELDS; i++) {
		update[i] = fptr;
		fptr = strstr(fptr, " ");
		if (fptr == NULL) {
			fptr = strstr(update[i], "\n");
			if (fptr)
				*fptr = 0;
			break;
		} else
			*fptr++ = 0;
	}
	if (i < MAX_RECORD_FIELDS)
		i++;
	update[i] = 0;

	version = atoi(update[UPDATE_STEP_VERSION]);
	switch (version) {
	case 1:
		if (i != UPDATE_STEP_VER1_LENGTH) {
			fprintf(stderr,
				"Bad step update completion file length\n");
			return SLURM_ERROR;
		}
		step_msg->jobacct = jobacctinfo_create(NULL);
		step_msg->exit_code = atoi(update[UPDATE_STEP_EXITCODE]);
		step_msg->start_time = atoi(update[UPDATE_STEP_START]);
		step_msg->end_time = atoi(update[UPDATE_STEP_END]);
		step_msg->jobacct->user_cpu_sec =
			atoi(update[UPDATE_STEP_USER_SEC]);
		step_msg->jobacct->sys_cpu_sec =
			atoi(update[UPDATE_STEP_SYS_SEC]);
		step_msg->jobacct->min_cpu =
			step_msg->jobacct->user_cpu_sec +
			step_msg->jobacct->sys_cpu_sec;
		step_msg->jobacct->max_rss =
			atoi(update[UPDATE_STEP_MAX_RSS]);
		step_msg->name = xstrdup(xbasename(
			update[UPDATE_STEP_STEPNAME]));
		break;
	default:
		fprintf(stderr, "Unsupported step update "
			"completion file version: %d\n", version);
		return SLURM_ERROR;
	}

	return SLURM_SUCCESS;
}
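/*
 * Illustrative version-1 record layout, reconstructed from the enum
 * above (the example values are made up).  Fields are single-space
 * separated, one record per file:
 *
 *   version extra inblocks outblocks exitcode cpu_alloc start end \
 *   user_sec sys_sec max_rss uid stepname
 *
 * e.g.:
 *   1 0 0 0 0 4 1609459200 1609462800 3500 120 204800 1000 step0
 */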
/*
 * This function handles the initialization information from slurmd
 * sent by _send_slurmstepd_init() in src/slurmd/slurmd/req.c.
 */
static int
_init_from_slurmd(int sock, char **argv,
		  slurm_addr_t **_cli, slurm_addr_t **_self,
		  slurm_msg_t **_msg, int *_ngids, gid_t **_gids)
{
	char *incoming_buffer = NULL;
	Buf buffer;
	int step_type;
	int len, proto;
	slurm_addr_t *cli = NULL;
	slurm_addr_t *self = NULL;
	slurm_msg_t *msg = NULL;
	int ngids = 0;
	gid_t *gids = NULL;
	uint16_t port;
	char buf[16];
	log_options_t lopts = LOG_OPTS_INITIALIZER;

	log_init(argv[0], lopts, LOG_DAEMON, NULL);

	/* receive job type from slurmd */
	safe_read(sock, &step_type, sizeof(int));
	debug3("step_type = %d", step_type);

	/* receive reverse-tree info from slurmd */
	slurm_mutex_lock(&step_complete.lock);
	safe_read(sock, &step_complete.rank, sizeof(int));
	safe_read(sock, &step_complete.parent_rank, sizeof(int));
	safe_read(sock, &step_complete.children, sizeof(int));
	safe_read(sock, &step_complete.depth, sizeof(int));
	safe_read(sock, &step_complete.max_depth, sizeof(int));
	safe_read(sock, &step_complete.parent_addr, sizeof(slurm_addr_t));
	step_complete.bits = bit_alloc(step_complete.children);
	step_complete.jobacct = jobacctinfo_create(NULL);
	slurm_mutex_unlock(&step_complete.lock);

	/* receive conf from slurmd */
	if ((conf = read_slurmd_conf_lite(sock)) == NULL)
		fatal("Failed to read conf from slurmd");
	log_alter(conf->log_opts, 0, conf->logfile);
	log_set_timefmt(conf->log_fmt);

	debug2("debug level is %d.", conf->debug_level);

	switch_g_slurmd_step_init();

	slurm_get_ip_str(&step_complete.parent_addr, &port, buf, 16);
	debug3("slurmstepd rank %d, parent address = %s, port = %u",
	       step_complete.rank, buf, port);

	/* receive cli from slurmd */
	safe_read(sock, &len, sizeof(int));
	incoming_buffer = xmalloc(sizeof(char) * len);
	safe_read(sock, incoming_buffer, len);
	buffer = create_buf(incoming_buffer, len);
	cli = xmalloc(sizeof(slurm_addr_t));
	if (slurm_unpack_slurm_addr_no_alloc(cli, buffer) == SLURM_ERROR)
		fatal("slurmstepd: problem with unpack of cli address");
	free_buf(buffer);

	/* receive self from slurmd */
	safe_read(sock, &len, sizeof(int));
	if (len > 0) {
		/* receive packed self from main slurmd */
		incoming_buffer = xmalloc(sizeof(char) * len);
		safe_read(sock, incoming_buffer, len);
		buffer = create_buf(incoming_buffer, len);
		self = xmalloc(sizeof(slurm_addr_t));
		if (slurm_unpack_slurm_addr_no_alloc(self, buffer) ==
		    SLURM_ERROR) {
			fatal("slurmstepd: problem with unpack of "
			      "self address");
		}
		free_buf(buffer);
	}

	/* Receive GRES information from slurmd */
	gres_plugin_recv_stepd(sock);

	/* Grab the slurmd's spooldir. Has %n expanded. */
	cpu_freq_init(conf);

	/* Receive cpu_frequency info from slurmd */
	cpu_freq_recv_info(sock);

	/* get the protocol version of the srun */
	safe_read(sock, &proto, sizeof(int));

	/* receive req from slurmd */
	safe_read(sock, &len, sizeof(int));
	incoming_buffer = xmalloc(sizeof(char) * len);
	safe_read(sock, incoming_buffer, len);
	buffer = create_buf(incoming_buffer, len);

	msg = xmalloc(sizeof(slurm_msg_t));
	slurm_msg_t_init(msg);
	msg->protocol_version = (uint16_t)proto;

	switch (step_type) {
	case LAUNCH_BATCH_JOB:
		msg->msg_type = REQUEST_BATCH_JOB_LAUNCH;
		break;
	case LAUNCH_TASKS:
		msg->msg_type = REQUEST_LAUNCH_TASKS;
		break;
	default:
		fatal("%s: Unrecognized launch RPC (%d)",
		      __func__, step_type);
		break;
	}
	if (unpack_msg(msg, buffer) == SLURM_ERROR)
		fatal("slurmstepd: we didn't unpack the request correctly");
	free_buf(buffer);

	/* receive cached group ids array for the relevant uid */
	safe_read(sock, &ngids, sizeof(int));
	if (ngids > 0) {
		int i;
		uint32_t tmp32;

		gids = (gid_t *)xmalloc(sizeof(gid_t) * ngids);
		for (i = 0; i < ngids; i++) {
			safe_read(sock, &tmp32, sizeof(uint32_t));
			gids[i] = (gid_t)tmp32;
			debug2("got gid %d", gids[i]);
		}
	}

	*_cli = cli;
	*_self = self;
	*_msg = msg;
	*_ngids = ngids;
	*_gids = gids;

	return 1;

rwfail:
	fatal("Error reading initialization data from slurmd");
	exit(1);
}
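/*
 * Hedged sketch (hypothetical, not the real _send_slurmstepd_init()):
 * the slurmd side must write exactly what _init_from_slurmd() reads,
 * in the same order.  Only the leading fields are shown; cli/self/req
 * follow the same length-prefixed buffer pattern seen above.
 */
static void _send_stepd_init_sketch(int sock, int step_type,
				    int rank, int parent_rank)
{
	safe_write(sock, &step_type, sizeof(int));	/* job type */
	safe_write(sock, &rank, sizeof(int));		/* reverse-tree info */
	safe_write(sock, &parent_rank, sizeof(int));
	/* ... children, depth, max_depth, parent_addr, conf, cli,
	 * self, GRES, cpu_freq, srun protocol, req, gids ... */
	return;
rwfail:
	error("sending slurmstepd init data");
}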