static void _save_dbd_state(void) { char *dbd_fname; Buf buffer; int fd, rc, wrote = 0; uint16_t msg_type; uint32_t offset; dbd_fname = slurm_get_state_save_location(); xstrcat(dbd_fname, "/dbd.messages"); (void) unlink(dbd_fname); /* clear save state */ fd = open(dbd_fname, O_WRONLY | O_CREAT | O_TRUNC, 0600); if (fd < 0) { error("slurmdbd: Creating state save file %s", dbd_fname); } else if (agent_list && list_count(agent_list)) { char curr_ver_str[10]; snprintf(curr_ver_str, sizeof(curr_ver_str), "VER%d", SLURM_PROTOCOL_VERSION); buffer = init_buf(strlen(curr_ver_str)); packstr(curr_ver_str, buffer); rc = _save_dbd_rec(fd, buffer); free_buf(buffer); if (rc != SLURM_SUCCESS) goto end_it; while ((buffer = list_dequeue(agent_list))) { /* * We do not want to store registration messages. If an * admin puts in an incorrect cluster name we can get a * deadlock unless they add the bogus cluster name to * the accounting system. */ offset = get_buf_offset(buffer); if (offset < 2) { free_buf(buffer); continue; } set_buf_offset(buffer, 0); (void) unpack16(&msg_type, buffer); /* checked by offset */ set_buf_offset(buffer, offset); if (msg_type == DBD_REGISTER_CTLD) { free_buf(buffer); continue; } rc = _save_dbd_rec(fd, buffer); free_buf(buffer); if (rc != SLURM_SUCCESS) break; wrote++; } } end_it: if (fd >= 0) { verbose("slurmdbd: saved %d pending RPCs", wrote); (void) close(fd); } xfree(dbd_fname); }
/* Load jobcomp data from save state file */ static int _load_pending_jobs(void) { int i, rc = SLURM_SUCCESS; char *saved_data = NULL, *state_file = NULL, *job_data = NULL; uint32_t data_size, job_cnt = 0, tmp32 = 0; Buf buffer; struct job_node *jnode; state_file = slurm_get_state_save_location(); if (state_file == NULL) { error("%s: Could not retrieve StateSaveLocation from conf", plugin_type); return SLURM_ERROR; } if (state_file[strlen(state_file) - 1] != '/') xstrcat(state_file, "/"); xstrcat(state_file, save_state_file); slurm_mutex_lock(&save_lock); data_size = _read_file(state_file, &saved_data); if ((data_size <= 0) || (saved_data == NULL)) { slurm_mutex_unlock(&save_lock); xfree(saved_data); xfree(state_file); return rc; } slurm_mutex_unlock(&save_lock); buffer = create_buf(saved_data, data_size); safe_unpack32(&job_cnt, buffer); for (i = 0; i < job_cnt; i++) { safe_unpackstr_xmalloc(&job_data, &tmp32, buffer); jnode = xmalloc(sizeof(struct job_node)); jnode->serialized_job = job_data; list_enqueue(jobslist, jnode); } if (job_cnt > 0) { if (slurm_get_debug_flags() & DEBUG_FLAG_ESEARCH) info("%s: Loaded %u jobs from state file", plugin_type, job_cnt); } free_buf(buffer); xfree(state_file); return rc; unpack_error: error("%s: Error unpacking file %s", plugin_type, state_file); free_buf(buffer); xfree(state_file); return SLURM_ERROR; }
/* Saves the state of all jobcomp data for further indexing retries */ static int _save_state(void) { int fd, rc = SLURM_SUCCESS; char *state_file, *new_file, *old_file; ListIterator iter; static int high_buffer_size = (1024 * 1024); Buf buffer = init_buf(high_buffer_size); uint32_t job_cnt; struct job_node *jnode; job_cnt = list_count(jobslist); pack32(job_cnt, buffer); iter = list_iterator_create(jobslist); while ((jnode = (struct job_node *)list_next(iter))) { packstr(jnode->serialized_job, buffer); } list_iterator_destroy(iter); state_file = slurm_get_state_save_location(); if (state_file == NULL || state_file[0] == '\0') { error("%s: Could not retrieve StateSaveLocation from conf", plugin_type); return SLURM_ERROR; } if (state_file[strlen(state_file) - 1] != '/') xstrcat(state_file, "/"); xstrcat(state_file, save_state_file); old_file = xstrdup(state_file); new_file = xstrdup(state_file); xstrcat(new_file, ".new"); xstrcat(old_file, ".old"); slurm_mutex_lock(&save_lock); fd = open(new_file, O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR); if (fd < 0) { error("%s: Can't save jobcomp state, open file %s error %m", plugin_type, new_file); rc = SLURM_ERROR; } else { int pos = 0, nwrite, amount, rc2; char *data; fd_set_close_on_exec(fd); nwrite = get_buf_offset(buffer); data = (char *) get_buf_data(buffer); high_buffer_size = MAX(nwrite, high_buffer_size); while (nwrite > 0) { amount = write(fd, &data[pos], nwrite); if ((amount < 0) && (errno != EINTR)) { error("%s: Error writing file %s, %m", plugin_type, new_file); rc = SLURM_ERROR; break; } nwrite -= amount; pos += amount; } if ((rc2 = fsync_and_close(fd, save_state_file))) rc = rc2; } if (rc == SLURM_ERROR) (void) unlink(new_file); else { (void) unlink(old_file); if (link(state_file, old_file)) { error("%s: Unable to create link for %s -> %s: %m", plugin_type, state_file, old_file); rc = SLURM_ERROR; } (void) unlink(state_file); if (link(new_file, state_file)) { error("%s: Unable to create link for %s -> %s: %m", plugin_type, new_file, state_file); rc = SLURM_ERROR; } (void) unlink(new_file); } xfree(old_file); xfree(state_file); xfree(new_file); slurm_mutex_unlock(&save_lock); free_buf(buffer); return rc; }
static void _load_dbd_state(void) { char *dbd_fname; Buf buffer; int fd, recovered = 0; uint16_t rpc_version = 0; dbd_fname = slurm_get_state_save_location(); xstrcat(dbd_fname, "/dbd.messages"); fd = open(dbd_fname, O_RDONLY); if (fd < 0) { /* don't print an error message if there is no file */ if (errno == ENOENT) debug4("slurmdbd: There is no state save file to " "open by name %s", dbd_fname); else error("slurmdbd: Opening state save file %s: %m", dbd_fname); } else { char *ver_str = NULL; uint32_t ver_str_len; buffer = _load_dbd_rec(fd); if (buffer == NULL) goto end_it; /* This is set to the end of the buffer for send so we need to set it back to 0 */ set_buf_offset(buffer, 0); safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer); debug3("Version string in dbd_state header is %s", ver_str); unpack_error: free_buf(buffer); buffer = NULL; if (ver_str) { /* get the version after VER */ rpc_version = slurm_atoul(ver_str + 3); xfree(ver_str); } while (1) { /* If the buffer was not the VER%d string it was an actual message so we don't want to skip it. */ if (!buffer) buffer = _load_dbd_rec(fd); if (buffer == NULL) break; if (rpc_version != SLURM_PROTOCOL_VERSION) { /* unpack and repack with new * PROTOCOL_VERSION just so we keep * things up to date. */ slurmdbd_msg_t msg; int rc; set_buf_offset(buffer, 0); rc = unpack_slurmdbd_msg( &msg, rpc_version, buffer); free_buf(buffer); if (rc == SLURM_SUCCESS) buffer = pack_slurmdbd_msg( &msg, SLURM_PROTOCOL_VERSION); else buffer = NULL; } if (!buffer) { error("no buffer given"); continue; } if (!list_enqueue(agent_list, buffer)) fatal("slurmdbd: list_enqueue, no memory"); recovered++; buffer = NULL; } end_it: verbose("slurmdbd: recovered %d pending RPCs", recovered); (void) close(fd); } xfree(dbd_fname); }