/* dump_all_part_state - save the state of all partitions to file */ int dump_all_part_state(void) { /* Save high-water mark to avoid buffer growth with copies */ static int high_buffer_size = BUF_SIZE; ListIterator part_iterator; struct part_record *part_ptr; int error_code = 0, log_fd; char *old_file, *new_file, *reg_file; /* Locks: Read partition */ slurmctld_lock_t part_read_lock = { READ_LOCK, NO_LOCK, NO_LOCK, READ_LOCK }; Buf buffer = init_buf(high_buffer_size); DEF_TIMERS; START_TIMER; /* write header: time */ packstr(PART_STATE_VERSION, buffer); pack_time(time(NULL), buffer); /* write partition records to buffer */ lock_slurmctld(part_read_lock); part_iterator = list_iterator_create(part_list); while ((part_ptr = (struct part_record *) list_next(part_iterator))) { xassert (part_ptr->magic == PART_MAGIC); _dump_part_state(part_ptr, buffer); } list_iterator_destroy(part_iterator); old_file = xstrdup(slurmctld_conf.state_save_location); xstrcat(old_file, "/part_state.old"); reg_file = xstrdup(slurmctld_conf.state_save_location); xstrcat(reg_file, "/part_state"); new_file = xstrdup(slurmctld_conf.state_save_location); xstrcat(new_file, "/part_state.new"); unlock_slurmctld(part_read_lock); /* write the buffer to file */ lock_state_files(); log_fd = creat(new_file, 0600); if (log_fd < 0) { error("Can't save state, error creating file %s, %m", new_file); error_code = errno; } else { int pos = 0, nwrite = get_buf_offset(buffer), amount, rc; char *data = (char *)get_buf_data(buffer); high_buffer_size = MAX(nwrite, high_buffer_size); while (nwrite > 0) { amount = write(log_fd, &data[pos], nwrite); if ((amount < 0) && (errno != EINTR)) { error("Error writing file %s, %m", new_file); error_code = errno; break; } nwrite -= amount; pos += amount; } rc = fsync_and_close(log_fd, "partition"); if (rc && !error_code) error_code = rc; } if (error_code) (void) unlink(new_file); else { /* file shuffle */ (void) unlink(old_file); if (link(reg_file, old_file)) { debug4("unable to create link for %s -> %s: %m", reg_file, old_file); } (void) unlink(reg_file); if (link(new_file, reg_file)) { debug4("unable to create link for %s -> %s: %m", new_file, reg_file); } (void) unlink(new_file); } xfree(old_file); xfree(reg_file); xfree(new_file); unlock_state_files(); free_buf(buffer); END_TIMER2("dump_all_part_state"); return 0; }
/* Saves the state of all jobcomp data for further indexing retries */ static int _save_state(void) { int fd, rc = SLURM_SUCCESS; char *state_file, *new_file, *old_file; ListIterator iter; static int high_buffer_size = (1024 * 1024); Buf buffer = init_buf(high_buffer_size); uint32_t job_cnt; struct job_node *jnode; job_cnt = list_count(jobslist); pack32(job_cnt, buffer); iter = list_iterator_create(jobslist); while ((jnode = (struct job_node *)list_next(iter))) { packstr(jnode->serialized_job, buffer); } list_iterator_destroy(iter); state_file = slurm_get_state_save_location(); if (state_file == NULL || state_file[0] == '\0') { error("%s: Could not retrieve StateSaveLocation from conf", plugin_type); return SLURM_ERROR; } if (state_file[strlen(state_file) - 1] != '/') xstrcat(state_file, "/"); xstrcat(state_file, save_state_file); old_file = xstrdup(state_file); new_file = xstrdup(state_file); xstrcat(new_file, ".new"); xstrcat(old_file, ".old"); slurm_mutex_lock(&save_lock); fd = open(new_file, O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR); if (fd < 0) { error("%s: Can't save jobcomp state, open file %s error %m", plugin_type, new_file); rc = SLURM_ERROR; } else { int pos = 0, nwrite, amount, rc2; char *data; fd_set_close_on_exec(fd); nwrite = get_buf_offset(buffer); data = (char *) get_buf_data(buffer); high_buffer_size = MAX(nwrite, high_buffer_size); while (nwrite > 0) { amount = write(fd, &data[pos], nwrite); if ((amount < 0) && (errno != EINTR)) { error("%s: Error writing file %s, %m", plugin_type, new_file); rc = SLURM_ERROR; break; } nwrite -= amount; pos += amount; } if ((rc2 = fsync_and_close(fd, save_state_file))) rc = rc2; } if (rc == SLURM_ERROR) (void) unlink(new_file); else { (void) unlink(old_file); if (link(state_file, old_file)) { error("%s: Unable to create link for %s -> %s: %m", plugin_type, state_file, old_file); rc = SLURM_ERROR; } (void) unlink(state_file); if (link(new_file, state_file)) { error("%s: Unable to create link for %s -> %s: %m", plugin_type, new_file, state_file); rc = SLURM_ERROR; } (void) unlink(new_file); } xfree(old_file); xfree(state_file); xfree(new_file); slurm_mutex_unlock(&save_lock); free_buf(buffer); return rc; }
static void *_heartbeat_thread(void *no_data) { /* * The frequency needs to be faster than slurmctld_timeout, * or the backup controller may try to assume control. * One-fourth is very conservative, one-half should be sufficient. * Have it happen at least every 30 seconds if the timeout is quite * large. */ int beat = MIN(slurmctld_conf.slurmctld_timeout / 4, 30); time_t now; uint64_t nl; struct timespec ts = {0, 0}; char *reg_file, *new_file; int fd; debug("Heartbeat thread started, beating every %d seconds.", beat); slurm_mutex_lock(&heartbeat_mutex); while (heart_beating) { now = time(NULL); ts.tv_sec = now + beat; debug3("Heartbeat at %ld", now); /* * Rebuild file path each beat just in case someone changes * StateSaveLocation and runs reconfigure. */ reg_file = xstrdup_printf("%s/heartbeat", slurmctld_conf.state_save_location); new_file = xstrdup_printf("%s.new", reg_file); nl = HTON_uint64((uint64_t) now); fd = open(new_file, O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0600); if (fd < 0) { error("%s: heartbeat file creation failed to %s.", __func__, new_file); goto delay; } if (write(fd, &nl, sizeof(uint64_t)) != sizeof(uint64_t)) { error("%s: heartbeat write failed to %s.", __func__, new_file); close(fd); (void) unlink(new_file); goto delay; } if (write(fd, &backup_inx, sizeof(int)) != sizeof(int)) { error("%s: heartbeat write failed to %s.", __func__, new_file); close(fd); (void) unlink(new_file); goto delay; } if (fsync_and_close(fd, "heartbeat")) { (void) unlink(new_file); goto delay; } /* shuffle files around */ (void) unlink(reg_file); if (link(new_file, reg_file)) debug("%s: unable to create link for %s -> %s, %m", __func__, new_file, reg_file); (void) unlink(new_file); delay: xfree(reg_file); xfree(new_file); slurm_cond_timedwait(&heartbeat_cond, &heartbeat_mutex, &ts); } slurm_mutex_unlock(&heartbeat_mutex); return NULL; }
/* dump_all_front_end_state - save the state of all front_end nodes to file */ extern int dump_all_front_end_state(void) { #ifdef HAVE_FRONT_END /* Save high-water mark to avoid buffer growth with copies */ static int high_buffer_size = (1024 * 1024); int error_code = 0, i, log_fd; char *old_file, *new_file, *reg_file; front_end_record_t *front_end_ptr; /* Locks: Read config and node */ slurmctld_lock_t node_read_lock = { READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK }; Buf buffer = init_buf(high_buffer_size); DEF_TIMERS; START_TIMER; /* write header: version, time */ packstr(FRONT_END_STATE_VERSION, buffer); pack_time(time(NULL), buffer); /* write node records to buffer */ lock_slurmctld (node_read_lock); for (i = 0, front_end_ptr = front_end_nodes; i < front_end_node_cnt; i++, front_end_ptr++) { xassert(front_end_ptr->magic == FRONT_END_MAGIC); _dump_front_end_state(front_end_ptr, buffer); } old_file = xstrdup (slurmctld_conf.state_save_location); xstrcat (old_file, "/front_end_state.old"); reg_file = xstrdup (slurmctld_conf.state_save_location); xstrcat (reg_file, "/front_end_state"); new_file = xstrdup (slurmctld_conf.state_save_location); xstrcat (new_file, "/front_end_state.new"); unlock_slurmctld (node_read_lock); /* write the buffer to file */ lock_state_files(); log_fd = creat (new_file, 0600); if (log_fd < 0) { error ("Can't save state, error creating file %s %m", new_file); error_code = errno; } else { int pos = 0, nwrite = get_buf_offset(buffer), amount, rc; char *data = (char *)get_buf_data(buffer); high_buffer_size = MAX(nwrite, high_buffer_size); while (nwrite > 0) { amount = write(log_fd, &data[pos], nwrite); if ((amount < 0) && (errno != EINTR)) { error("Error writing file %s, %m", new_file); error_code = errno; break; } nwrite -= amount; pos += amount; } rc = fsync_and_close(log_fd, "front_end"); if (rc && !error_code) error_code = rc; } if (error_code) (void) unlink (new_file); else { /* file shuffle */ (void) unlink (old_file); if (link(reg_file, old_file)) debug4("unable to create link for %s -> %s: %m", reg_file, old_file); (void) unlink (reg_file); if (link(new_file, reg_file)) debug4("unable to create link for %s -> %s: %m", new_file, reg_file); (void) unlink (new_file); } xfree (old_file); xfree (reg_file); xfree (new_file); unlock_state_files (); free_buf (buffer); END_TIMER2("dump_all_front_end_state"); return error_code; #else return SLURM_SUCCESS; #endif }
static void _dump_sicp_state(void) { char *old_file, *new_file, *reg_file; ListIterator sicp_iterator; sicp_job_t *sicp_ptr; Buf buffer; time_t now = time(NULL); int error_code = SLURM_SUCCESS, len, log_fd; pthread_mutex_lock(&sicp_lock); len = list_count(sicp_job_list) * 4 + 128; buffer = init_buf(len); packstr("PROTOCOL_VERSION", buffer); pack16(SLURM_PROTOCOL_VERSION, buffer); pack_time(now, buffer); sicp_iterator = list_iterator_create(sicp_job_list); while ((sicp_ptr = (sicp_job_t *) list_next(sicp_iterator))) { pack32(sicp_ptr->job_id, buffer); pack16(sicp_ptr->job_state, buffer); } list_iterator_destroy(sicp_iterator); pthread_mutex_unlock(&sicp_lock); old_file = xstrdup(slurmctld_conf.state_save_location); xstrcat(old_file, "/sicp_state.old"); reg_file = xstrdup(slurmctld_conf.state_save_location); xstrcat(reg_file, "/sicp_state"); new_file = xstrdup(slurmctld_conf.state_save_location); xstrcat(new_file, "/sicp_state.new"); lock_state_files(); log_fd = creat(new_file, 0600); if (log_fd < 0) { error("Can't save state, create file %s error %m", new_file); error_code = errno; } else { int pos = 0, nwrite, amount, rc; char *data; fd_set_close_on_exec(log_fd); nwrite = get_buf_offset(buffer); data = (char *)get_buf_data(buffer); while (nwrite > 0) { amount = write(log_fd, &data[pos], nwrite); if ((amount < 0) && (errno != EINTR)) { error("Error writing file %s, %m", new_file); error_code = errno; break; } nwrite -= amount; pos += amount; } rc = fsync_and_close(log_fd, "sicp"); if (rc && !error_code) error_code = rc; } if (error_code) { (void) unlink(new_file); } else { /* file shuffle */ (void) unlink(old_file); if (link(reg_file, old_file)) debug4("unable to create link for %s -> %s: %m", reg_file, old_file); (void) unlink(reg_file); if (link(new_file, reg_file)) debug4("unable to create link for %s -> %s: %m", new_file, reg_file); (void) unlink(new_file); } xfree(old_file); xfree(reg_file); xfree(new_file); unlock_state_files(); free_buf(buffer); }