Exemple #1
0
/*
 * dump_all_part_state - save the state of all partitions to file
 *
 * Packs a header (PART_STATE_VERSION string and the current time) followed
 * by one record per entry of part_list, writes the result to
 * "<state_save_location>/part_state.new" and then rotates the files so the
 * previous "part_state" is kept as "part_state.old".
 *
 * RET 0 on success, otherwise the errno of the failed creat()/write() or the
 *     non-zero value returned by fsync_and_close()
 */
int dump_all_part_state(void)
{
	/* Save high-water mark to avoid buffer growth with copies */
	static int high_buffer_size = BUF_SIZE;
	ListIterator part_iterator;
	struct part_record *part_ptr;
	int error_code = 0, log_fd;
	char *old_file, *new_file, *reg_file;
	/* Locks: Read partition */
	slurmctld_lock_t part_read_lock =
	    { READ_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
	Buf buffer = init_buf(high_buffer_size);
	DEF_TIMERS;

	START_TIMER;
	/* write header: version, time */
	packstr(PART_STATE_VERSION, buffer);
	pack_time(time(NULL), buffer);

	/* write partition records to buffer */
	lock_slurmctld(part_read_lock);
	part_iterator = list_iterator_create(part_list);
	while ((part_ptr = (struct part_record *) list_next(part_iterator))) {
		xassert (part_ptr->magic == PART_MAGIC);
		_dump_part_state(part_ptr, buffer);
	}
	list_iterator_destroy(part_iterator);

	/* build the file names while the configuration is still locked */
	old_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(old_file, "/part_state.old");
	reg_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(reg_file, "/part_state");
	new_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(new_file, "/part_state.new");
	unlock_slurmctld(part_read_lock);

	/* write the buffer to file */
	lock_state_files();
	log_fd = creat(new_file, 0600);
	if (log_fd < 0) {
		/* capture errno before logging can overwrite it */
		error_code = errno;
		error("Can't save state, error creating file %s, %m",
		      new_file);
	} else {
		int pos = 0, nwrite = get_buf_offset(buffer), amount, rc;
		char *data = (char *)get_buf_data(buffer);
		high_buffer_size = MAX(nwrite, high_buffer_size);
		while (nwrite > 0) {
			amount = write(log_fd, &data[pos], nwrite);
			if (amount < 0) {
				/*
				 * An interrupted write transferred nothing;
				 * retry without touching pos/nwrite.
				 */
				if (errno == EINTR)
					continue;
				error_code = errno;
				error("Error writing file %s, %m", new_file);
				break;
			}
			nwrite -= amount;
			pos    += amount;
		}

		/* always close the descriptor; keep the first error seen */
		rc = fsync_and_close(log_fd, "partition");
		if (rc && !error_code)
			error_code = rc;
	}
	if (error_code)
		(void) unlink(new_file);
	else {			/* file shuffle */
		(void) unlink(old_file);
		if (link(reg_file, old_file)) {
			debug4("unable to create link for %s -> %s: %m",
			       reg_file, old_file);
		}
		(void) unlink(reg_file);
		if (link(new_file, reg_file)) {
			debug4("unable to create link for %s -> %s: %m",
			       new_file, reg_file);
		}
		(void) unlink(new_file);
	}
	xfree(old_file);
	xfree(reg_file);
	xfree(new_file);
	unlock_state_files();

	free_buf(buffer);
	END_TIMER2("dump_all_part_state");
	return error_code;
}
/*
 * Saves the state of all jobcomp data for further indexing retries
 *
 * Packs the number of entries in jobslist followed by each entry's
 * serialized_job string, writes the result to
 * "<StateSaveLocation>/<save_state_file>.new" and rotates the files so the
 * previous state file is kept with an ".old" suffix.
 *
 * RET SLURM_SUCCESS on success; SLURM_ERROR (or the non-zero value returned
 *     by fsync_and_close()) on failure
 */
static int _save_state(void)
{
	int fd, rc = SLURM_SUCCESS;
	char *state_file, *new_file, *old_file;
	ListIterator iter;
	/* Save high-water mark to avoid buffer growth with copies */
	static int high_buffer_size = (1024 * 1024);
	Buf buffer = init_buf(high_buffer_size);
	uint32_t job_cnt;
	struct job_node *jnode;

	/* header: record count, then one string per pending job */
	job_cnt = list_count(jobslist);
	pack32(job_cnt, buffer);
	iter = list_iterator_create(jobslist);
	while ((jnode = (struct job_node *)list_next(iter))) {
		packstr(jnode->serialized_job, buffer);
	}
	list_iterator_destroy(iter);

	state_file = slurm_get_state_save_location();
	if (state_file == NULL || state_file[0] == '\0') {
		error("%s: Could not retrieve StateSaveLocation from conf",
		      plugin_type);
		/* release everything acquired so far before bailing out */
		xfree(state_file);
		free_buf(buffer);
		return SLURM_ERROR;
	}

	/* state_file is non-empty here, so indexing its last byte is safe */
	if (state_file[strlen(state_file) - 1] != '/')
		xstrcat(state_file, "/");

	xstrcat(state_file, save_state_file);
	old_file = xstrdup(state_file);
	new_file = xstrdup(state_file);
	xstrcat(new_file, ".new");
	xstrcat(old_file, ".old");

	slurm_mutex_lock(&save_lock);
	fd = open(new_file, O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR);
	if (fd < 0) {
		error("%s: Can't save jobcomp state, open file %s error %m",
		      plugin_type, new_file);
		rc = SLURM_ERROR;
	} else {
		int pos = 0, nwrite, amount, rc2;
		char *data;
		fd_set_close_on_exec(fd);
		nwrite = get_buf_offset(buffer);
		data = (char *) get_buf_data(buffer);
		high_buffer_size = MAX(nwrite, high_buffer_size);
		while (nwrite > 0) {
			amount = write(fd, &data[pos], nwrite);
			if (amount < 0) {
				/*
				 * An interrupted write transferred nothing;
				 * retry without touching pos/nwrite.
				 */
				if (errno == EINTR)
					continue;
				error("%s: Error writing file %s, %m",
				      plugin_type, new_file);
				rc = SLURM_ERROR;
				break;
			}
			nwrite -= amount;
			pos += amount;
		}
		if ((rc2 = fsync_and_close(fd, save_state_file)))
			rc = rc2;
	}

	/* never promote a file that was not completely written and synced */
	if (rc != SLURM_SUCCESS)
		(void) unlink(new_file);
	else {
		(void) unlink(old_file);
		/*
		 * NOTE(review): this link fails when no previous state file
		 * exists (e.g. first save), which is reported as an error --
		 * confirm whether that is intended.
		 */
		if (link(state_file, old_file)) {
			error("%s: Unable to create link for %s -> %s: %m",
			      plugin_type, state_file, old_file);
			rc = SLURM_ERROR;
		}
		(void) unlink(state_file);
		if (link(new_file, state_file)) {
			error("%s: Unable to create link for %s -> %s: %m",
			      plugin_type, new_file, state_file);
			rc = SLURM_ERROR;
		}
		(void) unlink(new_file);
	}

	xfree(old_file);
	xfree(state_file);
	xfree(new_file);
	slurm_mutex_unlock(&save_lock);

	free_buf(buffer);

	return rc;
}
Exemple #3
0
/*
 * _heartbeat_thread - periodically write a heartbeat file
 *
 * While heart_beating is set, writes the current time (as a 64-bit value
 * converted with HTON_uint64) followed by backup_inx to
 * "<state_save_location>/heartbeat.new" and then links it into place as
 * "<state_save_location>/heartbeat". Between beats the thread waits on
 * heartbeat_cond with heartbeat_mutex held.
 *
 * IN no_data - unused
 * RET always NULL
 */
static void *_heartbeat_thread(void *no_data)
{
	/*
	 * The frequency needs to be faster than slurmctld_timeout,
	 * or the backup controller may try to assume control.
	 * One-fourth is very conservative, one-half should be sufficient.
	 * Have it happen at least every 30 seconds if the timeout is quite
	 * large.
	 */
	int beat = MIN(slurmctld_conf.slurmctld_timeout / 4, 30);
	time_t now;
	uint64_t nl;
	struct timespec ts = {0, 0};
	char *reg_file, *new_file;
	int fd;

	/*
	 * A timeout below 4 seconds would yield a zero interval, making the
	 * timed wait below expire immediately and the loop spin.
	 */
	if (beat < 1)
		beat = 1;

	debug("Heartbeat thread started, beating every %d seconds.", beat);

	slurm_mutex_lock(&heartbeat_mutex);
	while (heart_beating) {
		now = time(NULL);
		/* absolute deadline for the wait at the end of this beat */
		ts.tv_sec = now + beat;

		/* time_t is not guaranteed to be long; cast to match %ld */
		debug3("Heartbeat at %ld", (long) now);
		/*
		 * Rebuild file path each beat just in case someone changes
		 * StateSaveLocation and runs reconfigure.
		 */
		reg_file = xstrdup_printf("%s/heartbeat",
					  slurmctld_conf.state_save_location);
		new_file = xstrdup_printf("%s.new", reg_file);

		nl = HTON_uint64((uint64_t) now);

		fd = open(new_file, O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0600);
		if (fd < 0) {
			error("%s: heartbeat file creation failed to %s.",
			      __func__, new_file);
			goto delay;
		}

		/* record 1: timestamp */
		if (write(fd, &nl, sizeof(uint64_t)) != sizeof(uint64_t)) {
			error("%s: heartbeat write failed to %s.",
			      __func__, new_file);
			close(fd);
			(void) unlink(new_file);
			goto delay;
		}
		/* record 2: backup_inx, written as a raw int */
		if (write(fd, &backup_inx, sizeof(int)) != sizeof(int)) {
			error("%s: heartbeat write failed to %s.",
			      __func__, new_file);
			close(fd);
			(void) unlink(new_file);
			goto delay;
		}

		if (fsync_and_close(fd, "heartbeat")) {
			(void) unlink(new_file);
			goto delay;
		}

		/* shuffle files around */
		(void) unlink(reg_file);
		if (link(new_file, reg_file))
			debug("%s: unable to create link for %s -> %s, %m",
			      __func__, new_file, reg_file);
		(void) unlink(new_file);

delay:
		xfree(reg_file);
		xfree(new_file);
		/* sleep until the deadline or until heartbeat_cond is signaled */
		slurm_cond_timedwait(&heartbeat_cond, &heartbeat_mutex, &ts);
	}
	slurm_mutex_unlock(&heartbeat_mutex);

	return NULL;
}
Exemple #4
0
/*
 * dump_all_front_end_state - save the state of all front_end nodes to file
 *
 * When built with HAVE_FRONT_END, packs a header (FRONT_END_STATE_VERSION
 * string and the current time) followed by one record per entry of
 * front_end_nodes, writes the result to
 * "<state_save_location>/front_end_state.new" and rotates the files so the
 * previous "front_end_state" is kept as "front_end_state.old".
 *
 * RET 0 on success, otherwise the errno of the failed creat()/write() or the
 *     non-zero value returned by fsync_and_close(); SLURM_SUCCESS when
 *     HAVE_FRONT_END is not defined
 */
extern int dump_all_front_end_state(void)
{
#ifdef HAVE_FRONT_END
	/* Save high-water mark to avoid buffer growth with copies */
	static int high_buffer_size = (1024 * 1024);
	int error_code = 0, i, log_fd;
	char *old_file, *new_file, *reg_file;
	front_end_record_t *front_end_ptr;
	/* Locks: Read config and node */
	slurmctld_lock_t node_read_lock = { READ_LOCK, NO_LOCK, READ_LOCK,
					    NO_LOCK };
	Buf buffer = init_buf(high_buffer_size);
	DEF_TIMERS;

	START_TIMER;
	/* write header: version, time */
	packstr(FRONT_END_STATE_VERSION, buffer);
	pack_time(time(NULL), buffer);

	/* write node records to buffer */
	lock_slurmctld (node_read_lock);

	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		xassert(front_end_ptr->magic == FRONT_END_MAGIC);
		_dump_front_end_state(front_end_ptr, buffer);
	}

	/* build the file names while the configuration is still locked */
	old_file = xstrdup (slurmctld_conf.state_save_location);
	xstrcat (old_file, "/front_end_state.old");
	reg_file = xstrdup (slurmctld_conf.state_save_location);
	xstrcat (reg_file, "/front_end_state");
	new_file = xstrdup (slurmctld_conf.state_save_location);
	xstrcat (new_file, "/front_end_state.new");
	unlock_slurmctld (node_read_lock);

	/* write the buffer to file */
	lock_state_files();
	log_fd = creat (new_file, 0600);
	if (log_fd < 0) {
		/* capture errno before logging can overwrite it */
		error_code = errno;
		error ("Can't save state, error creating file %s %m", new_file);
	} else {
		int pos = 0, nwrite = get_buf_offset(buffer), amount, rc;
		char *data = (char *)get_buf_data(buffer);
		high_buffer_size = MAX(nwrite, high_buffer_size);
		while (nwrite > 0) {
			amount = write(log_fd, &data[pos], nwrite);
			if (amount < 0) {
				/*
				 * An interrupted write transferred nothing;
				 * retry without touching pos/nwrite.
				 */
				if (errno == EINTR)
					continue;
				error_code = errno;
				error("Error writing file %s, %m", new_file);
				break;
			}
			nwrite -= amount;
			pos    += amount;
		}

		/* always close the descriptor; keep the first error seen */
		rc = fsync_and_close(log_fd, "front_end");
		if (rc && !error_code)
			error_code = rc;
	}
	if (error_code)
		(void) unlink (new_file);
	else {	/* file shuffle */
		(void) unlink (old_file);
		if (link(reg_file, old_file))
			debug4("unable to create link for %s -> %s: %m",
			       reg_file, old_file);
		(void) unlink (reg_file);
		if (link(new_file, reg_file))
			debug4("unable to create link for %s -> %s: %m",
			       new_file, reg_file);
		(void) unlink (new_file);
	}
	xfree (old_file);
	xfree (reg_file);
	xfree (new_file);
	unlock_state_files ();

	free_buf (buffer);
	END_TIMER2("dump_all_front_end_state");
	return error_code;
#else
	return SLURM_SUCCESS;
#endif
}
Exemple #5
0
/*
 * _dump_sicp_state - save the job ID and state of every entry in
 *	sicp_job_list to file
 *
 * Packs a header (the literal string "PROTOCOL_VERSION",
 * SLURM_PROTOCOL_VERSION and the current time) followed by job_id/job_state
 * pairs, writes the result to "<state_save_location>/sicp_state.new" and
 * rotates the files so the previous "sicp_state" is kept as
 * "sicp_state.old". Failures are logged; nothing is returned.
 */
static void _dump_sicp_state(void)
{
    char *old_file, *new_file, *reg_file;
    ListIterator sicp_iterator;
    sicp_job_t *sicp_ptr;
    Buf buffer;
    time_t now = time(NULL);
    int error_code = SLURM_SUCCESS, len, log_fd;

    /* snapshot the list contents into the buffer under sicp_lock */
    pthread_mutex_lock(&sicp_lock);
    len = list_count(sicp_job_list) * 4 + 128;
    buffer = init_buf(len);

    packstr("PROTOCOL_VERSION", buffer);
    pack16(SLURM_PROTOCOL_VERSION, buffer);
    pack_time(now, buffer);

    sicp_iterator = list_iterator_create(sicp_job_list);
    while ((sicp_ptr = (sicp_job_t *) list_next(sicp_iterator))) {
        pack32(sicp_ptr->job_id, buffer);
        pack16(sicp_ptr->job_state, buffer);
    }
    list_iterator_destroy(sicp_iterator);
    pthread_mutex_unlock(&sicp_lock);

    old_file = xstrdup(slurmctld_conf.state_save_location);
    xstrcat(old_file, "/sicp_state.old");
    reg_file = xstrdup(slurmctld_conf.state_save_location);
    xstrcat(reg_file, "/sicp_state");
    new_file = xstrdup(slurmctld_conf.state_save_location);
    xstrcat(new_file, "/sicp_state.new");

    /* write the buffer to file */
    lock_state_files();
    log_fd = creat(new_file, 0600);
    if (log_fd < 0) {
        /* capture errno before logging can overwrite it */
        error_code = errno;
        error("Can't save state, create file %s error %m",
              new_file);
    } else {
        int pos = 0, nwrite, amount, rc;
        char *data;

        fd_set_close_on_exec(log_fd);
        nwrite = get_buf_offset(buffer);
        data = (char *)get_buf_data(buffer);
        while (nwrite > 0) {
            amount = write(log_fd, &data[pos], nwrite);
            if (amount < 0) {
                /*
                 * An interrupted write transferred nothing;
                 * retry without touching pos/nwrite.
                 */
                if (errno == EINTR)
                    continue;
                error_code = errno;
                error("Error writing file %s, %m", new_file);
                break;
            }
            nwrite -= amount;
            pos    += amount;
        }

        /* always close the descriptor; keep the first error seen */
        rc = fsync_and_close(log_fd, "sicp");
        if (rc && !error_code)
            error_code = rc;
    }
    if (error_code) {
        (void) unlink(new_file);
    } else {			/* file shuffle */
        (void) unlink(old_file);
        if (link(reg_file, old_file))
            debug4("unable to create link for %s -> %s: %m",
                   reg_file, old_file);
        (void) unlink(reg_file);
        if (link(new_file, reg_file))
            debug4("unable to create link for %s -> %s: %m",
                   new_file, reg_file);
        (void) unlink(new_file);
    }
    xfree(old_file);
    xfree(reg_file);
    xfree(new_file);
    unlock_state_files();

    free_buf(buffer);
}