Esempio n. 1
0
static void _save_dbd_state(void)
{
	char *dbd_fname;
	Buf buffer;
	int fd, rc, wrote = 0;
	uint16_t msg_type;
	uint32_t offset;

	dbd_fname = slurm_get_state_save_location();
	xstrcat(dbd_fname, "/dbd.messages");
	(void) unlink(dbd_fname);	/* clear save state */
	fd = open(dbd_fname, O_WRONLY | O_CREAT | O_TRUNC, 0600);
	if (fd < 0) {
		error("slurmdbd: Creating state save file %s", dbd_fname);
	} else if (agent_list && list_count(agent_list)) {
		char curr_ver_str[10];
		snprintf(curr_ver_str, sizeof(curr_ver_str),
			 "VER%d", SLURM_PROTOCOL_VERSION);
		buffer = init_buf(strlen(curr_ver_str));
		packstr(curr_ver_str, buffer);
		rc = _save_dbd_rec(fd, buffer);
		free_buf(buffer);
		if (rc != SLURM_SUCCESS)
			goto end_it;

		while ((buffer = list_dequeue(agent_list))) {
			/*
			 * We do not want to store registration messages. If an
			 * admin puts in an incorrect cluster name we can get a
			 * deadlock unless they add the bogus cluster name to
			 * the accounting system.
			 */
			offset = get_buf_offset(buffer);
			if (offset < 2) {
				free_buf(buffer);
				continue;
			}
			set_buf_offset(buffer, 0);
			(void) unpack16(&msg_type, buffer);  /* checked by offset */
			set_buf_offset(buffer, offset);
			if (msg_type == DBD_REGISTER_CTLD) {
				free_buf(buffer);
				continue;
			}

			rc = _save_dbd_rec(fd, buffer);
			free_buf(buffer);
			if (rc != SLURM_SUCCESS)
				break;
			wrote++;
		}
	}

end_it:
	if (fd >= 0) {
		verbose("slurmdbd: saved %d pending RPCs", wrote);
		(void) close(fd);
	}
	xfree(dbd_fname);
}
Esempio n. 2
0
/* Load jobcomp data from save state file */
static int _load_pending_jobs(void)
{
	int i, rc = SLURM_SUCCESS;
	char *saved_data = NULL, *state_file = NULL, *job_data = NULL;
	uint32_t data_size, job_cnt = 0, tmp32 = 0;
	Buf buffer;
	struct job_node *jnode;

	state_file = slurm_get_state_save_location();
	if (state_file == NULL) {
		error("%s: Could not retrieve StateSaveLocation from conf",
		      plugin_type);
		return SLURM_ERROR;
	}

	if (state_file[strlen(state_file) - 1] != '/')
		xstrcat(state_file, "/");
	xstrcat(state_file, save_state_file);

	slurm_mutex_lock(&save_lock);
	data_size = _read_file(state_file, &saved_data);
	if ((data_size <= 0) || (saved_data == NULL)) {
		slurm_mutex_unlock(&save_lock);
		xfree(saved_data);
		xfree(state_file);
		return rc;
	}
	slurm_mutex_unlock(&save_lock);

	buffer = create_buf(saved_data, data_size);
	safe_unpack32(&job_cnt, buffer);
	for (i = 0; i < job_cnt; i++) {
		safe_unpackstr_xmalloc(&job_data, &tmp32, buffer);
		jnode = xmalloc(sizeof(struct job_node));
		jnode->serialized_job = job_data;
		list_enqueue(jobslist, jnode);
	}
	if (job_cnt > 0) {
		if (slurm_get_debug_flags() & DEBUG_FLAG_ESEARCH)
			info("%s: Loaded %u jobs from state file", plugin_type,
			     job_cnt);
	}
	free_buf(buffer);
	xfree(state_file);

	return rc;

unpack_error:
	error("%s: Error unpacking file %s", plugin_type, state_file);
	free_buf(buffer);
	xfree(state_file);
	return SLURM_ERROR;
}
Esempio n. 3
0
/* Saves the state of all jobcomp data for further indexing retries */
static int _save_state(void)
{
	int fd, rc = SLURM_SUCCESS;
	char *state_file, *new_file, *old_file;
	ListIterator iter;
	static int high_buffer_size = (1024 * 1024);
	Buf buffer = init_buf(high_buffer_size);
	uint32_t job_cnt;
	struct job_node *jnode;

	job_cnt = list_count(jobslist);
	pack32(job_cnt, buffer);
	iter = list_iterator_create(jobslist);
	while ((jnode = (struct job_node *)list_next(iter))) {
		packstr(jnode->serialized_job, buffer);
	}
	list_iterator_destroy(iter);

	state_file = slurm_get_state_save_location();
	if (state_file == NULL || state_file[0] == '\0') {
		error("%s: Could not retrieve StateSaveLocation from conf",
		      plugin_type);
		return SLURM_ERROR;
	}

	if (state_file[strlen(state_file) - 1] != '/')
		xstrcat(state_file, "/");

	xstrcat(state_file, save_state_file);
	old_file = xstrdup(state_file);
	new_file = xstrdup(state_file);
	xstrcat(new_file, ".new");
	xstrcat(old_file, ".old");

	slurm_mutex_lock(&save_lock);
	fd = open(new_file, O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR);
	if (fd < 0) {
		error("%s: Can't save jobcomp state, open file %s error %m",
		      plugin_type, new_file);
		rc = SLURM_ERROR;
	} else {
		int pos = 0, nwrite, amount, rc2;
		char *data;
		fd_set_close_on_exec(fd);
		nwrite = get_buf_offset(buffer);
		data = (char *) get_buf_data(buffer);
		high_buffer_size = MAX(nwrite, high_buffer_size);
		while (nwrite > 0) {
			amount = write(fd, &data[pos], nwrite);
			if ((amount < 0) && (errno != EINTR)) {
				error("%s: Error writing file %s, %m",
				      plugin_type, new_file);
				rc = SLURM_ERROR;
				break;
			}
			nwrite -= amount;
			pos += amount;
		}
		if ((rc2 = fsync_and_close(fd, save_state_file)))
			rc = rc2;
	}

	if (rc == SLURM_ERROR)
		(void) unlink(new_file);
	else {
		(void) unlink(old_file);
		if (link(state_file, old_file)) {
			error("%s: Unable to create link for %s -> %s: %m",
			      plugin_type, state_file, old_file);
			rc = SLURM_ERROR;
		}
		(void) unlink(state_file);
		if (link(new_file, state_file)) {
			error("%s: Unable to create link for %s -> %s: %m",
			      plugin_type, new_file, state_file);
			rc = SLURM_ERROR;
		}
		(void) unlink(new_file);
	}

	xfree(old_file);
	xfree(state_file);
	xfree(new_file);
	slurm_mutex_unlock(&save_lock);

	free_buf(buffer);

	return rc;
}
Esempio n. 4
0
static void _load_dbd_state(void)
{
	char *dbd_fname;
	Buf buffer;
	int fd, recovered = 0;
	uint16_t rpc_version = 0;

	dbd_fname = slurm_get_state_save_location();
	xstrcat(dbd_fname, "/dbd.messages");
	fd = open(dbd_fname, O_RDONLY);
	if (fd < 0) {
		/* don't print an error message if there is no file */
		if (errno == ENOENT)
			debug4("slurmdbd: There is no state save file to "
			       "open by name %s", dbd_fname);
		else
			error("slurmdbd: Opening state save file %s: %m",
			      dbd_fname);
	} else {
		char *ver_str = NULL;
		uint32_t ver_str_len;

		buffer = _load_dbd_rec(fd);
		if (buffer == NULL)
			goto end_it;
		/* This is set to the end of the buffer for send so we
		   need to set it back to 0 */
		set_buf_offset(buffer, 0);
		safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
		debug3("Version string in dbd_state header is %s", ver_str);
	unpack_error:
		free_buf(buffer);
		buffer = NULL;
		if (ver_str) {
			/* get the version after VER */
			rpc_version = slurm_atoul(ver_str + 3);
			xfree(ver_str);
		}

		while (1) {
			/* If the buffer was not the VER%d string it
			   was an actual message so we don't want to
			   skip it.
			*/
			if (!buffer)
				buffer = _load_dbd_rec(fd);
			if (buffer == NULL)
				break;
			if (rpc_version != SLURM_PROTOCOL_VERSION) {
				/* unpack and repack with new
				 * PROTOCOL_VERSION just so we keep
				 * things up to date.
				 */
				slurmdbd_msg_t msg;
				int rc;
				set_buf_offset(buffer, 0);
				rc = unpack_slurmdbd_msg(
					&msg, rpc_version, buffer);
				free_buf(buffer);
				if (rc == SLURM_SUCCESS)
					buffer = pack_slurmdbd_msg(
						&msg, SLURM_PROTOCOL_VERSION);
				else
					buffer = NULL;
			}
			if (!buffer) {
				error("no buffer given");
				continue;
			}
			if (!list_enqueue(agent_list, buffer))
				fatal("slurmdbd: list_enqueue, no memory");
			recovered++;
			buffer = NULL;
		}

	end_it:
		verbose("slurmdbd: recovered %d pending RPCs", recovered);
		(void) close(fd);
	}
	xfree(dbd_fname);
}