Example #1
0
/*
 * load_all_part_state - load the partition state from file, recover on
 *	slurmctld restart. execute this after loading the configuration
 *	file data.
 * NOTE: READ lock_slurmctld config before entry
 */
int load_all_part_state(void)
{
	char *part_name = NULL, *allow_groups = NULL, *nodes = NULL;
	char *state_file, *data = NULL;
	uint32_t max_time, default_time, max_nodes, min_nodes;
	uint32_t max_cpus_per_node = INFINITE, grace_time = 0;
	time_t time;
	uint16_t flags;
	uint16_t max_share, preempt_mode, priority, state_up, cr_type;
	struct part_record *part_ptr;
	uint32_t data_size = 0, name_len;
	int data_allocated, data_read = 0, error_code = 0, part_cnt = 0;
	int state_fd;
	Buf buffer;
	char *ver_str = NULL;
	char* allow_alloc_nodes = NULL;
	uint16_t protocol_version = (uint16_t)NO_VAL;
	char* alternate = NULL;

	/* read the file */
	lock_state_files();
	state_fd = _open_part_state_file(&state_file);
	if (state_fd < 0) {
		info("No partition state file (%s) to recover",
		     state_file);
		error_code = ENOENT;
	} else {
		data_allocated = BUF_SIZE;
		data = xmalloc(data_allocated);
		while (1) {
			data_read = read(state_fd, &data[data_size],
					 BUF_SIZE);
			if (data_read < 0) {
				if  (errno == EINTR)
					continue;
				else {
					error("Read error on %s: %m",
						state_file);
					break;
				}
			} else if (data_read == 0)     /* eof */
				break;
			data_size      += data_read;
			data_allocated += data_read;
			xrealloc(data, data_allocated);
		}
		close(state_fd);
	}
	xfree(state_file);
	unlock_state_files();

	buffer = create_buf(data, data_size);

	safe_unpackstr_xmalloc( &ver_str, &name_len, buffer);
	debug3("Version string in part_state header is %s", ver_str);
	if (ver_str) {
		if (!strcmp(ver_str, PART_STATE_VERSION)) {
			protocol_version = SLURM_PROTOCOL_VERSION;
		} else if (!strcmp(ver_str, PART_2_5_STATE_VERSION)) {
			protocol_version = SLURM_2_5_PROTOCOL_VERSION;
		}
	}

	if (protocol_version == (uint16_t)NO_VAL) {
		error("**********************************************************");
		error("Can not recover partition state, data version incompatible");
		error("**********************************************************");
		xfree(ver_str);
		free_buf(buffer);
		return EFAULT;
	}
	xfree(ver_str);
	safe_unpack_time(&time, buffer);

	while (remaining_buf(buffer) > 0) {
		if (protocol_version >= SLURM_2_6_PROTOCOL_VERSION) {
			safe_unpackstr_xmalloc(&part_name, &name_len, buffer);
			safe_unpack32(&grace_time, buffer);
			safe_unpack32(&max_time, buffer);
			safe_unpack32(&default_time, buffer);
			safe_unpack32(&max_cpus_per_node, buffer);
			safe_unpack32(&max_nodes, buffer);
			safe_unpack32(&min_nodes, buffer);

			safe_unpack16(&flags,        buffer);
			safe_unpack16(&max_share,    buffer);
			safe_unpack16(&preempt_mode, buffer);
			safe_unpack16(&priority,     buffer);

			if (priority > part_max_priority)
				part_max_priority = priority;

			safe_unpack16(&state_up, buffer);
			safe_unpack16(&cr_type, buffer);

			safe_unpackstr_xmalloc(&allow_groups,
					       &name_len, buffer);
			safe_unpackstr_xmalloc(&allow_alloc_nodes,
					       &name_len, buffer);
			safe_unpackstr_xmalloc(&alternate, &name_len, buffer);
			safe_unpackstr_xmalloc(&nodes, &name_len, buffer);
			if ((flags & PART_FLAG_DEFAULT_CLR) ||
			    (flags & PART_FLAG_HIDDEN_CLR)  ||
			    (flags & PART_FLAG_NO_ROOT_CLR) ||
			    (flags & PART_FLAG_ROOT_ONLY_CLR) ||
			    (flags & PART_FLAG_REQ_RESV_CLR)) {
				error("Invalid data for partition %s: flags=%u",
				      part_name, flags);
				error_code = EINVAL;
			}
		} else if (protocol_version >= SLURM_2_4_PROTOCOL_VERSION) {
			safe_unpackstr_xmalloc(&part_name, &name_len, buffer);
			safe_unpack32(&grace_time, buffer);
			safe_unpack32(&max_time, buffer);
			safe_unpack32(&default_time, buffer);
			safe_unpack32(&max_nodes, buffer);
			safe_unpack32(&min_nodes, buffer);

			safe_unpack16(&flags,        buffer);
			safe_unpack16(&max_share,    buffer);
			safe_unpack16(&preempt_mode, buffer);
			safe_unpack16(&priority,     buffer);

			if (priority > part_max_priority)
				part_max_priority = priority;
			cr_type = 0;	/* Default value */

			safe_unpack16(&state_up, buffer);
			safe_unpackstr_xmalloc(&allow_groups,
					       &name_len, buffer);
			safe_unpackstr_xmalloc(&allow_alloc_nodes,
					       &name_len, buffer);
			safe_unpackstr_xmalloc(&alternate, &name_len, buffer);
			safe_unpackstr_xmalloc(&nodes, &name_len, buffer);
			if ((flags & PART_FLAG_DEFAULT_CLR) ||
			    (flags & PART_FLAG_HIDDEN_CLR)  ||
			    (flags & PART_FLAG_NO_ROOT_CLR) ||
			    (flags & PART_FLAG_ROOT_ONLY_CLR) ||
			    (flags & PART_FLAG_REQ_RESV_CLR)) {
				error("Invalid data for partition %s: flags=%u",
				      part_name, flags);
				error_code = EINVAL;
			}
		} else {
			error("load_all_part_state: protocol_version "
			      "%hu not supported", protocol_version);
			goto unpack_error;
		}
		/* validity test as possible */
		if (state_up > PARTITION_UP) {
			error("Invalid data for partition %s: state_up=%u",
			      part_name, state_up);
			error_code = EINVAL;
		}
		if (error_code) {
			error("No more partition data will be processed from "
			      "the checkpoint file");
			xfree(allow_groups);
			xfree(allow_alloc_nodes);
			xfree(alternate);
			xfree(part_name);
			xfree(nodes);
			error_code = EINVAL;
			break;
		}

		/* find record and perform update */
		part_ptr = list_find_first(part_list, &list_find_part,
					   part_name);
		part_cnt++;
		if (part_ptr == NULL) {
			info("load_all_part_state: partition %s missing from "
				"configuration file", part_name);
			part_ptr = create_part_record();
			xfree(part_ptr->name);
			part_ptr->name = xstrdup(part_name);
		}

		part_ptr->flags          = flags;
		if (part_ptr->flags & PART_FLAG_DEFAULT) {
			xfree(default_part_name);
			default_part_name = xstrdup(part_name);
			default_part_loc = part_ptr;
		}
		part_ptr->max_time       = max_time;
		part_ptr->default_time   = default_time;
		part_ptr->max_cpus_per_node = max_cpus_per_node;
		part_ptr->max_nodes      = max_nodes;
		part_ptr->max_nodes_orig = max_nodes;
		part_ptr->min_nodes      = min_nodes;
		part_ptr->min_nodes_orig = min_nodes;
		part_ptr->max_share      = max_share;
		part_ptr->grace_time     = grace_time;
		if (preempt_mode != (uint16_t) NO_VAL)
			part_ptr->preempt_mode   = preempt_mode;
		part_ptr->priority       = priority;
		part_ptr->state_up       = state_up;
		part_ptr->cr_type	 = cr_type;
		xfree(part_ptr->allow_groups);
		part_ptr->allow_groups   = allow_groups;
		xfree(part_ptr->allow_alloc_nodes);
		part_ptr->allow_alloc_nodes   = allow_alloc_nodes;
		xfree(part_ptr->alternate);
		part_ptr->alternate      = alternate;
		xfree(part_ptr->nodes);
		part_ptr->nodes = nodes;

		xfree(part_name);
	}

	info("Recovered state of %d partitions", part_cnt);
	free_buf(buffer);
	return error_code;

      unpack_error:
	error("Incomplete partition data checkpoint file");
	info("Recovered state of %d partitions", part_cnt);
	free_buf(buffer);
	return EFAULT;
}
Example #2
0
/*
 * load_all_front_end_state - Load the front_end node state from file, recover
 *	on slurmctld restart. Execute this after loading the configuration
 *	file data. Data goes into common storage.
 * IN state_only - if true, overwrite only front_end node state and reason
 *	Use this to overwrite the "UNKNOWN state typically used in slurm.conf
 * RET 0 or error code
 * NOTE: READ lock_slurmctld config before entry
 */
extern int load_all_front_end_state(bool state_only)
{
#ifdef HAVE_FRONT_END
	char *node_name = NULL, *reason = NULL, *data = NULL, *state_file;
	int data_allocated, data_read = 0, error_code = 0, node_cnt = 0;
	uint16_t node_state;
	uint32_t data_size = 0, name_len;
	uint32_t reason_uid = NO_VAL;
	time_t reason_time = 0;
	front_end_record_t *front_end_ptr;
	int state_fd;
	time_t time_stamp;
	Buf buffer;
	char *ver_str = NULL;
	uint16_t protocol_version = (uint16_t) NO_VAL;

	/* read the file */
	lock_state_files ();
	state_fd = _open_front_end_state_file(&state_file);
	if (state_fd < 0) {
		info ("No node state file (%s) to recover", state_file);
		error_code = ENOENT;
	} else {
		data_allocated = BUF_SIZE;
		data = xmalloc(data_allocated);
		while (1) {
			data_read = read(state_fd, &data[data_size], BUF_SIZE);
			if (data_read < 0) {
				if (errno == EINTR)
					continue;
				else {
					error ("Read error on %s: %m",
						state_file);
					break;
				}
			} else if (data_read == 0)     /* eof */
				break;
			data_size      += data_read;
			data_allocated += data_read;
			xrealloc(data, data_allocated);
		}
		close (state_fd);
	}
	xfree (state_file);
	unlock_state_files ();

	buffer = create_buf (data, data_size);

	safe_unpackstr_xmalloc( &ver_str, &name_len, buffer);
	debug3("Version string in front_end_state header is %s", ver_str);
	if (ver_str) {
		if (!strcmp(ver_str, FRONT_END_STATE_VERSION)) {
			protocol_version = SLURM_PROTOCOL_VERSION;
		}
	}

	if (protocol_version == (uint16_t) NO_VAL) {
		error("*****************************************************");
		error("Can not recover front_end state, version incompatible");
		error("*****************************************************");
		xfree(ver_str);
		free_buf(buffer);
		return EFAULT;
	}
	xfree(ver_str);

	safe_unpack_time(&time_stamp, buffer);

	while (remaining_buf (buffer) > 0) {
		uint16_t base_state;
		if (protocol_version >= SLURM_2_5_PROTOCOL_VERSION) {
			safe_unpackstr_xmalloc (&node_name, &name_len, buffer);
			safe_unpack16 (&node_state,  buffer);
			safe_unpackstr_xmalloc (&reason,    &name_len, buffer);
			safe_unpack_time (&reason_time, buffer);
			safe_unpack32 (&reason_uid,  buffer);
			base_state = node_state & NODE_STATE_BASE;
		} else
			goto unpack_error;

		/* validity test as possible */

		/* find record and perform update */
		front_end_ptr = find_front_end_record(node_name);
		if (front_end_ptr == NULL) {
			error("Front_end node %s has vanished from "
			      "configuration", node_name);
		} else if (state_only) {
			uint16_t orig_flags;
			orig_flags = front_end_ptr->node_state &
				     NODE_STATE_FLAGS;
			node_cnt++;
			if (IS_NODE_UNKNOWN(front_end_ptr)) {
				if (base_state == NODE_STATE_DOWN) {
					orig_flags &= (~NODE_STATE_COMPLETING);
					front_end_ptr->node_state =
						NODE_STATE_DOWN | orig_flags;
				}
				if (node_state & NODE_STATE_DRAIN) {
					 front_end_ptr->node_state |=
						 NODE_STATE_DRAIN;
				}
				if (node_state & NODE_STATE_FAIL) {
					front_end_ptr->node_state |=
						NODE_STATE_FAIL;
				}
			}
			if (front_end_ptr->reason == NULL) {
				front_end_ptr->reason = reason;
				reason = NULL;	/* Nothing to free */
				front_end_ptr->reason_time = reason_time;
				front_end_ptr->reason_uid = reason_uid;
			}
		} else {
			node_cnt++;
			front_end_ptr->node_state = node_state;
			xfree(front_end_ptr->reason);
			front_end_ptr->reason	= reason;
			reason			= NULL;	/* Nothing to free */
			front_end_ptr->reason_time	= reason_time;
			front_end_ptr->reason_uid	= reason_uid;
			front_end_ptr->last_response	= (time_t) 0;
		}

		xfree(node_name);
		xfree(reason);
	}

fini:	info("Recovered state of %d front_end nodes", node_cnt);
	free_buf (buffer);
	return error_code;

unpack_error:
	error("Incomplete front_end node data checkpoint file");
	error_code = EFAULT;
	xfree (node_name);
	xfree(reason);
	goto fini;
#else
	return 0;
#endif
}
Example #3
0
/* dump_all_part_state - save the state of all partitions to file */
int dump_all_part_state(void)
{
	/* Save high-water mark to avoid buffer growth with copies */
	static int high_buffer_size = BUF_SIZE;
	ListIterator part_iterator;
	struct part_record *part_ptr;
	int error_code = 0, log_fd;
	char *old_file, *new_file, *reg_file;
	/* Locks: Read partition */
	slurmctld_lock_t part_read_lock =
	    { READ_LOCK, NO_LOCK, NO_LOCK, READ_LOCK };
	Buf buffer = init_buf(high_buffer_size);
	DEF_TIMERS;

	START_TIMER;
	/* write header: time */
	packstr(PART_STATE_VERSION, buffer);
	pack_time(time(NULL), buffer);

	/* write partition records to buffer */
	lock_slurmctld(part_read_lock);
	part_iterator = list_iterator_create(part_list);
	while ((part_ptr = (struct part_record *) list_next(part_iterator))) {
		xassert (part_ptr->magic == PART_MAGIC);
		_dump_part_state(part_ptr, buffer);
	}
	list_iterator_destroy(part_iterator);

	old_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(old_file, "/part_state.old");
	reg_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(reg_file, "/part_state");
	new_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(new_file, "/part_state.new");
	unlock_slurmctld(part_read_lock);

	/* write the buffer to file */
	lock_state_files();
	log_fd = creat(new_file, 0600);
	if (log_fd < 0) {
		error("Can't save state, error creating file %s, %m",
		      new_file);
		error_code = errno;
	} else {
		int pos = 0, nwrite = get_buf_offset(buffer), amount, rc;
		char *data = (char *)get_buf_data(buffer);
		high_buffer_size = MAX(nwrite, high_buffer_size);
		while (nwrite > 0) {
			amount = write(log_fd, &data[pos], nwrite);
			if ((amount < 0) && (errno != EINTR)) {
				error("Error writing file %s, %m", new_file);
				error_code = errno;
				break;
			}
			nwrite -= amount;
			pos    += amount;
		}

		rc = fsync_and_close(log_fd, "partition");
		if (rc && !error_code)
			error_code = rc;
	}
	if (error_code)
		(void) unlink(new_file);
	else {			/* file shuffle */
		(void) unlink(old_file);
		if (link(reg_file, old_file)) {
			debug4("unable to create link for %s -> %s: %m",
			       reg_file, old_file);
		}
		(void) unlink(reg_file);
		if (link(new_file, reg_file)) {
			debug4("unable to create link for %s -> %s: %m",
			       new_file, reg_file);
		}
		(void) unlink(new_file);
	}
	xfree(old_file);
	xfree(reg_file);
	xfree(new_file);
	unlock_state_files();

	free_buf(buffer);
	END_TIMER2("dump_all_part_state");
	return 0;
}
Example #4
0
/* dump_all_front_end_state - save the state of all front_end nodes to file */
extern int dump_all_front_end_state(void)
{
#ifdef HAVE_FRONT_END
	/* Save high-water mark to avoid buffer growth with copies */
	static int high_buffer_size = (1024 * 1024);
	int error_code = 0, i, log_fd;
	char *old_file, *new_file, *reg_file;
	front_end_record_t *front_end_ptr;
	/* Locks: Read config and node */
	slurmctld_lock_t node_read_lock = { READ_LOCK, NO_LOCK, READ_LOCK,
					    NO_LOCK };
	Buf buffer = init_buf(high_buffer_size);
	DEF_TIMERS;

	START_TIMER;
	/* write header: version, time */
	packstr(FRONT_END_STATE_VERSION, buffer);
	pack_time(time(NULL), buffer);

	/* write node records to buffer */
	lock_slurmctld (node_read_lock);

	for (i = 0, front_end_ptr = front_end_nodes;
	     i < front_end_node_cnt; i++, front_end_ptr++) {
		xassert(front_end_ptr->magic == FRONT_END_MAGIC);
		_dump_front_end_state(front_end_ptr, buffer);
	}

	old_file = xstrdup (slurmctld_conf.state_save_location);
	xstrcat (old_file, "/front_end_state.old");
	reg_file = xstrdup (slurmctld_conf.state_save_location);
	xstrcat (reg_file, "/front_end_state");
	new_file = xstrdup (slurmctld_conf.state_save_location);
	xstrcat (new_file, "/front_end_state.new");
	unlock_slurmctld (node_read_lock);

	/* write the buffer to file */
	lock_state_files();
	log_fd = creat (new_file, 0600);
	if (log_fd < 0) {
		error ("Can't save state, error creating file %s %m", new_file);
		error_code = errno;
	} else {
		int pos = 0, nwrite = get_buf_offset(buffer), amount, rc;
		char *data = (char *)get_buf_data(buffer);
		high_buffer_size = MAX(nwrite, high_buffer_size);
		while (nwrite > 0) {
			amount = write(log_fd, &data[pos], nwrite);
			if ((amount < 0) && (errno != EINTR)) {
				error("Error writing file %s, %m", new_file);
				error_code = errno;
				break;
			}
			nwrite -= amount;
			pos    += amount;
		}

		rc = fsync_and_close(log_fd, "front_end");
		if (rc && !error_code)
			error_code = rc;
	}
	if (error_code)
		(void) unlink (new_file);
	else {	/* file shuffle */
		(void) unlink (old_file);
		if (link(reg_file, old_file))
			debug4("unable to create link for %s -> %s: %m",
			       reg_file, old_file);
		(void) unlink (reg_file);
		if (link(new_file, reg_file))
			debug4("unable to create link for %s -> %s: %m",
			       new_file, reg_file);
		(void) unlink (new_file);
	}
	xfree (old_file);
	xfree (reg_file);
	xfree (new_file);
	unlock_state_files ();

	free_buf (buffer);
	END_TIMER2("dump_all_front_end_state");
	return error_code;
#else
	return SLURM_SUCCESS;
#endif
}
Example #5
0
static int _write_last_decay_ran(time_t last_ran, time_t last_reset)
{
	/* Save high-water mark to avoid buffer growth with copies */
	static int high_buffer_size = BUF_SIZE;
	int error_code = SLURM_SUCCESS;
	int state_fd;
	char *old_file, *new_file, *state_file;
	Buf buffer;

	if (!strcmp(slurmctld_conf.state_save_location, "/dev/null")) {
		error("Can not save priority state information, "
		      "StateSaveLocation is /dev/null");
		return error_code;
	}

	buffer = init_buf(high_buffer_size);
	pack_time(last_ran, buffer);
	pack_time(last_reset, buffer);

	/* read the file */
	old_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(old_file, "/priority_last_decay_ran.old");
	state_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(state_file, "/priority_last_decay_ran");
	new_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(new_file, "/priority_last_decay_ran.new");

	lock_state_files();
	state_fd = creat(new_file, 0600);
	if (state_fd < 0) {
		error("Can't save decay state, create file %s error %m",
		      new_file);
		error_code = errno;
	} else {
		int pos = 0, nwrite = get_buf_offset(buffer), amount;
		char *data = (char *)get_buf_data(buffer);
		high_buffer_size = MAX(nwrite, high_buffer_size);
		while (nwrite > 0) {
			amount = write(state_fd, &data[pos], nwrite);
			if ((amount < 0) && (errno != EINTR)) {
				error("Error writing file %s, %m", new_file);
				error_code = errno;
				break;
			}
			nwrite -= amount;
			pos    += amount;
		}
		fsync(state_fd);
		close(state_fd);
	}

	if (error_code != SLURM_SUCCESS)
		(void) unlink(new_file);
	else {			/* file shuffle */
		(void) unlink(old_file);
		if (link(state_file, old_file))
			debug3("unable to create link for %s -> %s: %m",
			       state_file, old_file);
		(void) unlink(state_file);
		if (link(new_file, state_file))
			debug3("unable to create link for %s -> %s: %m",
			       new_file, state_file);
		(void) unlink(new_file);
	}
	xfree(old_file);
	xfree(state_file);
	xfree(new_file);

	unlock_state_files();
	debug4("done writing time %ld", (long)last_ran);
	free_buf(buffer);

	return error_code;
}
Example #6
0
static void _read_last_decay_ran(time_t *last_ran, time_t *last_reset)
{
	int data_allocated, data_read = 0;
	uint32_t data_size = 0;
	int state_fd;
	char *data = NULL, *state_file;
	Buf buffer;

	xassert(last_ran);
	xassert(last_reset);

	(*last_ran) = 0;
	(*last_reset) = 0;

	/* read the file */
	state_file = xstrdup(slurmctld_conf.state_save_location);
	xstrcat(state_file, "/priority_last_decay_ran");
	lock_state_files();
	state_fd = open(state_file, O_RDONLY);
	if (state_fd < 0) {
		info("No last decay (%s) to recover", state_file);
		unlock_state_files();
		return;
	} else {
		data_allocated = BUF_SIZE;
		data = xmalloc(data_allocated);
		while (1) {
			data_read = read(state_fd, &data[data_size],
					 BUF_SIZE);
			if (data_read < 0) {
				if (errno == EINTR)
					continue;
				else {
					error("Read error on %s: %m",
					      state_file);
					break;
				}
			} else if (data_read == 0)	/* eof */
				break;
			data_size      += data_read;
			data_allocated += data_read;
			xrealloc(data, data_allocated);
		}
		close(state_fd);
	}
	xfree(state_file);
	unlock_state_files();

	buffer = create_buf(data, data_size);
	safe_unpack_time(last_ran, buffer);
	safe_unpack_time(last_reset, buffer);
	free_buf(buffer);
	if (priority_debug)
		info("Last ran decay on jobs at %ld", (long)*last_ran);

	return;

unpack_error:
	error("Incomplete priority last decay file returning");
	free_buf(buffer);
	return;

}
Example #7
0
File: sicp.c Project: rohgarg/slurm
static void _load_sicp_state(void)
{
    int data_allocated, data_read = 0;
    uint32_t data_size = 0;
    int state_fd, sicp_cnt = 0;
    char *data = NULL, *state_file;
    struct stat stat_buf;
    Buf buffer;
    char *ver_str = NULL;
    uint32_t ver_str_len;
    uint16_t protocol_version = (uint16_t)NO_VAL;
    uint32_t job_id = 0;
    uint32_t job_state = 0;
    sicp_job_t *sicp_ptr;
    time_t buf_time, now;

    /* read the file */
    lock_state_files();
    state_file = xstrdup(slurmctld_conf.state_save_location);
    xstrcat(state_file, "/sicp_state");
    state_fd = open(state_file, O_RDONLY);
    if (state_fd < 0) {
        error("Could not open job state file %s: %m", state_file);
        unlock_state_files();
        xfree(state_file);
        return;
    } else if (fstat(state_fd, &stat_buf) < 0) {
        error("Could not stat job state file %s: %m", state_file);
        unlock_state_files();
        (void) close(state_fd);
        xfree(state_file);
        return;
    } else if (stat_buf.st_size < 10) {
        error("Job state file %s too small", state_file);
        unlock_state_files();
        (void) close(state_fd);
        xfree(state_file);
        return;
    }

    data_allocated = BUF_SIZE;
    data = xmalloc(data_allocated);
    while (1) {
        data_read = read(state_fd, &data[data_size], BUF_SIZE);
        if (data_read < 0) {
            if (errno == EINTR)
                continue;
            else {
                error("Read error on %s: %m", state_file);
                break;
            }
        } else if (data_read == 0)	/* eof */
            break;
        data_size      += data_read;
        data_allocated += data_read;
        xrealloc(data, data_allocated);
    }
    close(state_fd);
    xfree(state_file);
    unlock_state_files();

    buffer = create_buf(data, data_size);
    safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer);
    debug3("Version string in sicp_state header is %s", ver_str);
    if (ver_str && !strcmp(ver_str, "PROTOCOL_VERSION"))
        safe_unpack16(&protocol_version, buffer);
    xfree(ver_str);

    if (protocol_version == (uint16_t)NO_VAL) {
        error("************************************************");
        error("Can not recover SICP state, incompatible version");
        error("************************************************");
        xfree(ver_str);
        free_buf(buffer);
        return;
    }
    safe_unpack_time(&buf_time, buffer);

    now = time(NULL);
    while (remaining_buf(buffer) > 0) {
        safe_unpack32(&job_id,    buffer);
        safe_unpack32(&job_state, buffer);
        sicp_ptr = xmalloc(sizeof(sicp_job_t));
        sicp_ptr->job_id      = job_id;
        sicp_ptr->job_state   = job_state;
        sicp_ptr->update_time = now;
        list_append(sicp_job_list, sicp_ptr);
        _add_job_hash(sicp_ptr);
        sicp_cnt++;
    }

    free_buf(buffer);
    info("Recovered information about %d sicp jobs", sicp_cnt);
    if (slurm_get_debug_flags() & DEBUG_FLAG_SICP)
        _log_sicp_recs();
    return;

unpack_error:
    error("Incomplete sicp data checkpoint file");
    info("Recovered information about %d sicp jobs", sicp_cnt);
    free_buf(buffer);
    return;
}
Example #8
0
File: sicp.c Project: rohgarg/slurm
static void _dump_sicp_state(void)
{
    char *old_file, *new_file, *reg_file;
    ListIterator sicp_iterator;
    sicp_job_t *sicp_ptr;
    Buf buffer;
    time_t now = time(NULL);
    int error_code = SLURM_SUCCESS, len, log_fd;

    pthread_mutex_lock(&sicp_lock);
    len = list_count(sicp_job_list) * 4 + 128;
    buffer = init_buf(len);

    packstr("PROTOCOL_VERSION", buffer);
    pack16(SLURM_PROTOCOL_VERSION, buffer);
    pack_time(now, buffer);

    sicp_iterator = list_iterator_create(sicp_job_list);
    while ((sicp_ptr = (sicp_job_t *) list_next(sicp_iterator))) {
        pack32(sicp_ptr->job_id, buffer);
        pack16(sicp_ptr->job_state, buffer);
    }
    list_iterator_destroy(sicp_iterator);
    pthread_mutex_unlock(&sicp_lock);

    old_file = xstrdup(slurmctld_conf.state_save_location);
    xstrcat(old_file, "/sicp_state.old");
    reg_file = xstrdup(slurmctld_conf.state_save_location);
    xstrcat(reg_file, "/sicp_state");
    new_file = xstrdup(slurmctld_conf.state_save_location);
    xstrcat(new_file, "/sicp_state.new");

    lock_state_files();
    log_fd = creat(new_file, 0600);
    if (log_fd < 0) {
        error("Can't save state, create file %s error %m",
              new_file);
        error_code = errno;
    } else {
        int pos = 0, nwrite, amount, rc;
        char *data;

        fd_set_close_on_exec(log_fd);
        nwrite = get_buf_offset(buffer);
        data = (char *)get_buf_data(buffer);
        while (nwrite > 0) {
            amount = write(log_fd, &data[pos], nwrite);
            if ((amount < 0) && (errno != EINTR)) {
                error("Error writing file %s, %m", new_file);
                error_code = errno;
                break;
            }
            nwrite -= amount;
            pos    += amount;
        }

        rc = fsync_and_close(log_fd, "sicp");
        if (rc && !error_code)
            error_code = rc;
    }
    if (error_code) {
        (void) unlink(new_file);
    } else {			/* file shuffle */
        (void) unlink(old_file);
        if (link(reg_file, old_file))
            debug4("unable to create link for %s -> %s: %m",
                   reg_file, old_file);
        (void) unlink(reg_file);
        if (link(new_file, reg_file))
            debug4("unable to create link for %s -> %s: %m",
                   new_file, reg_file);
        (void) unlink(new_file);
    }
    xfree(old_file);
    xfree(reg_file);
    xfree(new_file);
    unlock_state_files();

    free_buf(buffer);
}