Example #1
extern int slurm_persist_conn_open_without_init(
	slurm_persist_conn_t *persist_conn)
{
	slurm_addr_t addr;

	xassert(persist_conn);
	xassert(persist_conn->rem_host);
	xassert(persist_conn->rem_port);
	xassert(persist_conn->cluster_name);

	if (persist_conn->fd > 0)
		_close_fd(&persist_conn->fd);
	else
		persist_conn->fd = -1;

	if (!persist_conn->inited)
		persist_conn->inited = true;

	if (!persist_conn->version) {
		/* Set to MIN_PROTOCOL so that a higher version controller can
		 * talk to a lower protocol version controller. When talking to
		 * the DBD, the protocol version should be set to the current
		 * protocol version prior to calling this. */
		persist_conn->version = SLURM_MIN_PROTOCOL_VERSION;
	}
	if (persist_conn->timeout < 0)
		persist_conn->timeout = slurm_get_msg_timeout() * 1000;

	slurm_set_addr_char(&addr, persist_conn->rem_port,
			    persist_conn->rem_host);
	if ((persist_conn->fd = slurm_open_msg_conn(&addr)) < 0) {
		if (_comm_fail_log(persist_conn)) {
			char *s = xstrdup_printf("%s: failed to open persistent connection to %s:%d: %m",
						 __func__,
						 persist_conn->rem_host,
						 persist_conn->rem_port);
			if (persist_conn->flags & PERSIST_FLAG_SUPPRESS_ERR)
				debug2("%s", s);
			else
				error("%s", s);
			xfree(s);
		}
		return SLURM_ERROR;
	}
	fd_set_nonblocking(persist_conn->fd);
	fd_set_close_on_exec(persist_conn->fd);

	return SLURM_SUCCESS;
}
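A pattern worth calling out before the remaining examples: slurm_get_msg_timeout() returns a value in seconds, and nearly every caller multiplies it by 1000 to get the millisecond timeout the connection layer expects, but only when no explicit timeout was supplied. A minimal standalone sketch of that convention (the 10-second default in config_timeout_secs() is an illustrative stand-in, not Slurm's configured value):

#include <stdio.h>

/* Illustrative stand-in for slurm_get_msg_timeout(); the 10 second default
 * is an example value only. */
static int config_timeout_secs(void)
{
	return 10;
}

/* Resolve a caller-supplied timeout: a negative value means "use the
 * configured default", and the result is always in milliseconds. */
static int resolve_timeout_ms(int requested_ms)
{
	if (requested_ms < 0)
		return config_timeout_secs() * 1000;
	return requested_ms;
}

int main(void)
{
	printf("%d\n", resolve_timeout_ms(-1));   /* 10000: fall back */
	printf("%d\n", resolve_timeout_ms(2500)); /* 2500: caller's value */
	return 0;
}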
Example #2
static void _start_msg_tree_internal(hostlist_t hl, hostlist_t* sp_hl,
				     fwd_tree_t *fwd_tree_in,
				     int hl_count)
{
	int j;
	fwd_tree_t *fwd_tree;

	xassert((hl || sp_hl) && !(hl && sp_hl));
	xassert(fwd_tree_in);
	xassert(fwd_tree_in->p_thr_count);
	xassert(fwd_tree_in->tree_mutex);
	xassert(fwd_tree_in->notify);
	xassert(fwd_tree_in->ret_list);

	if (hl)
		xassert(hl_count == hostlist_count(hl));

	if (fwd_tree_in->timeout <= 0)
		/* convert secs to msec */
		fwd_tree_in->timeout  = slurm_get_msg_timeout() * 1000;

	for (j = 0; j < hl_count; j++) {
		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		memcpy(fwd_tree, fwd_tree_in, sizeof(fwd_tree_t));

		if (sp_hl) {
			fwd_tree->tree_hl = sp_hl[j];
			sp_hl[j] = NULL;
		} else if (hl) {
			char *name = hostlist_shift(hl);
			fwd_tree->tree_hl = hostlist_create(name);
			free(name);
		}

		/*
		 * Lock and increase the thread counter; we need that to
		 * protect the start_msg_tree waiting loop, which was
		 * originally designed around a "while ((count < host_count))"
		 * loop. In cases where a fwd thread was not able to get all
		 * the return codes from its children, that waiting loop
		 * would deadlock.
		 */
		slurm_mutex_lock(fwd_tree->tree_mutex);
		(*fwd_tree->p_thr_count)++;
		slurm_mutex_unlock(fwd_tree->tree_mutex);

		slurm_thread_create_detached(NULL, _fwd_tree_thread, fwd_tree);
	}
}
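The comment block above captures the key idea of this family of examples: the spawner bumps a shared thread counter under the tree mutex before creating each detached worker, and the parent later waits on that counter instead of counting responses. A stripped-down POSIX sketch of that hand-off, using plain pthreads rather than Slurm's slurm_thread_create_detached() wrapper:

#include <pthread.h>
#include <stdio.h>

#define NUM_WORKERS 4

static pthread_mutex_t tree_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  notify     = PTHREAD_COND_INITIALIZER;
static int thr_count;			/* detached workers still running */

static void *worker(void *arg)
{
	(void) arg;
	/* ... forward the message and collect responses here ... */
	pthread_mutex_lock(&tree_mutex);
	thr_count--;			/* this worker is done */
	pthread_cond_signal(&notify);
	pthread_mutex_unlock(&tree_mutex);
	return NULL;
}

int main(void)
{
	pthread_attr_t attr;
	pthread_t tid;
	int i;

	pthread_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);

	for (i = 0; i < NUM_WORKERS; i++) {
		/* Count the worker before it exists so the waiter below can
		 * never see thr_count == 0 while a spawn is still pending. */
		pthread_mutex_lock(&tree_mutex);
		thr_count++;
		pthread_mutex_unlock(&tree_mutex);
		if (pthread_create(&tid, &attr, worker, NULL)) {
			pthread_mutex_lock(&tree_mutex);
			thr_count--;	/* roll back a failed spawn */
			pthread_mutex_unlock(&tree_mutex);
		}
	}
	pthread_attr_destroy(&attr);

	/* Wait on the number of live workers, not on the number of results;
	 * this is exactly the change Examples #15 and #16 describe. */
	pthread_mutex_lock(&tree_mutex);
	while (thr_count > 0)
		pthread_cond_wait(&notify, &tree_mutex);
	pthread_mutex_unlock(&tree_mutex);

	printf("all workers finished\n");
	return 0;
}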
Example #3
static void _load_slurm_config(void)
{
	acct_storage_backup_host = slurm_get_accounting_storage_backup_host();
	acct_storage_host = slurm_get_accounting_storage_host();
	acct_storage_loc  = slurm_get_accounting_storage_loc();
	acct_storage_pass = slurm_get_accounting_storage_pass();
	acct_storage_port = slurm_get_accounting_storage_port();
	acct_storage_type = slurm_get_accounting_storage_type();
	acct_storage_user = slurm_get_accounting_storage_user();
	auth_type = slurm_get_auth_type();
	msg_timeout = slurm_get_msg_timeout();
	plugin_dir = slurm_get_plugin_dir();
	private_data = slurm_get_private_data();
	slurm_user_id = slurm_get_slurm_user_id();
	track_wckey = slurm_get_track_wckey();
}
Example #4
static void _forward_msg_internal(hostlist_t hl, hostlist_t* sp_hl,
				  forward_struct_t *fwd_struct,
				  header_t *header, int timeout,
				  int hl_count)
{
	int j;
	forward_msg_t *fwd_msg = NULL;
	char *buf = NULL, *tmp_char = NULL;

	if (timeout <= 0)
		/* convert secs to msec */
		timeout  = slurm_get_msg_timeout() * 1000;

	for (j = 0; j < hl_count; j++) {
		fwd_msg = xmalloc(sizeof(forward_msg_t));

		fwd_msg->fwd_struct = fwd_struct;

		fwd_msg->timeout = timeout;

		memcpy(&fwd_msg->header.orig_addr,
		       &header->orig_addr,
		       sizeof(slurm_addr_t));

		fwd_msg->header.version = header->version;
		fwd_msg->header.flags = header->flags;
		fwd_msg->header.msg_type = header->msg_type;
		fwd_msg->header.body_length = header->body_length;
		fwd_msg->header.ret_list = NULL;
		fwd_msg->header.ret_cnt = 0;

		if (sp_hl) {
			buf = hostlist_ranged_string_xmalloc(sp_hl[j]);
			hostlist_destroy(sp_hl[j]);
		} else {
			tmp_char = hostlist_shift(hl);
			buf = xstrdup(tmp_char);
			free(tmp_char);
		}

		forward_init(&fwd_msg->header.forward, NULL);
		fwd_msg->header.forward.nodelist = buf;
		slurm_thread_create_detached(NULL, _forward_thread, fwd_msg);
	}
}
Example #5
static void _load_config(void)
{
	char *sched_params, *select_type, *tmp_ptr;

	sched_timeout = slurm_get_msg_timeout() / 2;
	sched_timeout = MAX(sched_timeout, 1);
	sched_timeout = MIN(sched_timeout, 10);

	sched_params = slurm_get_sched_params();

	if (sched_params && (tmp_ptr=strstr(sched_params, "interval=")))
		builtin_interval = atoi(tmp_ptr + 9);
	if (builtin_interval < 1) {
		error("Invalid SchedulerParameters interval: %d",
		      builtin_interval);
		builtin_interval = BACKFILL_INTERVAL;
	}

	if (sched_params && (tmp_ptr=strstr(sched_params, "max_job_bf=")))
		max_sched_job_cnt = atoi(tmp_ptr + 11);
	if (sched_params && (tmp_ptr=strstr(sched_params, "bf_max_job_test=")))
		max_sched_job_cnt = atoi(tmp_ptr + 16);
	if (max_sched_job_cnt < 1) {
		error("Invalid SchedulerParameters bf_max_job_test: %d",
		      max_sched_job_cnt);
		max_sched_job_cnt = 50;
	}
	xfree(sched_params);

	select_type = slurm_get_select_type();
	if (!xstrcmp(select_type, "select/serial")) {
		/* Do not spend time computing expected start time for
		 * pending jobs */
		max_sched_job_cnt = 0;
		stop_builtin_agent();
	}
	xfree(select_type);
}
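_load_config() pulls its knobs out of the SchedulerParameters string with repeated strstr()+atoi() calls and clamps anything invalid back to a default. A self-contained sketch of that parsing idiom (the option string and default values below are made up for illustration; note that strstr() matches substrings, which is why the real code reads max_job_bf= before letting bf_max_job_test= override it):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Pull "name=<int>" out of a comma-separated options string, falling back to
 * def if the key is absent or the value is not a positive integer.  This
 * mirrors the strstr()+atoi() idiom in _load_config(); the option string and
 * defaults here are only illustrative. */
static int get_int_option(const char *params, const char *name, int def)
{
	const char *p;
	int val;

	if (!params || !(p = strstr(params, name)))
		return def;
	val = atoi(p + strlen(name));
	if (val < 1) {
		fprintf(stderr, "invalid value for %s, using %d\n", name, def);
		return def;
	}
	return val;
}

int main(void)
{
	const char *sched_params = "interval=30,bf_max_job_test=100";

	printf("interval=%d\n",
	       get_int_option(sched_params, "interval=", 10));
	printf("bf_max_job_test=%d\n",
	       get_int_option(sched_params, "bf_max_job_test=", 50));
	printf("max_job_bf=%d\n",
	       get_int_option(sched_params, "max_job_bf=", 50));
	return 0;
}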
Example #6
static void *_msg_thread(void *x)
{
	struct msg_arg *msg_arg_ptr = (struct msg_arg *) x;
	int rc, timeout;
	slurm_msg_t msg_send;

	slurm_msg_t_init(&msg_send);

	debug2("KVS_Barrier msg to %s:%hu",
		msg_arg_ptr->bar_ptr->hostname,
		msg_arg_ptr->bar_ptr->port);
	msg_send.msg_type = PMI_KVS_GET_RESP;
	msg_send.data = (void *) msg_arg_ptr->kvs_ptr;
	slurm_set_addr(&msg_send.address,
		msg_arg_ptr->bar_ptr->port,
		msg_arg_ptr->bar_ptr->hostname);

	timeout = slurm_get_msg_timeout() * 10000;
	if (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
		error("slurm_send_recv_rc_msg_only_one to %s:%hu : %m",
		      msg_arg_ptr->bar_ptr->hostname,
		      msg_arg_ptr->bar_ptr->port);
	} else if (rc != SLURM_SUCCESS) {
		error("KVS_Barrier confirm from %s, rc=%d",
			msg_arg_ptr->bar_ptr->hostname, rc);
	} else {
		/* successfully transmitted KVS keypairs */
	}

	slurm_mutex_lock(&agent_mutex);
	agent_cnt--;
	pthread_cond_signal(&agent_cond);
	slurm_mutex_unlock(&agent_mutex);
	xfree(x);
	return NULL;
}
Example #7
static void _layout_conf_dbd(GtkTreeStore *treestore)
{
	ListIterator itr = NULL;
	GtkTreeIter iter;
	config_key_pair_t *key_pair;
	int update = 0;
	time_t now = time(NULL);
	char tmp_str[128], *user_name = NULL;
	List dbd_config_list = NULL;

	/* first load accounting parms from slurm.conf */
	char *acct_storage_backup_host =
		slurm_get_accounting_storage_backup_host();
	char *acct_storage_host = slurm_get_accounting_storage_host();
	char *acct_storage_loc  = slurm_get_accounting_storage_loc();
	char *acct_storage_pass = slurm_get_accounting_storage_pass();
	uint32_t acct_storage_port = slurm_get_accounting_storage_port();
	char *acct_storage_type = slurm_get_accounting_storage_type();
	char *acct_storage_user = slurm_get_accounting_storage_user();
	char *auth_type = slurm_get_auth_type();
	uint16_t msg_timeout = slurm_get_msg_timeout();
	char *plugin_dir = slurm_get_plugin_dir();
	uint16_t private_data = slurm_get_private_data();
	uint32_t slurm_user_id = slurm_get_slurm_user_id();
	uint16_t track_wckey = slurm_get_track_wckey();

	slurm_make_time_str(&now, tmp_str, sizeof(tmp_str));
	add_display_treestore_line_with_font(
		update, treestore, &iter,
		"SLURM Configuration data as of", tmp_str, "bold");

	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStorageBackupHost",
				   acct_storage_backup_host);
	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStorageHost", acct_storage_host);
	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStorageLoc", acct_storage_loc);
	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStoragePass", acct_storage_pass);
	sprintf(tmp_str, "%u", acct_storage_port);
	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStoragePort", tmp_str);
	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStorageType", acct_storage_type);
	add_display_treestore_line(update, treestore, &iter,
				   "AccountingStorageUser", acct_storage_user);
	add_display_treestore_line(update, treestore, &iter,
				   "AuthType", auth_type);
	sprintf(tmp_str, "%u sec", msg_timeout);
	add_display_treestore_line(update, treestore, &iter,
				   "MessageTimeout", tmp_str);
	add_display_treestore_line(update, treestore, &iter,
				   "PluginDir", plugin_dir);
	private_data_string(private_data, tmp_str, sizeof(tmp_str));
	add_display_treestore_line(update, treestore, &iter,
				   "PrivateData", tmp_str);
	user_name = uid_to_string(slurm_user_id);
	sprintf(tmp_str, "%s(%u)", user_name, slurm_user_id);
	xfree(user_name);
	add_display_treestore_line(update, treestore, &iter,
				   "SlurmUserId", tmp_str);
	add_display_treestore_line(update, treestore, &iter,
				   "SLURM_CONF", default_slurm_config_file);
	add_display_treestore_line(update, treestore, &iter,
				   "SLURM_VERSION", SLURM_VERSION_STRING);
	sprintf(tmp_str, "%u", track_wckey);
	add_display_treestore_line(update, treestore, &iter,
				   "TrackWCKey", tmp_str);

	xfree(acct_storage_backup_host);
	xfree(acct_storage_host);
	xfree(acct_storage_loc);
	xfree(acct_storage_pass);
	xfree(acct_storage_type);
	xfree(acct_storage_user);
	xfree(auth_type);
	xfree(plugin_dir);

	/* second, load parms from slurmdbd.conf */
	if (!(dbd_config_list = slurmdb_config_get(NULL)))
		return;

	add_display_treestore_line_with_font(
		update, treestore, &iter,
		"\nSlurmDBD Configuration:", NULL, "bold");

	itr = list_iterator_create(dbd_config_list);
	while ((key_pair = list_next(itr))) {
		add_display_treestore_line(update, treestore, &iter,
					   key_pair->name,
					   key_pair->value);
	}
	list_iterator_destroy(itr);
}
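_layout_conf_dbd() formats several numeric configuration values into the fixed 128-byte tmp_str with sprintf(). The values involved here are small enough that this cannot overflow, but the bounded variant makes that property explicit. A tiny sketch (the port and timeout numbers are placeholders, not read from any configuration):

#include <stdio.h>

int main(void)
{
	char tmp_str[128];
	unsigned int port = 6819;	/* placeholder values */
	unsigned int timeout = 10;

	/* snprintf() never writes past the buffer and always NUL-terminates,
	 * so formatting the numeric values cannot overflow tmp_str. */
	snprintf(tmp_str, sizeof(tmp_str), "%u", port);
	printf("AccountingStoragePort: %s\n", tmp_str);

	snprintf(tmp_str, sizeof(tmp_str), "%u sec", timeout);
	printf("MessageTimeout: %s\n", tmp_str);
	return 0;
}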
Example #8
/* Open a connection to the Slurm DBD and set slurmdbd_conn */
static void _open_slurmdbd_conn(bool need_db)
{
	char *backup_host = NULL;
	int rc;

	if (slurmdbd_conn && slurmdbd_conn->fd >= 0) {
		debug("Attempt to re-open slurmdbd socket");
		/* clear errno (checked after this for errors) */
		errno = 0;
		return;
	}

	slurm_persist_conn_close(slurmdbd_conn);
	if (!slurmdbd_conn) {
		slurmdbd_conn = xmalloc(sizeof(slurm_persist_conn_t));
		slurmdbd_conn->flags =
			PERSIST_FLAG_DBD | PERSIST_FLAG_RECONNECT;
		slurmdbd_conn->persist_type = PERSIST_TYPE_DBD;

		if (!slurmdbd_cluster)
			slurmdbd_cluster = slurm_get_cluster_name();

		slurmdbd_conn->cluster_name = xstrdup(slurmdbd_cluster);

		slurmdbd_conn->timeout = (slurm_get_msg_timeout() + 35) * 1000;

		slurmdbd_conn->rem_port = slurm_get_accounting_storage_port();

		if (!slurmdbd_conn->rem_port) {
			slurmdbd_conn->rem_port = SLURMDBD_PORT;
			slurm_set_accounting_storage_port(
				slurmdbd_conn->rem_port);
		}
	}
	slurmdbd_shutdown = 0;
	slurmdbd_conn->shutdown = &slurmdbd_shutdown;
	slurmdbd_conn->version  = SLURM_PROTOCOL_VERSION;

	xfree(slurmdbd_conn->rem_host);
	slurmdbd_conn->rem_host = slurm_get_accounting_storage_host();
	if (!slurmdbd_conn->rem_host) {
		slurmdbd_conn->rem_host = xstrdup(DEFAULT_STORAGE_HOST);
		slurm_set_accounting_storage_host(
			slurmdbd_conn->rem_host);
	}

	// See if a backup slurmdbd is configured
	backup_host = slurm_get_accounting_storage_backup_host();

again:
	// A connection failure is only an error if the backup does not
	// exist or also fails
	if (backup_host)
		slurmdbd_conn->flags |= PERSIST_FLAG_SUPPRESS_ERR;
	else
		slurmdbd_conn->flags &= (~PERSIST_FLAG_SUPPRESS_ERR);

	if (((rc = slurm_persist_conn_open(slurmdbd_conn)) != SLURM_SUCCESS) &&
	    backup_host) {
		xfree(slurmdbd_conn->rem_host);
		// Force the next error to display
		slurmdbd_conn->comm_fail_time = 0;
		slurmdbd_conn->rem_host = backup_host;
		backup_host = NULL;
		goto again;
	}

	xfree(backup_host);

	if (rc == SLURM_SUCCESS) {
		/* set the timeout to the value to be used for all other
		 * messages */
		slurmdbd_conn->timeout = SLURMDBD_TIMEOUT * 1000;
		if (slurmdbd_conn->trigger_callbacks.dbd_resumed)
			(slurmdbd_conn->trigger_callbacks.dbd_resumed)();
		if (slurmdbd_conn->trigger_callbacks.db_resumed)
			(slurmdbd_conn->trigger_callbacks.db_resumed)();
	}

	if ((!need_db && (rc == ESLURM_DB_CONNECTION)) ||
	    (rc == SLURM_SUCCESS)) {
		debug("slurmdbd: Sent PersistInit msg");
		/* clear errno (checked after this for
		   errors)
		*/
		errno = 0;
	} else {
		if ((rc == ESLURM_DB_CONNECTION) &&
		    slurmdbd_conn->trigger_callbacks.db_fail)
			(slurmdbd_conn->trigger_callbacks.db_fail)();

		error("slurmdbd: Sending PersistInit msg: %m");
		slurm_persist_conn_close(slurmdbd_conn);
	}
}
Example #9
ssize_t _slurm_msg_sendto(slurm_fd_t fd, char *buffer, size_t size,
			  uint32_t flags)
{
	return _slurm_msg_sendto_timeout( fd, buffer, size, flags,
				(slurm_get_msg_timeout() * 1000));
}
Example #10
ssize_t _slurm_msg_recvfrom(slurm_fd_t fd, char **pbuf, size_t *lenp,
			    uint32_t flags)
{
	return _slurm_msg_recvfrom_timeout(fd, pbuf, lenp, flags,
				(slurm_get_msg_timeout() * 1000));
}
Example #11
static int _attempt_backfill(void)
{
	DEF_TIMERS;
	bool filter_root = false;
	List job_queue;
	job_queue_rec_t *job_queue_rec;
	slurmdb_qos_rec_t *qos_ptr = NULL;
	int i, j, node_space_recs;
	struct job_record *job_ptr;
	struct part_record *part_ptr;
	uint32_t end_time, end_reserve;
	uint32_t time_limit, comp_time_limit, orig_time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	time_t now, sched_start, later_start, start_res, resv_end;
	node_space_map_t *node_space;
	struct timeval bf_time1, bf_time2;
	static int sched_timeout = 0;
	int this_sched_timeout = 0, rc = 0;
	int job_test_count = 0;
	uint32_t *uid = NULL, nuser = 0;
	uint16_t *njobs = NULL;
	bool already_counted;

#ifdef HAVE_CRAY
	/*
	 * Run a Basil Inventory immediately before setting up the schedule
	 * plan, to avoid race conditions caused by ALPS node state change.
	 * Needs to be done with the node-state lock taken.
	 */
	START_TIMER;
	if (select_g_reconfigure()) {
		debug4("backfill: not scheduling due to ALPS");
		return SLURM_SUCCESS;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: ALPS inventory completed, %s", TIME_STR);

	/* The Basil inventory can take a long time to complete. Process
	 * pending RPCs before starting the backfill scheduling logic */
	_yield_locks(1);
#endif

	START_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: beginning");
	sched_start = now = time(NULL);
	if (sched_timeout == 0) {
		sched_timeout = slurm_get_msg_timeout() / 2;
		sched_timeout = MAX(sched_timeout, 1);
		sched_timeout = MIN(sched_timeout, 10);
	}
	this_sched_timeout = sched_timeout;

	if (slurm_get_root_filter())
		filter_root = true;

	job_queue = build_job_queue(true);
	if (list_count(job_queue) == 0) {
		debug("backfill: no jobs to backfill");
		list_destroy(job_queue);
		return 0;
	}

	gettimeofday(&bf_time1, NULL);

	slurmctld_diag_stats.bf_queue_len = list_count(job_queue);
	slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats.
						 bf_queue_len;
	slurmctld_diag_stats.bf_last_depth = 0;
	slurmctld_diag_stats.bf_last_depth_try = 0;
	slurmctld_diag_stats.bf_when_last_cycle = now;
	bf_last_ints = 0;
	slurmctld_diag_stats.bf_active = 1;

	node_space = xmalloc(sizeof(node_space_map_t) *
			     (max_backfill_job_cnt + 3));
	node_space[0].begin_time = sched_start;
	node_space[0].end_time = sched_start + backfill_window;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		_dump_node_space_table(node_space);

	if (max_backfill_job_per_user) {
		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
	}
	while ((job_queue_rec = (job_queue_rec_t *)
				list_pop_bottom(job_queue, sort_job_queue2))) {
		job_test_count++;
		job_ptr  = job_queue_rec->job_ptr;
		part_ptr = job_queue_rec->part_ptr;
		xfree(job_queue_rec);
		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* started in other partition */
		job_ptr->part_ptr = part_ptr;

		if (debug_flags & DEBUG_FLAG_BACKFILL)
			info("backfill test for job %u", job_ptr->job_id);

		slurmctld_diag_stats.bf_last_depth++;
		already_counted = false;

		if (max_backfill_job_per_user) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->user_id == uid[j]) {
					njobs[j]++;
					debug2("backfill: user %u: #jobs %u",
					       uid[j], njobs[j]);
					break;
				}
			}
			if (j == nuser) { /* user not found */
				if (nuser < BF_MAX_USERS) {
					uid[j] = job_ptr->user_id;
					njobs[j] = 1;
					nuser++;
				} else {
					error("backfill: too many users in "
					      "queue. Consider increasing "
					      "BF_MAX_USERS");
				}
				debug2("backfill: found new user %u. "
				       "Total #users now %u",
				       job_ptr->user_id, nuser);
			} else {
				if (njobs[j] > max_backfill_job_per_user) {
					/* skip job */
					debug("backfill: have already checked "
					      "%u jobs for user %u; skipping "
					      "job %u",
					      max_backfill_job_per_user,
					      job_ptr->user_id,
					      job_ptr->job_id);
					continue;
				}
			}
		}

		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
		    (part_ptr->node_bitmap == NULL))
		 	continue;
		if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)
			continue;

		if ((!job_independent(job_ptr, 0)) ||
		    (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS))
			continue;

		/* Determine minimum and maximum node counts */
		min_nodes = MAX(job_ptr->details->min_nodes,
				part_ptr->min_nodes);
		if (job_ptr->details->max_nodes == 0)
			max_nodes = part_ptr->max_nodes;
		else
			max_nodes = MIN(job_ptr->details->max_nodes,
					part_ptr->max_nodes);
		max_nodes = MIN(max_nodes, 500000);     /* prevent overflows */
		if (job_ptr->details->max_nodes)
			req_nodes = max_nodes;
		else
			req_nodes = min_nodes;
		if (min_nodes > max_nodes) {
			/* job's min_nodes exceeds partition's max_nodes */
			continue;
		}

		/* Determine job's expected completion time */
		if (job_ptr->time_limit == NO_VAL) {
			if (part_ptr->max_time == INFINITE)
				time_limit = 365 * 24 * 60; /* one year */
			else
				time_limit = part_ptr->max_time;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_ptr->max_time);
		}
		comp_time_limit = time_limit;
		orig_time_limit = job_ptr->time_limit;
		qos_ptr = job_ptr->qos_ptr;
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
			time_limit = job_ptr->time_limit = 1;
		else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
			time_limit = job_ptr->time_limit = job_ptr->time_min;

		/* Determine impact of any resource reservations */
		later_start = now;
 TRY_LATER:	FREE_NULL_BITMAP(avail_bitmap);
		start_res   = later_start;
		later_start = 0;
		exc_core_bitmap = NULL;
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				  &exc_core_bitmap);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			continue;
		}
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;
		resv_end = find_resv_end(start_res);
		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		for (j=0; ; ) {
			if ((node_space[j].end_time > start_res) &&
			     node_space[j].next && (later_start == 0))
				later_start = node_space[j].end_time;
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
		if ((resv_end++) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}

		if (job_ptr->details->exc_node_bitmap) {
			bit_not(job_ptr->details->exc_node_bitmap);
			bit_and(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
			bit_not(job_ptr->details->exc_node_bitmap);
		}

		/* Test if insufficient nodes remain OR
		 *	required nodes missing OR
		 *	nodes lack features */
		if ((bit_set_count(avail_bitmap) < min_nodes) ||
		    ((job_ptr->details->req_node_bitmap) &&
		     (!bit_super_set(job_ptr->details->req_node_bitmap,
				     avail_bitmap))) ||
		    (job_req_node_filter(job_ptr, avail_bitmap))) {
			if (later_start) {
				job_ptr->start_time = 0;
				goto TRY_LATER;
			}
			/* Job can not start until too far in the future */
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = sched_start + backfill_window;
			continue;
		}

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		if ((time(NULL) - sched_start) >= this_sched_timeout) {
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: yielding locks after testing "
				     "%d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if (_yield_locks(backfill_interval)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			job_test_count = 0;
			START_TIMER;
		}
		/* this is the time-consuming operation */
		debug2("backfill: entering _try_sched for job %u.",
		       job_ptr->job_id);

		if (!already_counted) {
			slurmctld_diag_stats.bf_last_depth_try++;
			already_counted = true;
		}

		j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);

		now = time(NULL);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = 0;	
			continue;	/* not runnable */
		}

		if (start_res > job_ptr->start_time) {
			job_ptr->start_time = start_res;
			last_job_update = now;
		}
		if (job_ptr->start_time <= now) {
			int rc = _start_job(job_ptr, resv_bitmap);
			if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)){
				job_ptr->time_limit = orig_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (orig_time_limit * 60);
			}
			else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) {
				/* Set time limit as high as possible */
				job_ptr->time_limit = comp_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (comp_time_limit * 60);
				_reset_job_time_limit(job_ptr, now,
						      node_space);
				time_limit = job_ptr->time_limit;
			} else {
				job_ptr->time_limit = orig_time_limit;
			}
			if (rc == ESLURM_ACCOUNTING_POLICY) {
				/* Unknown future start time, just skip job */
				job_ptr->start_time = 0;	
				continue;
			} else if (rc != SLURM_SUCCESS) {
				/* Planned to start job, but something bad
				 * happened. */
				job_ptr->start_time = 0;	
				break;
			} else {
				/* Started this job, move to next one */
				continue;
			}
		} else
			job_ptr->time_limit = orig_time_limit;

		if (later_start && (job_ptr->start_time > later_start)) {
			/* Try later when some nodes currently reserved for
			 * pending jobs are free */
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		if (job_ptr->start_time > (sched_start + backfill_window)) {
			/* Starts too far in the future to worry about */
			continue;
		}

		if (node_space_recs >= max_backfill_job_cnt) {
			/* Already have too many jobs to deal with */
			break;
		}

		end_reserve = job_ptr->start_time + (time_limit * 60);
		if (_test_resv_overlap(node_space, avail_bitmap,
				       job_ptr->start_time, end_reserve)) {
			/* This job overlaps with an existing reservation for
			 * a job to be backfill scheduled, which the sched
			 * plugin does not know about. Try again later. */
			later_start = job_ptr->start_time;
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		/*
		 * Add reservation to scheduling table if appropriate
		 */
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
			continue;
		bit_not(avail_bitmap);
		_add_reservation(job_ptr->start_time, end_reserve,
				 avail_bitmap, node_space, &node_space_recs);
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_node_space_table(node_space);
	}
	xfree(uid);
	xfree(njobs);
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	for (i=0; ; ) {
		FREE_NULL_BITMAP(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(node_space);
	list_destroy(job_queue);
	gettimeofday(&bf_time2, NULL);
	_do_diag_stats(&bf_time1, &bf_time2);
	if (debug_flags & DEBUG_FLAG_BACKFILL) {
		END_TIMER;
		info("backfill: completed testing %d jobs, %s",
		     job_test_count, TIME_STR);
	}
	return rc;
}
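The max_backfill_job_per_user branch in the middle of _attempt_backfill() tracks per-user job counts in two parallel arrays, uid[] and njobs[], scanning linearly for the submitting user on each pass. A self-contained sketch of just that bookkeeping (BF_MAX_USERS and the sample user IDs below are invented for the example):

#include <stdint.h>
#include <stdio.h>

#define BF_MAX_USERS 10	/* illustrative cap, not Slurm's value */

/* Count how many jobs from one user have been examined so far, using the
 * same parallel-array bookkeeping as _attempt_backfill(): uid[] holds the
 * users seen, njobs[] the per-user counts.  Returns the updated count for
 * this user, or 0 if the user table is full. */
static int count_user_job(uint32_t user_id, uint32_t *uid, uint16_t *njobs,
			  uint32_t *nuser)
{
	uint32_t j;

	for (j = 0; j < *nuser; j++) {
		if (uid[j] == user_id)
			return ++njobs[j];
	}
	if (*nuser < BF_MAX_USERS) {	/* first job seen for this user */
		uid[j] = user_id;
		njobs[j] = 1;
		(*nuser)++;
		return 1;
	}
	return 0;			/* table full; caller decides policy */
}

int main(void)
{
	uint32_t uid[BF_MAX_USERS] = {0};
	uint16_t njobs[BF_MAX_USERS] = {0};
	uint32_t nuser = 0;
	uint32_t sample[] = { 1001, 1002, 1001, 1001, 1002 };
	size_t i;

	for (i = 0; i < sizeof(sample) / sizeof(sample[0]); i++)
		printf("user %u: %d jobs so far\n", sample[i],
		       count_user_job(sample[i], uid, njobs, &nuser));
	return 0;
}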
Example #12
/* Wait for barrier and get full PMI Keyval space data */
int  slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr,
		int pmi_rank, int pmi_size)
{
	int rc, srun_fd, retries = 0, timeout = 0;
	slurm_msg_t msg_send, msg_rcv;
	slurm_addr_t slurm_addr, srun_reply_addr;
	char hostname[64];
	uint16_t port;
	kvs_get_msg_t data;
	char *env_pmi_ifhn;

	if (kvs_set_ptr == NULL)
		return EINVAL;
	*kvs_set_ptr = NULL;	/* initialization */

	if ((rc = _get_addr()) != SLURM_SUCCESS) {
		error("_get_addr: %m");
		return rc;
	}

	_set_pmi_time();

	if (pmi_fd < 0) {
		if ((pmi_fd = slurm_init_msg_engine_port(0)) < 0) {
			error("slurm_init_msg_engine_port: %m");
			return SLURM_ERROR;
		}
		fd_set_blocking(pmi_fd);
	}
	if (slurm_get_stream_addr(pmi_fd, &slurm_addr) < 0) {
		error("slurm_get_stream_addr: %m");
		return SLURM_ERROR;
	}
	/* hostname is not set here, so slurm_get_addr fails
	slurm_get_addr(&slurm_addr, &port, hostname, sizeof(hostname)); */
	port = ntohs(slurm_addr.sin_port);
	if ((env_pmi_ifhn = getenv("SLURM_PMI_RESP_IFHN"))) {
		strncpy(hostname, env_pmi_ifhn, sizeof(hostname));
		hostname[sizeof(hostname)-1] = 0;
	} else
		gethostname_short(hostname, sizeof(hostname));

	data.task_id = pmi_rank;
	data.size = pmi_size;
	data.port = port;
	data.hostname = hostname;
	slurm_msg_t_init(&msg_send);
	slurm_msg_t_init(&msg_rcv);
	msg_send.address = srun_addr;
	msg_send.msg_type = PMI_KVS_GET_REQ;
	msg_send.data = &data;

	/* Send the RPC to the local srun communication manager.
	 * Since srun can be sent thousands of messages at the same
	 * time and refuse some connections, retry as needed. Wait
	 * until all key-pairs have been sent by all tasks, then
	 * spread out the messages by task rank. Also increase the
	 * message timeout if there are many tasks, since the srun
	 * command can get very overloaded (the default timeout is
	 * 10 secs).
	 */
	_delay_rpc(pmi_rank, pmi_size);
	if      (pmi_size > 4000)	/* 240 secs */
		timeout = slurm_get_msg_timeout() * 24000;
	else if (pmi_size > 1000)	/* 120 secs */
		timeout = slurm_get_msg_timeout() * 12000;
	else if (pmi_size > 100)	/* 60 secs */
		timeout = slurm_get_msg_timeout() * 6000;
	else if (pmi_size > 10)		/* 20 secs */
		timeout = slurm_get_msg_timeout() * 2000;

	while (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
		if (retries++ > MAX_RETRIES) {
			error("slurm_get_kvs_comm_set: %m");
			return SLURM_ERROR;
		} else
			debug("get kvs retry %d", retries);
		_delay_rpc(pmi_rank, pmi_size);
	}
	if (rc != SLURM_SUCCESS) {
		error("slurm_get_kvs_comm_set error_code=%d", rc);
		return rc;
	}

	/* get the message after all tasks reach the barrier */
	srun_fd = slurm_accept_msg_conn(pmi_fd, &srun_reply_addr);
	if (srun_fd < 0) {
		error("slurm_accept_msg_conn: %m");
		return errno;
	}

	while ((rc = slurm_receive_msg(srun_fd, &msg_rcv, timeout)) != 0) {
		if (errno == EINTR)
			continue;
		error("slurm_receive_msg: %m");
		slurm_close(srun_fd);
		return errno;
	}
	if (msg_rcv.auth_cred)
		(void)g_slurm_auth_destroy(msg_rcv.auth_cred);

	if (msg_rcv.msg_type != PMI_KVS_GET_RESP) {
		error("slurm_get_kvs_comm_set msg_type=%d", msg_rcv.msg_type);
		slurm_close(srun_fd);
		return SLURM_UNEXPECTED_MSG_ERROR;
	}
	if (slurm_send_rc_msg(&msg_rcv, SLURM_SUCCESS) < 0)
		error("slurm_send_rc_msg: %m");

	slurm_close(srun_fd);
	*kvs_set_ptr = msg_rcv.data;

	rc = _forward_comm_set(*kvs_set_ptr);
	return rc;
}
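slurm_get_kvs_comm_set() scales the message timeout with the size of the job step, since a single srun fields one RPC per task. The tiers below reproduce that arithmetic in isolation; the 10-second base is an assumption standing in for slurm_get_msg_timeout(), and unlike the original (which leaves timeout at 0 for ten tasks or fewer so the send path applies its own default), the sketch always returns an explicit value:

#include <stdio.h>

/* Scale a base timeout (in seconds) by the size of the job step, echoing the
 * tiers used in slurm_get_kvs_comm_set(): bigger steps overload the single
 * srun, so they get proportionally longer to answer. */
static int kvs_timeout_ms(int base_secs, int pmi_size)
{
	if (pmi_size > 4000)
		return base_secs * 24000;	/* 240 s with a 10 s base */
	if (pmi_size > 1000)
		return base_secs * 12000;	/* 120 s */
	if (pmi_size > 100)
		return base_secs * 6000;	/*  60 s */
	if (pmi_size > 10)
		return base_secs * 2000;	/*  20 s */
	return base_secs * 1000;		/* small steps: plain timeout */
}

int main(void)
{
	int sizes[] = { 4, 64, 512, 2048, 8192 };
	size_t i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("pmi_size %5d -> %6d ms\n", sizes[i],
		       kvs_timeout_ms(10, sizes[i]));
	return 0;
}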
Example #13
/*
 * start_msg_tree  - logic to begin the forward tree and
 *                   accumulate the return codes from processes getting
 *                   the forwarded message
 *
 * IN: hl          - hostlist_t   - list of every node to send message to
 * IN: msg         - slurm_msg_t  - message to send.
 * IN: timeout     - int          - how long to wait in milliseconds.
 * RET List 	   - List containing the responses of the children
 *		     (if any) we forwarded the message to. List
 *		     containing type (ret_data_info_t).
 */
extern List start_msg_tree(hostlist_t hl, slurm_msg_t *msg, int timeout)
{
	int *span = NULL;
	fwd_tree_t *fwd_tree = NULL;
	pthread_mutex_t tree_mutex;
	pthread_cond_t notify;
	int j = 0, count = 0;
	List ret_list = NULL;
	char *name = NULL;
	int thr_count = 0;
	int host_count = 0;

	xassert(hl);
	xassert(msg);

	hostlist_uniq(hl);
	host_count = hostlist_count(hl);

	span = set_span(host_count, 0);

	slurm_mutex_init(&tree_mutex);
	pthread_cond_init(&notify, NULL);

	ret_list = list_create(destroy_data_info);

	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		fwd_tree->orig_msg = msg;
		fwd_tree->ret_list = ret_list;
		fwd_tree->timeout = timeout;
		fwd_tree->notify = &notify;
		fwd_tree->tree_mutex = &tree_mutex;

		if(fwd_tree->timeout <= 0) {
			/* convert secs to msec */
			fwd_tree->timeout  = slurm_get_msg_timeout() * 1000;
		}

		fwd_tree->tree_hl = hostlist_create(name);
		free(name);
		for (j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push(fwd_tree->tree_hl, name);
			free(name);
		}

		while (pthread_create(&thread_agent, &attr_agent,
				      _fwd_tree_thread, (void *)fwd_tree)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
		thr_count++;
	}
	xfree(span);

	slurm_mutex_lock(&tree_mutex);

	count = list_count(ret_list);
	debug2("Tree head got back %d looking for %d", count, host_count);
	while ((count < host_count)) {
		pthread_cond_wait(&notify, &tree_mutex);
		count = list_count(ret_list);
		debug2("Tree head got back %d", count);
	}
	debug2("Tree head got them all");
	slurm_mutex_unlock(&tree_mutex);

	slurm_mutex_destroy(&tree_mutex);
	pthread_cond_destroy(&notify);

	return ret_list;
}
Example #14
void *_forward_thread(void *arg)
{
	forward_msg_t *fwd_msg = (forward_msg_t *)arg;
	forward_struct_t *fwd_struct = fwd_msg->fwd_struct;
	Buf buffer = init_buf(BUF_SIZE);	/* probably enough for header */
	List ret_list = NULL;
	int fd = -1;
	ret_data_info_t *ret_data_info = NULL;
	char *name = NULL;
	hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist);
	slurm_addr_t addr;
	char *buf = NULL;
	int steps = 0;
	int start_timeout = fwd_msg->timeout;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(hl))) {
		if (slurm_conf_get_addr(name, &addr) == SLURM_ERROR) {
			error("forward_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
 			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				continue;
			}
			goto cleanup;
		}
		if ((fd = slurm_open_msg_conn(&addr)) < 0) {
			error("forward_thread to %s: %m", name);

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(
				&fwd_struct->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}
		buf = hostlist_ranged_string_xmalloc(hl);

		xfree(fwd_msg->header.forward.nodelist);
		fwd_msg->header.forward.nodelist = buf;
		fwd_msg->header.forward.cnt = hostlist_count(hl);
#if 0
		info("sending %d forwards (%s) to %s",
		     fwd_msg->header.forward.cnt,
		     fwd_msg->header.forward.nodelist, name);
#endif
		if (fwd_msg->header.forward.nodelist[0]) {
			debug3("forward: send to %s along with %s",
			       name, fwd_msg->header.forward.nodelist);
		} else
			debug3("forward: send to %s ", name);

		pack_header(&fwd_msg->header, buffer);

		/* add forward data to buffer */
		if (remaining_buf(buffer) < fwd_struct->buf_len) {
			int new_size = buffer->processed + fwd_struct->buf_len;
			new_size += 1024; /* padded for paranoia */
			xrealloc_nz(buffer->head, new_size);
			buffer->size = new_size;
		}
		if (fwd_struct->buf_len) {
			memcpy(&buffer->head[buffer->processed],
			       fwd_struct->buf, fwd_struct->buf_len);
			buffer->processed += fwd_struct->buf_len;
		}

		/*
		 * forward message
		 */
		if (slurm_msg_sendto(fd,
				     get_buf_data(buffer),
				     get_buf_offset(buffer),
				     SLURM_PROTOCOL_NO_SEND_RECV_FLAGS ) < 0) {
			error("forward_thread: slurm_msg_sendto: %m");

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}

		/* These messages don't get a reply, but if we got this far
		 * the send worked, so record every node in the list as a
		 * success.
		 */
		if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) ||
		    (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) ||
		    (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			ret_data_info = xmalloc(sizeof(ret_data_info_t));
			list_push(fwd_struct->ret_list, ret_data_info);
			ret_data_info->node_name = xstrdup(name);
			free(name);
			while ((name = hostlist_shift(hl))) {
				ret_data_info =
					xmalloc(sizeof(ret_data_info_t));
				list_push(fwd_struct->ret_list, ret_data_info);
				ret_data_info->node_name = xstrdup(name);
				free(name);
			}
			goto cleanup;
		}

		if (fwd_msg->header.forward.cnt > 0) {
			static int message_timeout = -1;
			if (message_timeout < 0)
				message_timeout =
					slurm_get_msg_timeout() * 1000;
			if (!fwd_msg->header.forward.tree_width)
				fwd_msg->header.forward.tree_width =
					slurm_get_tree_width();
			steps = (fwd_msg->header.forward.cnt+1) /
					fwd_msg->header.forward.tree_width;
			fwd_msg->timeout = (message_timeout*steps);
			/* info("got %d * %d = %d", message_timeout, */
			/*      steps, fwd_msg->timeout); */
			steps++;
			fwd_msg->timeout += (start_timeout*steps);
			/* info("now  + %d*%d = %d", start_timeout, */
			/*      steps, fwd_msg->timeout); */
		}

		ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout);
		/* info("sent %d forwards got %d back", */
		/*      fwd_msg->header.forward.cnt, list_count(ret_list)); */

		if (!ret_list || (fwd_msg->header.forward.cnt != 0
				  && list_count(ret_list) <= 1)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			FREE_NULL_LIST(ret_list);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				continue;
			}
			goto cleanup;
		} else if ((fwd_msg->header.forward.cnt+1)
			  != list_count(ret_list)) {
			/* This should never be reached, since the code above
			   should catch the failed forwards and pipe them
			   back down; it is here so we never have to worry
			   about a locked mutex. */
			ListIterator itr = NULL;
			char *tmp = NULL;
			int first_node_found = 0;
			hostlist_iterator_t host_itr
				= hostlist_iterator_create(hl);
			error("We shouldn't be here.  We forwarded to %d "
			      "but only got %d back",
			      (fwd_msg->header.forward.cnt+1),
			      list_count(ret_list));
			while ((tmp = hostlist_next(host_itr))) {
				int node_found = 0;
				itr = list_iterator_create(ret_list);
				while ((ret_data_info = list_next(itr))) {
					if (!ret_data_info->node_name) {
						first_node_found = 1;
						ret_data_info->node_name =
							xstrdup(name);
					}
					if (!xstrcmp(tmp,
						   ret_data_info->node_name)) {
						node_found = 1;
						break;
					}
				}
				list_iterator_destroy(itr);
				if (!node_found) {
					mark_as_failed_forward(
						&fwd_struct->ret_list,
						tmp,
						SLURM_COMMUNICATIONS_CONNECTION_ERROR);
				}
				free(tmp);
			}
			hostlist_iterator_destroy(host_itr);
			if (!first_node_found) {
				mark_as_failed_forward(
					&fwd_struct->ret_list,
					name,
					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			}
		}
		break;
	}
	slurm_mutex_lock(&fwd_struct->forward_mutex);
	if (ret_list) {
		while ((ret_data_info = list_pop(ret_list)) != NULL) {
			if (!ret_data_info->node_name) {
				ret_data_info->node_name = xstrdup(name);
			}
			list_push(fwd_struct->ret_list, ret_data_info);
			debug3("got response from %s",
			       ret_data_info->node_name);
		}
		FREE_NULL_LIST(ret_list);
	}
	free(name);
cleanup:
	if ((fd >= 0) && slurm_close(fd) < 0)
		error ("close(%d): %m", fd);
	hostlist_destroy(hl);
	destroy_forward(&fwd_msg->header.forward);
	free_buf(buffer);
	slurm_cond_signal(&fwd_struct->notify);
	slurm_mutex_unlock(&fwd_struct->forward_mutex);
	xfree(fwd_msg);

	return (NULL);
}
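The forward.cnt > 0 branch near the end of _forward_thread() derives its receive timeout from the shape of the forwarding tree: roughly one message timeout per expected level of fan-out, plus the caller's starting timeout once per level including the local hop. The helper below reproduces that arithmetic on its own; the tree width and timeout inputs are example values:

#include <stdio.h>

/* Reproduce the timeout arithmetic from the forward.cnt > 0 branch above.
 * All inputs are in milliseconds except fwd_cnt and tree_width. */
static int forward_timeout_ms(int fwd_cnt, int tree_width,
			      int message_timeout_ms, int start_timeout_ms)
{
	int steps = (fwd_cnt + 1) / tree_width;
	int timeout = message_timeout_ms * steps;

	steps++;				/* include the local hop */
	timeout += start_timeout_ms * steps;
	return timeout;
}

int main(void)
{
	int counts[] = { 0, 9, 49, 199 };
	size_t i;

	for (i = 0; i < sizeof(counts) / sizeof(counts[0]); i++)
		printf("cnt=%3d -> %d ms\n", counts[i],
		       forward_timeout_ms(counts[i], 50, 10000, 10000));
	return 0;
}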
Example #15
static void _start_msg_tree_internal(hostlist_t hl, hostlist_t* sp_hl,
				     fwd_tree_t *fwd_tree_in,
				     int hl_count)
{
	int j;
	fwd_tree_t *fwd_tree;

	xassert((hl || sp_hl) && !(hl && sp_hl));
	xassert(fwd_tree_in);
	xassert(fwd_tree_in->p_thr_count);
	xassert(fwd_tree_in->tree_mutex);
	xassert(fwd_tree_in->notify);
	xassert(fwd_tree_in->ret_list);

	if (hl)
		xassert(hl_count == hostlist_count(hl));

	if (fwd_tree_in->timeout <= 0)
		/* convert secs to msec */
		fwd_tree_in->timeout  = slurm_get_msg_timeout() * 1000;

	for (j = 0; j < hl_count; j++) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		memcpy(fwd_tree, fwd_tree_in, sizeof(fwd_tree_t));

		if (sp_hl) {
			fwd_tree->tree_hl = sp_hl[j];
			sp_hl[j] = NULL;
		} else if (hl) {
			char *name = hostlist_shift(hl);
			fwd_tree->tree_hl = hostlist_create(name);
			free(name);
		}

		/*
		 * Lock and increase the thread counter; we need that to
		 * protect the start_msg_tree waiting loop, which was
		 * originally designed around a "while ((count < host_count))"
		 * loop. In cases where a fwd thread was not able to get all
		 * the return codes from its children, that waiting loop
		 * would deadlock.
		 */
		slurm_mutex_lock(fwd_tree->tree_mutex);
		(*fwd_tree->p_thr_count)++;
		slurm_mutex_unlock(fwd_tree->tree_mutex);

		while (pthread_create(&thread_agent, &attr_agent,
				      _fwd_tree_thread, (void *)fwd_tree)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			usleep(100000);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);

	}
}
Example #16
File: forward.c Project: IFCA/slurm
/*
 * start_msg_tree  - logic to begin the forward tree and
 *                   accumulate the return codes from processes getting
 *                   the forwarded message
 *
 * IN: hl          - hostlist_t   - list of every node to send message to
 * IN: msg         - slurm_msg_t  - message to send.
 * IN: timeout     - int          - how long to wait in milliseconds.
 * RET List 	   - List containing the responses of the children
 *		     (if any) we forwarded the message to. List
 *		     containing type (ret_data_info_t).
 */
extern List start_msg_tree(hostlist_t hl, slurm_msg_t *msg, int timeout)
{
	int *span = NULL;
	fwd_tree_t *fwd_tree = NULL;
	pthread_mutex_t tree_mutex;
	pthread_cond_t notify;
	int j = 0, count = 0;
	List ret_list = NULL;
	char *name = NULL;
	int thr_count = 0;
	int host_count = 0;

	xassert(hl);
	xassert(msg);

	hostlist_uniq(hl);
	host_count = hostlist_count(hl);

	span = set_span(host_count, 0);

	slurm_mutex_init(&tree_mutex);
	pthread_cond_init(&notify, NULL);

	ret_list = list_create(destroy_data_info);

	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		fwd_tree->orig_msg = msg;
		fwd_tree->ret_list = ret_list;
		fwd_tree->timeout = timeout;
		fwd_tree->notify = &notify;
		fwd_tree->p_thr_count = &thr_count;
		fwd_tree->tree_mutex = &tree_mutex;

		if (fwd_tree->timeout <= 0) {
			/* convert secs to msec */
			fwd_tree->timeout  = slurm_get_msg_timeout() * 1000;
		}

		fwd_tree->tree_hl = hostlist_create(name);
		free(name);
		for (j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push(fwd_tree->tree_hl, name);
			free(name);
		}

		/*
		 * Lock and increase the thread counter; we need that to
		 * protect the start_msg_tree waiting loop, which was
		 * originally designed around a "while ((count < host_count))"
		 * loop. In cases where a fwd thread was not able to get all
		 * the return codes from its children, that waiting loop
		 * would deadlock.
		 */
		slurm_mutex_lock(&tree_mutex);
		thr_count++;
		slurm_mutex_unlock(&tree_mutex);

		while (pthread_create(&thread_agent, &attr_agent,
				      _fwd_tree_thread, (void *)fwd_tree)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);

	}
	xfree(span);

	slurm_mutex_lock(&tree_mutex);

	count = list_count(ret_list);
	debug2("Tree head got back %d looking for %d", count, host_count);
	while (thr_count > 0) {
		pthread_cond_wait(&notify, &tree_mutex);
		count = list_count(ret_list);
		debug2("Tree head got back %d", count);
	}
	xassert(count >= host_count);	/* Tree head did not get all responses,
					 * but no more active fwd threads!*/
	slurm_mutex_unlock(&tree_mutex);

	slurm_mutex_destroy(&tree_mutex);
	pthread_cond_destroy(&notify);

	return ret_list;
}
Example #17
File: forward.c Project: IFCA/slurm
/*
 * forward_msg        - logic to forward a message which has been received and
 *                      accumulate the return codes from processes getting
 *                      the forwarded message
 *
 * IN: forward_struct - forward_struct_t *   - holds information about message
 *                                             that needs to be forwarded to
 *                                             child processes
 * IN: header         - header_t             - header from message that came in
 *                                             needing to be forwarded.
 * RET: SLURM_SUCCESS - int
 */
extern int forward_msg(forward_struct_t *forward_struct,
		       header_t *header)
{
	int j = 0;
	int retries = 0;
	forward_msg_t *forward_msg = NULL;
	int thr_count = 0;
	int *span = set_span(header->forward.cnt, 0);
	hostlist_t hl = NULL;
	hostlist_t forward_hl = NULL;
	char *name = NULL;

	if (!forward_struct->ret_list) {
		error("didn't get a ret_list from forward_struct");
		xfree(span);
		return SLURM_ERROR;
	}
	hl = hostlist_create(header->forward.nodelist);
	hostlist_uniq(hl);

	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		char *buf = NULL;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		forward_msg = &forward_struct->forward_msg[thr_count];
		forward_msg->ret_list = forward_struct->ret_list;

		forward_msg->timeout = forward_struct->timeout;

		if (forward_msg->timeout <= 0) {
			/* convert secs to msec */
			forward_msg->timeout  = slurm_get_msg_timeout() * 1000;
		}

		forward_msg->notify = &forward_struct->notify;
		forward_msg->forward_mutex = &forward_struct->forward_mutex;
		forward_msg->buf_len = forward_struct->buf_len;
		forward_msg->buf = forward_struct->buf;

		memcpy(&forward_msg->header.orig_addr,
		       &header->orig_addr,
		       sizeof(slurm_addr_t));

		forward_msg->header.version = header->version;
		forward_msg->header.flags = header->flags;
		forward_msg->header.msg_type = header->msg_type;
		forward_msg->header.body_length = header->body_length;
		forward_msg->header.ret_list = NULL;
		forward_msg->header.ret_cnt = 0;

		forward_hl = hostlist_create(name);
		free(name);
		for(j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push(forward_hl, name);
			free(name);
		}

		buf = hostlist_ranged_string_xmalloc(forward_hl);
		hostlist_destroy(forward_hl);
		forward_init(&forward_msg->header.forward, NULL);
		forward_msg->header.forward.nodelist = buf;
		while (pthread_create(&thread_agent, &attr_agent,
				     _forward_thread,
				     (void *)forward_msg)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
		thr_count++;
	}
	hostlist_destroy(hl);
	xfree(span);
	return SLURM_SUCCESS;
}
Example #18
/* Open a connection to the Slurm DBD and set slurmdbd_conn */
static void _open_slurmdbd_conn(bool need_db)
{
	bool try_backup = true;
	int rc;

	if (slurmdbd_conn && slurmdbd_conn->fd >= 0) {
		debug("Attempt to re-open slurmdbd socket");
		/* clear errno (checked after this for errors) */
		errno = 0;
		return;
	}

	slurm_persist_conn_close(slurmdbd_conn);
	if (!slurmdbd_conn) {
		slurmdbd_conn = xmalloc(sizeof(slurm_persist_conn_t));
		slurmdbd_conn->flags =
			PERSIST_FLAG_DBD | PERSIST_FLAG_RECONNECT;
		slurmdbd_conn->persist_type = PERSIST_TYPE_DBD;

		if (!slurmdbd_cluster)
			slurmdbd_cluster = slurm_get_cluster_name();

		slurmdbd_conn->cluster_name = xstrdup(slurmdbd_cluster);

		slurmdbd_conn->timeout = (slurm_get_msg_timeout() + 35) * 1000;

		slurmdbd_conn->rem_port = slurm_get_accounting_storage_port();

		if (!slurmdbd_conn->rem_port) {
			slurmdbd_conn->rem_port = SLURMDBD_PORT;
			slurm_set_accounting_storage_port(
				slurmdbd_conn->rem_port);
		}
	}
	slurmdbd_shutdown = 0;
	slurmdbd_conn->shutdown = &slurmdbd_shutdown;
	slurmdbd_conn->version  = SLURM_PROTOCOL_VERSION;

	xfree(slurmdbd_conn->rem_host);
	slurmdbd_conn->rem_host = slurm_get_accounting_storage_host();
	if (!slurmdbd_conn->rem_host) {
		slurmdbd_conn->rem_host = xstrdup(DEFAULT_STORAGE_HOST);
		slurm_set_accounting_storage_host(
			slurmdbd_conn->rem_host);
	}

again:

	if (((rc = slurm_persist_conn_open(slurmdbd_conn)) != SLURM_SUCCESS) &&
	    try_backup) {
		xfree(slurmdbd_conn->rem_host);
		try_backup = false;
		if ((slurmdbd_conn->rem_host =
		     slurm_get_accounting_storage_backup_host()))
			goto again;
	}

	if (rc == SLURM_SUCCESS) {
		/* set the timeout to the value to be used for all other
		 * messages */
		slurmdbd_conn->timeout = SLURMDBD_TIMEOUT * 1000;
		if (slurmdbd_conn->trigger_callbacks.dbd_resumed)
			(slurmdbd_conn->trigger_callbacks.dbd_resumed)();
		if (slurmdbd_conn->trigger_callbacks.db_resumed)
			(slurmdbd_conn->trigger_callbacks.db_resumed)();
	}

	if ((!need_db && (rc == ESLURM_DB_CONNECTION)) ||
	    (rc == SLURM_SUCCESS)) {
		debug("slurmdbd: Sent PersistInit msg");
		/* clear errno (checked after this for
		   errors)
		*/
		errno = 0;
	} else {
		if ((rc == ESLURM_DB_CONNECTION) &&
		    slurmdbd_conn->trigger_callbacks.db_fail)
			(slurmdbd_conn->trigger_callbacks.db_fail)();

		error("slurmdbd: Sending PersistInit msg: %m");
		slurm_persist_conn_close(slurmdbd_conn);
	}
}
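Both versions of _open_slurmdbd_conn() express "try the primary, then fall back to the backup exactly once" with a goto again label. The same control flow can be written as a loop; in the sketch below, try_connect() and the host names are placeholders, not Slurm APIs:

#include <stdio.h>
#include <string.h>

/* Hypothetical connect helper: returns 0 on success, -1 on failure.  Stands
 * in for the real persistent-connection open call. */
static int try_connect(const char *host)
{
	return strcmp(host, "dbd-backup") == 0 ? 0 : -1;
}

/* Try the primary host first, then fall back to the backup exactly once,
 * which is the same control flow the goto-based code above expresses. */
static int open_with_backup(const char *primary, const char *backup)
{
	const char *host = primary;
	int tried_backup = 0;

	for (;;) {
		if (try_connect(host) == 0) {
			printf("connected to %s\n", host);
			return 0;
		}
		if (backup && !tried_backup) {
			host = backup;	/* one retry against the backup */
			tried_backup = 1;
			continue;
		}
		fprintf(stderr, "could not reach %s or any backup\n", primary);
		return -1;
	}
}

int main(void)
{
	return open_with_backup("dbd-primary", "dbd-backup") ? 1 : 0;
}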
Example #19
static int _open_controller_conn(slurmdb_cluster_rec_t *cluster, bool locked)
{
	int rc;
	slurm_persist_conn_t *persist_conn = NULL;
	static int timeout = -1;

	if (timeout < 0)
		timeout = slurm_get_msg_timeout() * 1000;

	if (cluster == fed_mgr_cluster_rec) {
		info("%s: hey! how did we get here with ourselves?", __func__);
		return SLURM_ERROR;
	}

	if (!locked)
		slurm_mutex_lock(&cluster->lock);

	if (!cluster->control_host || !cluster->control_host[0] ||
	    !cluster->control_port) {
		if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)
			info("%s: Sibling cluster %s doesn't appear up yet, skipping",
			     __func__, cluster->name);
		if (!locked)
			slurm_mutex_unlock(&cluster->lock);
		return SLURM_ERROR;
	}

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)
		info("opening sibling conn to %s", cluster->name);

	if (!cluster->fed.send) {
		persist_conn = xmalloc(sizeof(slurm_persist_conn_t));

		cluster->fed.send = persist_conn;

		/* Since this connection is coming from us, make it so ;) */
		persist_conn->cluster_name = xstrdup(slurmctld_cluster_name);
		persist_conn->my_port = slurmctld_conf.slurmctld_port;
		persist_conn->rem_host = xstrdup(cluster->control_host);
		persist_conn->rem_port = cluster->control_port;
		persist_conn->shutdown = &slurmctld_config.shutdown_time;
		persist_conn->timeout = timeout; /* don't set this to 0; it
						  * could cause a deadlock */
	} else {
		persist_conn = cluster->fed.send;

		/* Perhaps a backup came up, so don't assume it was the same
		 * host or port we had before.
		 */
		xfree(persist_conn->rem_host);
		persist_conn->rem_host = xstrdup(cluster->control_host);
		persist_conn->rem_port = cluster->control_port;
	}

	rc = slurm_persist_conn_open(persist_conn);
	if (rc != SLURM_SUCCESS) {
		error("fed_mgr: Unable to open connection to cluster %s using host %s(%u)",
		      cluster->name,
		      persist_conn->rem_host, persist_conn->rem_port);
	} else if (slurmctld_conf.debug_flags & DEBUG_FLAG_FEDR)
		info("opened sibling conn to %s:%d",
		     cluster->name, persist_conn->fd);
	if (!locked)
		slurm_mutex_unlock(&cluster->lock);

	return rc;
}
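_open_controller_conn() caches the millisecond timeout in a function-local static so the configuration is read only on the first call; concurrent first calls could race on the assignment, which is benign here because every caller would compute the same value. A minimal illustration of that idiom (load_timeout_secs() is an invented stand-in for slurm_get_msg_timeout()):

#include <stdio.h>

/* Illustrative stand-in for slurm_get_msg_timeout(); prints when it runs so
 * the caching below is visible. */
static int load_timeout_secs(void)
{
	printf("(reading config)\n");
	return 10;
}

/* First call pays for the config lookup; later calls reuse the cached
 * millisecond value, mirroring the "static int timeout = -1" idiom above. */
static int cached_timeout_ms(void)
{
	static int timeout = -1;

	if (timeout < 0)
		timeout = load_timeout_secs() * 1000;
	return timeout;
}

int main(void)
{
	printf("%d\n", cached_timeout_ms());
	printf("%d\n", cached_timeout_ms());	/* no second config read */
	return 0;
}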
Example #20
static void _forward_msg_internal(hostlist_t hl, hostlist_t* sp_hl,
				  forward_struct_t *fwd_struct,
				  header_t *header, int timeout,
				  int hl_count)
{
	int j;
	forward_msg_t *fwd_msg = NULL;
	char *buf = NULL, *tmp_char = NULL;
	pthread_attr_t attr_agent;
	pthread_t thread_agent;

	if (timeout <= 0)
		/* convert secs to msec */
		timeout  = slurm_get_msg_timeout() * 1000;

	for (j = 0; j < hl_count; j++) {
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		fwd_msg = xmalloc(sizeof(forward_msg_t));

		fwd_msg->fwd_struct = fwd_struct;

		fwd_msg->timeout = timeout;

		memcpy(&fwd_msg->header.orig_addr,
		       &header->orig_addr,
		       sizeof(slurm_addr_t));

		fwd_msg->header.version = header->version;
		fwd_msg->header.flags = header->flags;
		fwd_msg->header.msg_type = header->msg_type;
		fwd_msg->header.body_length = header->body_length;
		fwd_msg->header.ret_list = NULL;
		fwd_msg->header.ret_cnt = 0;

		if (sp_hl) {
			buf = hostlist_ranged_string_xmalloc(sp_hl[j]);
			hostlist_destroy(sp_hl[j]);
		} else {
			tmp_char = hostlist_shift(hl);
			buf = xstrdup(tmp_char);
			free(tmp_char);
		}

		forward_init(&fwd_msg->header.forward, NULL);
		fwd_msg->header.forward.nodelist = buf;
		while (pthread_create(&thread_agent, &attr_agent,
				     _forward_thread,
				     (void *)fwd_msg)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			usleep(100000);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
	}
}
Example #21
/*
 * agent - party responsible for transmitting a common RPC in parallel
 *	across a set of nodes. Use agent_queue_request() if immediate
 *	execution is not essential.
 * IN pointer to agent_arg_t, which is xfree'd (including hostlist,
 *	and msg_args) upon completion
 * RET always NULL (function format just for use as pthread)
 */
void *agent(void *args)
{
	int i, delay, rc, retries = 0;
	pthread_attr_t attr_wdog;
	pthread_t thread_wdog;
	agent_arg_t *agent_arg_ptr = args;
	agent_info_t *agent_info_ptr = NULL;
	thd_t *thread_ptr;
	task_info_t *task_specific_ptr;
	time_t begin_time;

#if 0
	info("Agent_cnt is %d of %d with msg_type %d",
	     agent_cnt, MAX_AGENT_CNT, agent_arg_ptr->msg_type);
#endif
	slurm_mutex_lock(&agent_cnt_mutex);
	if (!wiki2_sched_test) {
		char *sched_type = slurm_get_sched_type();
		if (strcmp(sched_type, "sched/wiki2") == 0)
			wiki2_sched = true;
		xfree(sched_type);
		wiki2_sched_test = true;
	}

	while (1) {
		if (slurmctld_config.shutdown_time ||
		    (agent_cnt < MAX_AGENT_CNT)) {
			agent_cnt++;
			break;
		} else {	/* wait for state change and retry */
			pthread_cond_wait(&agent_cnt_cond, &agent_cnt_mutex);
		}
	}
	slurm_mutex_unlock(&agent_cnt_mutex);
	if (slurmctld_config.shutdown_time)
		goto cleanup;

	/* basic argument value tests */
	begin_time = time(NULL);
	if (_valid_agent_arg(agent_arg_ptr))
		goto cleanup;

	/* initialize the agent data structures */
	agent_info_ptr = _make_agent_info(agent_arg_ptr);
	thread_ptr = agent_info_ptr->thread_struct;

	/* start the watchdog thread */
	slurm_attr_init(&attr_wdog);
	if (pthread_attr_setdetachstate
	    (&attr_wdog, PTHREAD_CREATE_JOINABLE))
		error("pthread_attr_setdetachstate error %m");
	while (pthread_create(&thread_wdog, &attr_wdog, _wdog,
				(void *) agent_info_ptr)) {
		error("pthread_create error %m");
		if (++retries > MAX_RETRIES)
			fatal("Can't create pthread");
		usleep(10000);	/* sleep and retry */
	}
	slurm_attr_destroy(&attr_wdog);
#if 	AGENT_THREAD_COUNT < 1
	fatal("AGENT_THREAD_COUNT value is invalid");
#endif
	debug2("got %d threads to send out",agent_info_ptr->thread_count);
	/* start all the other threads (up to AGENT_THREAD_COUNT active) */
	for (i = 0; i < agent_info_ptr->thread_count; i++) {

		/* wait until "room" for another thread */
		slurm_mutex_lock(&agent_info_ptr->thread_mutex);
		while (agent_info_ptr->threads_active >=
		       AGENT_THREAD_COUNT) {
			pthread_cond_wait(&agent_info_ptr->thread_cond,
					  &agent_info_ptr->thread_mutex);
		}

		/* create thread specific data, NOTE: freed from
		 *      _thread_per_group_rpc() */
		task_specific_ptr = _make_task_data(agent_info_ptr, i);

		slurm_attr_init(&thread_ptr[i].attr);
		if (pthread_attr_setdetachstate(&thread_ptr[i].attr,
						PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");
		while ((rc = pthread_create(&thread_ptr[i].thread,
					    &thread_ptr[i].attr,
					    _thread_per_group_rpc,
					    (void *) task_specific_ptr))) {
			error("pthread_create error %m");
			if (agent_info_ptr->threads_active)
				pthread_cond_wait(&agent_info_ptr->
						  thread_cond,
						  &agent_info_ptr->
						  thread_mutex);
			else {
				slurm_mutex_unlock(&agent_info_ptr->
						     thread_mutex);
				usleep(10000);	/* sleep and retry */
				slurm_mutex_lock(&agent_info_ptr->
						   thread_mutex);
			}
		}
		slurm_attr_destroy(&thread_ptr[i].attr);
		agent_info_ptr->threads_active++;
		slurm_mutex_unlock(&agent_info_ptr->thread_mutex);
	}

	/* wait for termination of remaining threads */
	pthread_join(thread_wdog, NULL);
	delay = (int) difftime(time(NULL), begin_time);
	if (delay > (slurm_get_msg_timeout() * 2)) {
		info("agent msg_type=%u ran for %d seconds",
			agent_arg_ptr->msg_type,  delay);
	}
	slurm_mutex_lock(&agent_info_ptr->thread_mutex);
	while (agent_info_ptr->threads_active != 0) {
		pthread_cond_wait(&agent_info_ptr->thread_cond,
				&agent_info_ptr->thread_mutex);
	}
	slurm_mutex_unlock(&agent_info_ptr->thread_mutex);

      cleanup:
	_purge_agent_args(agent_arg_ptr);

	if (agent_info_ptr) {
		xfree(agent_info_ptr->thread_struct);
		xfree(agent_info_ptr);
	}
	slurm_mutex_lock(&agent_cnt_mutex);

	if (agent_cnt > 0)
		agent_cnt--;
	else {
		error("agent_cnt underflow");
		agent_cnt = 0;
	}

	if (agent_cnt && agent_cnt < MAX_AGENT_CNT)
		agent_retry(RPC_RETRY_INTERVAL, true);

	pthread_cond_broadcast(&agent_cnt_cond);
	slurm_mutex_unlock(&agent_cnt_mutex);

	return NULL;
}
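
/*
 * Illustrative usage sketch only (an assumption, not part of the original
 * source): agent() is designed to run in its own detached thread and to
 * xfree() its agent_arg_t argument, including the hostlist and msg_args,
 * when it finishes. The field names below are inferred from the comment
 * above agent(); _queue_example_rpc() itself is hypothetical, and
 * slurm_thread_create_detached() is borrowed from another example in this
 * collection and may not exist in the same source tree as agent().
 */
static void _queue_example_rpc(hostlist_t hl, uint16_t msg_type,
			       void *msg_args)
{
	agent_arg_t *agent_arg_ptr = xmalloc(sizeof(agent_arg_t));

	agent_arg_ptr->msg_type   = msg_type;
	agent_arg_ptr->hostlist   = hl;		/* freed by agent() */
	agent_arg_ptr->msg_args   = msg_args;	/* freed by agent() */
	agent_arg_ptr->node_count = hostlist_count(hl);	/* assumed field */

	/* agent() owns agent_arg_ptr from here on */
	slurm_thread_create_detached(NULL, agent, agent_arg_ptr);
}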
Example #22
0
extern void msg_aggr_add_msg(slurm_msg_t *msg, bool wait,
			     void (*resp_callback) (slurm_msg_t *msg))
{
	int count;
	static uint16_t msg_index = 1;
	static uint32_t wait_count = 0;

	if (!msg_collection.running)
		return;

	slurm_mutex_lock(&msg_collection.mutex);
	if (msg_collection.max_msgs == true) {
		slurm_cond_wait(&msg_collection.cond, &msg_collection.mutex);
	}

	msg->msg_index = msg_index++;

	/* Add msg to message collection */
	list_append(msg_collection.msg_list, msg);

	count = list_count(msg_collection.msg_list);

	/* First msg in collection; initiate new window */
	if (count == 1)
		slurm_cond_signal(&msg_collection.cond);

	/* Max msgs reached; terminate window */
	if (count >= msg_collection.max_msg_cnt) {
		msg_collection.max_msgs = true;
		slurm_cond_signal(&msg_collection.cond);
	}
	slurm_mutex_unlock(&msg_collection.mutex);

	if (wait) {
		msg_aggr_t *msg_aggr = xmalloc(sizeof(msg_aggr_t));
		uint16_t        msg_timeout;
		struct timeval  now;
		struct timespec timeout;

		msg_aggr->msg_index = msg->msg_index;
		msg_aggr->resp_callback = resp_callback;
		slurm_cond_init(&msg_aggr->wait_cond, NULL);

		slurm_mutex_lock(&msg_collection.aggr_mutex);
		list_append(msg_collection.msg_aggr_list, msg_aggr);

		msg_timeout = slurm_get_msg_timeout();
		gettimeofday(&now, NULL);
		timeout.tv_sec = now.tv_sec + msg_timeout;
		timeout.tv_nsec = now.tv_usec * 1000;

		wait_count++;
		if (pthread_cond_timedwait(&msg_aggr->wait_cond,
					   &msg_collection.aggr_mutex,
					   &timeout) == ETIMEDOUT)
			_handle_msg_aggr_ret(msg_aggr->msg_index, 1);
		wait_count--;
		slurm_mutex_unlock(&msg_collection.aggr_mutex);
		if (!msg_collection.running && !wait_count)
			slurm_mutex_destroy(&msg_collection.aggr_mutex);

		_msg_aggr_free(msg_aggr);
	}
}
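
/*
 * Illustrative sketch only (not from the original source): the
 * absolute-deadline computation used above for pthread_cond_timedwait(),
 * isolated in a hypothetical helper. The deadline is the current wall
 * clock time plus the configured message timeout in seconds.
 */
static void _set_abs_timeout(struct timespec *timeout, uint16_t msg_timeout)
{
	struct timeval now;

	gettimeofday(&now, NULL);
	timeout->tv_sec  = now.tv_sec + msg_timeout;
	timeout->tv_nsec = now.tv_usec * 1000;
}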
Example #23
0
static int _attempt_backfill(void)
{
    bool filter_root = false;
    List job_queue;
    job_queue_rec_t *job_queue_rec;
    slurmdb_qos_rec_t *qos_ptr = NULL;
    int i, j, node_space_recs;
    struct job_record *job_ptr;
    struct part_record *part_ptr;
    uint32_t end_time, end_reserve;
    uint32_t time_limit, comp_time_limit, orig_time_limit;
    uint32_t min_nodes, max_nodes, req_nodes;
    bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
    time_t now = time(NULL), sched_start, later_start, start_res;
    node_space_map_t *node_space;
    static int sched_timeout = 0;
    int this_sched_timeout = 0, rc = 0;

    sched_start = now;
    if (sched_timeout == 0) {
        sched_timeout = slurm_get_msg_timeout() / 2;
        sched_timeout = MAX(sched_timeout, 1);
        sched_timeout = MIN(sched_timeout, 10);
    }
    this_sched_timeout = sched_timeout;

#ifdef HAVE_CRAY
    /*
     * Run a Basil Inventory immediately before setting up the schedule
     * plan, to avoid race conditions caused by ALPS node state change.
     * Needs to be done with the node-state lock taken.
     */
    if (select_g_reconfigure()) {
        debug4("backfill: not scheduling due to ALPS");
        return SLURM_SUCCESS;
    }
#endif

    if (slurm_get_root_filter())
        filter_root = true;

    job_queue = build_job_queue(true);
    if (list_count(job_queue) <= 1) {
        debug("backfill: no jobs to backfill");
        list_destroy(job_queue);
        return 0;
    }

    node_space = xmalloc(sizeof(node_space_map_t) *
                         (max_backfill_job_cnt + 3));
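    /* Record 0 covers the entire backfill window with the nodes that are
     * currently available; _add_reservation() links additional records in
     * through the next index as jobs are planned */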
    node_space[0].begin_time = sched_start;
    node_space[0].end_time = sched_start + backfill_window;
    node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
    node_space[0].next = 0;
    node_space_recs = 1;
    if (debug_flags & DEBUG_FLAG_BACKFILL)
        _dump_node_space_table(node_space);

    while ((job_queue_rec = (job_queue_rec_t *)
                            list_pop_bottom(job_queue, sort_job_queue2))) {
        job_ptr  = job_queue_rec->job_ptr;
        part_ptr = job_queue_rec->part_ptr;
        xfree(job_queue_rec);
        if (!IS_JOB_PENDING(job_ptr))
            continue;	/* started in other partition */
        job_ptr->part_ptr = part_ptr;

        if (debug_flags & DEBUG_FLAG_BACKFILL)
            info("backfill test for job %u", job_ptr->job_id);

        if ((job_ptr->state_reason == WAIT_ASSOC_JOB_LIMIT) ||
                (job_ptr->state_reason == WAIT_ASSOC_RESOURCE_LIMIT) ||
                (job_ptr->state_reason == WAIT_ASSOC_TIME_LIMIT) ||
                (job_ptr->state_reason == WAIT_QOS_JOB_LIMIT) ||
                (job_ptr->state_reason == WAIT_QOS_RESOURCE_LIMIT) ||
                (job_ptr->state_reason == WAIT_QOS_TIME_LIMIT) ||
                !acct_policy_job_runnable(job_ptr)) {
            debug2("backfill: job %u is not allowed to run now. "
                   "Skipping it. State=%s. Reason=%s. Priority=%u",
                   job_ptr->job_id,
                   job_state_string(job_ptr->job_state),
                   job_reason_string(job_ptr->state_reason),
                   job_ptr->priority);
            continue;
        }

        if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
                (part_ptr->node_bitmap == NULL))
            continue;
        if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)
            continue;

        if ((!job_independent(job_ptr, 0)) ||
                (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS))
            continue;

        /* Determine minimum and maximum node counts */
        min_nodes = MAX(job_ptr->details->min_nodes,
                        part_ptr->min_nodes);
        if (job_ptr->details->max_nodes == 0)
            max_nodes = part_ptr->max_nodes;
        else
            max_nodes = MIN(job_ptr->details->max_nodes,
                            part_ptr->max_nodes);
        max_nodes = MIN(max_nodes, 500000);     /* prevent overflows */
        if (job_ptr->details->max_nodes)
            req_nodes = max_nodes;
        else
            req_nodes = min_nodes;
        if (min_nodes > max_nodes) {
            /* job's min_nodes exceeds partition's max_nodes */
            continue;
        }

        /* Determine job's expected completion time */
        if (job_ptr->time_limit == NO_VAL) {
            if (part_ptr->max_time == INFINITE)
                time_limit = 365 * 24 * 60; /* one year */
            else
                time_limit = part_ptr->max_time;
        } else {
            if (part_ptr->max_time == INFINITE)
                time_limit = job_ptr->time_limit;
            else
                time_limit = MIN(job_ptr->time_limit,
                                 part_ptr->max_time);
        }
        comp_time_limit = time_limit;
        orig_time_limit = job_ptr->time_limit;
        if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
            time_limit = job_ptr->time_limit = 1;
        else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
            time_limit = job_ptr->time_limit = job_ptr->time_min;

        /* Determine impact of any resource reservations */
        later_start = now;
TRY_LATER:
        FREE_NULL_BITMAP(avail_bitmap);
        start_res   = later_start;
        later_start = 0;
        j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap);
        if (j != SLURM_SUCCESS) {
            job_ptr->time_limit = orig_time_limit;
            continue;
        }
        if (start_res > now)
            end_time = (time_limit * 60) + start_res;
        else
            end_time = (time_limit * 60) + now;

        /* Identify usable nodes for this job */
        bit_and(avail_bitmap, part_ptr->node_bitmap);
        bit_and(avail_bitmap, up_node_bitmap);
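
        /* Walk the node_space list (terminated when next == 0) and AND the
         * availability of every record that overlaps the job's run window
         * into avail_bitmap; remember the first record boundary after
         * start_res as a possible later start time */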
        for (j=0; ; ) {
            if ((node_space[j].end_time > start_res) &&
                    node_space[j].next && (later_start == 0))
                later_start = node_space[j].end_time;
            if (node_space[j].end_time <= start_res)
                ;
            else if (node_space[j].begin_time <= end_time) {
                bit_and(avail_bitmap,
                        node_space[j].avail_bitmap);
            } else
                break;
            if ((j = node_space[j].next) == 0)
                break;
        }

        if (job_ptr->details->exc_node_bitmap) {
            bit_not(job_ptr->details->exc_node_bitmap);
            bit_and(avail_bitmap,
                    job_ptr->details->exc_node_bitmap);
            bit_not(job_ptr->details->exc_node_bitmap);
        }

        /* Test if insufficient nodes remain OR
         *	required nodes missing OR
         *	nodes lack features */
        if ((bit_set_count(avail_bitmap) < min_nodes) ||
                ((job_ptr->details->req_node_bitmap) &&
                 (!bit_super_set(job_ptr->details->req_node_bitmap,
                                 avail_bitmap))) ||
                (job_req_node_filter(job_ptr, avail_bitmap))) {
            if (later_start) {
                job_ptr->start_time = 0;
                goto TRY_LATER;
            }
            job_ptr->time_limit = orig_time_limit;
            continue;
        }

        /* Identify nodes which are definitely off limits */
        FREE_NULL_BITMAP(resv_bitmap);
        resv_bitmap = bit_copy(avail_bitmap);
        bit_not(resv_bitmap);

        if ((time(NULL) - sched_start) >= this_sched_timeout) {
            debug("backfill: loop taking too long, yielding locks");
            if (_yield_locks()) {
                debug("backfill: system state changed, "
                      "breaking out");
                rc = 1;
                break;
            } else {
                this_sched_timeout += sched_timeout;
            }
        }
        /* this is the time consuming operation */
        debug2("backfill: entering _try_sched for job %u.",
               job_ptr->job_id);
        j = _try_sched(job_ptr, &avail_bitmap,
                       min_nodes, max_nodes, req_nodes);
        debug2("backfill: finished _try_sched for job %u.",
               job_ptr->job_id);
        now = time(NULL);
        if (j != SLURM_SUCCESS) {
            job_ptr->time_limit = orig_time_limit;
            job_ptr->start_time = 0;
            continue;	/* not runnable */
        }

        if (start_res > job_ptr->start_time) {
            job_ptr->start_time = start_res;
            last_job_update = now;
        }
        if (job_ptr->start_time <= now) {
            int rc = _start_job(job_ptr, resv_bitmap);
            if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
                job_ptr->time_limit = orig_time_limit;
            else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) {
                /* Set time limit as high as possible */
                job_ptr->time_limit = comp_time_limit;
                job_ptr->end_time = job_ptr->start_time +
                                    (comp_time_limit * 60);
                _reset_job_time_limit(job_ptr, now,
                                      node_space);
                time_limit = job_ptr->time_limit;
            } else {
                job_ptr->time_limit = orig_time_limit;
            }
            if (rc == ESLURM_ACCOUNTING_POLICY) {
                /* Unknown future start time, just skip job */
                job_ptr->start_time = 0;
                continue;
            } else if (rc != SLURM_SUCCESS) {
                /* Planned to start the job, but something bad
                 * happened */
                job_ptr->start_time = 0;
                break;
            } else {
                /* Started this job, move to next one */
                continue;
            }
        } else
            job_ptr->time_limit = orig_time_limit;

        if (later_start && (job_ptr->start_time > later_start)) {
            /* Try later when some nodes currently reserved for
             * pending jobs are free */
            job_ptr->start_time = 0;
            goto TRY_LATER;
        }

        if (job_ptr->start_time > (sched_start + backfill_window)) {
            /* Starts too far in the future to worry about */
            continue;
        }

        if (node_space_recs >= max_backfill_job_cnt) {
            /* Already have too many jobs to deal with */
            break;
        }

        end_reserve = job_ptr->start_time + (time_limit * 60);
        if (_test_resv_overlap(node_space, avail_bitmap,
                               job_ptr->start_time, end_reserve)) {
            /* This job overlaps with an existing reservation for
             * a job to be backfill scheduled, which the sched
             * plugin does not know about. Try again later. */
            later_start = job_ptr->start_time;
            job_ptr->start_time = 0;
            goto TRY_LATER;
        }

        /*
         * Add reservation to scheduling table if appropriate
         */
        qos_ptr = job_ptr->qos_ptr;
        if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
            continue;
        bit_not(avail_bitmap);
        _add_reservation(job_ptr->start_time, end_reserve,
                         avail_bitmap, node_space, &node_space_recs);
        if (debug_flags & DEBUG_FLAG_BACKFILL)
            _dump_node_space_table(node_space);
    }
    FREE_NULL_BITMAP(avail_bitmap);
    FREE_NULL_BITMAP(resv_bitmap);

    for (i=0; ; ) {
        FREE_NULL_BITMAP(node_space[i].avail_bitmap);
        if ((i = node_space[i].next) == 0)
            break;
    }
    xfree(node_space);
    list_destroy(job_queue);
    return rc;
}
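
/*
 * Illustrative sketch only (not from the original source): node_space is a
 * singly linked list embedded in an array, chained through the ->next index
 * and terminated when next is 0. This hypothetical helper shows the
 * traversal idiom used above for both the availability AND pass and the
 * final cleanup loop.
 */
static void _for_each_node_space(node_space_map_t *node_space,
                                 void (*callback)(node_space_map_t *rec))
{
    int i;

    for (i = 0; ; ) {
        callback(&node_space[i]);
        if ((i = node_space[i].next) == 0)
            break;
    }
}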