Example #1
void pty_thread_create(srun_job_t *job)
{
	slurm_addr_t pty_addr;
	pthread_attr_t attr;

	if ((job->pty_fd = slurm_init_msg_engine_port(0)) < 0) {
		error("init_msg_engine_port: %m");
		return;
	}
	if (slurm_get_stream_addr(job->pty_fd, &pty_addr) < 0) {
		error("slurm_get_stream_addr: %m");
		return;
	}
	job->pty_port = ntohs(((struct sockaddr_in *) &pty_addr)->sin_port);
	debug2("initialized job control port %hu", job->pty_port);

	slurm_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
	if ((pthread_create(&job->pty_id, &attr, &_pty_thread, (void *) job))) {
		job->pty_id = 0;
		error("pthread_create(pty_thread): %m");
	}
	slurm_attr_destroy(&attr);
}
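
Note on the port lookup above: the bound port must be read back through a pointer to the address structure. Below is a minimal standalone sketch of the same steps using plain POSIX sockets instead of the slurm_init_msg_engine_port()/slurm_get_stream_addr() wrappers; the helper name ephemeral_listen_port() is invented for illustration.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Bind a TCP socket to a kernel-chosen port and return that port (0 on error). */
static unsigned short ephemeral_listen_port(int *fd_out)
{
	struct sockaddr_in sin;
	socklen_t len = sizeof(sin);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 0;
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	sin.sin_port = 0;		/* port 0: the kernel picks one */
	if (bind(fd, (struct sockaddr *) &sin, len) < 0 ||
	    listen(fd, 16) < 0 ||
	    getsockname(fd, (struct sockaddr *) &sin, &len) < 0) {
		close(fd);
		return 0;
	}
	*fd_out = fd;
	return ntohs(sin.sin_port);	/* convert to host byte order */
}
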
Example #2
File: req.c Project: VURM/slurm
int
msg_thr_create(slurmd_job_t *job)
{
	int fd;
	eio_obj_t *eio_obj;
	pthread_attr_t attr;
	int rc = SLURM_SUCCESS, retries = 0;
	errno = 0;
	fd = _domain_socket_create(conf->spooldir, conf->node_name,
				   job->jobid, job->stepid);
	if (fd == -1)
		return SLURM_ERROR;

	fd_set_nonblocking(fd);

	eio_obj = eio_obj_create(fd, &msg_socket_ops, (void *)job);
	job->msg_handle = eio_handle_create();
	eio_new_initial_obj(job->msg_handle, eio_obj);

	slurm_attr_init(&attr);

	while (pthread_create(&job->msgid, &attr,
			      &_msg_thr_internal, (void *)job)) {
		error("msg_thr_create: pthread_create error %m");
		if (++retries > MAX_RETRIES) {
			error("msg_thr_create: Can't create pthread");
			rc = SLURM_ERROR;
			break;
		}
		usleep(10);	/* sleep and retry */
	}

	slurm_attr_destroy(&attr);

	return rc;
}
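
The retry loop around pthread_create() used here (and in most of the examples below) can be distilled into a small helper. This is only a sketch of the pattern with generic names (create_thread_retry, MAX_RETRIES), not a Slurm API:

#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define MAX_RETRIES 3

/* Retry pthread_create() a few times before giving up; transient EAGAIN
 * (resource exhaustion) is the usual reason a retry succeeds. */
static int create_thread_retry(pthread_t *tid, void *(*fn)(void *), void *arg)
{
	int rc, retries = 0;

	while ((rc = pthread_create(tid, NULL, fn, arg))) {
		fprintf(stderr, "pthread_create: %s\n", strerror(rc));
		if (++retries > MAX_RETRIES)
			return -1;
		usleep(10000);		/* back off briefly, then retry */
	}
	return 0;
}
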
Example #3
File: agent.c Project: alepharchives/slurm
/* _spawn_retry_agent - pthread_create an agent for the given task */
static void _spawn_retry_agent(agent_arg_t * agent_arg_ptr)
{
	int retries = 0;
	pthread_attr_t attr_agent;
	pthread_t thread_agent;

	if (agent_arg_ptr == NULL)
		return;

	debug2("Spawning RPC agent for msg_type %u",
	       agent_arg_ptr->msg_type);
	slurm_attr_init(&attr_agent);
	if (pthread_attr_setdetachstate(&attr_agent,
					PTHREAD_CREATE_DETACHED))
		error("pthread_attr_setdetachstate error %m");
	while (pthread_create(&thread_agent, &attr_agent,
			agent, (void *) agent_arg_ptr)) {
		error("pthread_create error %m");
		if (++retries > MAX_RETRIES)
			fatal("Can't create pthread");
		usleep(10000);	/* sleep and retry */
	}
	slurm_attr_destroy(&attr_agent);
}
Example #4
int main(int argc, char *argv[])
{
	log_options_t log_opts = LOG_OPTS_INITIALIZER;
	char *features, *save_ptr = NULL, *tok;
	update_node_msg_t node_msg;
	int rc = SLURM_SUCCESS;
	hostlist_t hl = NULL;
	char *node_name;
	pthread_attr_t attr_work;
	pthread_t thread_work = 0;

	prog_name = argv[0];
	_read_config();
	log_opts.stderr_level = LOG_LEVEL_QUIET;
	log_opts.syslog_level = LOG_LEVEL_QUIET;
	if (slurm_get_debug_flags() & DEBUG_FLAG_NODE_FEATURES)
		log_opts.logfile_level += 3;
	(void) log_init(argv[0], log_opts, LOG_DAEMON, log_file);

	/* Parse the MCDRAM and NUMA boot options */
	if (argc == 3) {
		features = xstrdup(argv[2]);
		tok = strtok_r(features, ",", &save_ptr);
		while (tok) {
			printf("%s\n", tok);
			if (!strcasecmp(tok, "a2a")  ||
			    !strcasecmp(tok, "hemi") ||
			    !strcasecmp(tok, "quad") ||
			    !strcasecmp(tok, "snc2") ||
			    !strcasecmp(tok, "snc4")) {
				xfree(mcdram_mode);
				mcdram_mode = xstrdup(tok);
			} else if (!strcasecmp(tok, "cache")  ||
				   !strcasecmp(tok, "equal") ||
				   !strcasecmp(tok, "flat")) {
				xfree(numa_mode);
				numa_mode = xstrdup(tok);
			}
			tok = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(features);
	}

	/* Spawn threads to change MCDRAM and NUMA states and start node
	 * reboot process */
	if ((hl = hostlist_create(argv[1])) == NULL) {
		error("%s: Invalid hostlist (%s)", prog_name, argv[1]);
		exit(2);
	}
	node_bitmap = bit_alloc(100000);
	while ((node_name = hostlist_pop(hl))) {
		slurm_mutex_lock(&thread_cnt_mutex);
		while (1) {
			if (thread_cnt <= MAX_THREADS) {
				thread_cnt++;
				break;
			} else {	/* wait for state change and retry */
				pthread_cond_wait(&thread_cnt_cond,
						  &thread_cnt_mutex);
			}
		}
		slurm_mutex_unlock(&thread_cnt_mutex);

		slurm_attr_init(&attr_work);
		(void) pthread_attr_setdetachstate
			(&attr_work, PTHREAD_CREATE_DETACHED);
		if (pthread_create(&thread_work, &attr_work, _node_update,
				   (void *) node_name)) {
			_node_update((void *) node_name);
		}
		slurm_attr_destroy(&attr_work);
	}

	/* Wait for work threads to complete */
	slurm_mutex_lock(&thread_cnt_mutex);
	while (1) {
		if (thread_cnt == 0)
			break;
		else	/* wait for state change and retry */
			pthread_cond_wait(&thread_cnt_cond, &thread_cnt_mutex);
	}
	slurm_mutex_unlock(&thread_cnt_mutex);
	hostlist_destroy(hl);
	xfree(mcdram_mode);
	xfree(numa_mode);

	/* Wait for all nodes to change state to "on" */
	_wait_all_nodes_on();

	if ((argc == 3) && !syscfg_path) {
		slurm_init_update_node_msg(&node_msg);
		node_msg.node_names = argv[1];
		node_msg.features_act = argv[2];
		rc = slurm_update_node(&node_msg);
	}

	if (rc == SLURM_SUCCESS) {
		exit(0);
	} else {
		error("%s: slurm_update_node(\'%s\', \'%s\'): %s\n",
		      prog_name, argv[1], argv[2],
		      slurm_strerror(slurm_get_errno()));
		exit(1);
	}
}
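
The thread_cnt/thread_cnt_cond logic above is a condition-variable throttle that caps how many worker threads run at once. A minimal sketch of the same idea, with hypothetical names (worker_slot_acquire, worker_slot_release, MAX_WORKERS):

#include <pthread.h>

#define MAX_WORKERS 8

static pthread_mutex_t worker_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  worker_cond = PTHREAD_COND_INITIALIZER;
static int worker_cnt = 0;

/* Block until a worker slot is free, then claim it (called before spawning). */
static void worker_slot_acquire(void)
{
	pthread_mutex_lock(&worker_lock);
	while (worker_cnt >= MAX_WORKERS)
		pthread_cond_wait(&worker_cond, &worker_lock);
	worker_cnt++;
	pthread_mutex_unlock(&worker_lock);
}

/* Called by each worker as it exits: release the slot and wake one waiter. */
static void worker_slot_release(void)
{
	pthread_mutex_lock(&worker_lock);
	worker_cnt--;
	pthread_cond_signal(&worker_cond);
	pthread_mutex_unlock(&worker_lock);
}
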
Example #5
File: forward.c Project: artpol84/slurm
static void _forward_msg_internal(hostlist_t hl, hostlist_t* sp_hl,
				  forward_struct_t *fwd_struct,
				  header_t *header, int timeout,
				  int hl_count)
{
	int j;
	forward_msg_t *fwd_msg = NULL;
	char *buf = NULL, *tmp_char = NULL;
	pthread_attr_t attr_agent;
	pthread_t thread_agent;

	if (timeout <= 0)
		/* convert secs to msec */
		timeout  = slurm_get_msg_timeout() * 1000;

	for (j = 0; j < hl_count; j++) {
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		fwd_msg = xmalloc(sizeof(forward_msg_t));

		fwd_msg->fwd_struct = fwd_struct;

		fwd_msg->timeout = timeout;

		memcpy(&fwd_msg->header.orig_addr,
		       &header->orig_addr,
		       sizeof(slurm_addr_t));

		fwd_msg->header.version = header->version;
		fwd_msg->header.flags = header->flags;
		fwd_msg->header.msg_type = header->msg_type;
		fwd_msg->header.body_length = header->body_length;
		fwd_msg->header.ret_list = NULL;
		fwd_msg->header.ret_cnt = 0;

		if (sp_hl) {
			buf = hostlist_ranged_string_xmalloc(sp_hl[j]);
			hostlist_destroy(sp_hl[j]);
		} else {
			tmp_char = hostlist_shift(hl);
			buf = xstrdup(tmp_char);
			free(tmp_char);
		}

		forward_init(&fwd_msg->header.forward, NULL);
		fwd_msg->header.forward.nodelist = buf;
		while (pthread_create(&thread_agent, &attr_agent,
				     _forward_thread,
				     (void *)fwd_msg)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			usleep(100000);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
	}
}
Example #6
extern int acct_gather_profile_startpoll(char *freq, char *freq_def)
{
	int retval = SLURM_SUCCESS;
	pthread_attr_t attr;
	int i;
	uint32_t profile = ACCT_GATHER_PROFILE_NOT_SET;

	if (acct_gather_profile_init() < 0)
		return SLURM_ERROR;

	if (acct_gather_profile_running) {
		error("acct_gather_profile_startpoll: poll already started!");
		return retval;
	}
	acct_gather_profile_running = true;

	(*(ops.get))(ACCT_GATHER_PROFILE_RUNNING, &profile);
	xassert(profile != ACCT_GATHER_PROFILE_NOT_SET);

	for (i=0; i < PROFILE_CNT; i++) {
		memset(&acct_gather_profile_timer[i], 0,
		       sizeof(acct_gather_profile_timer_t));
		pthread_cond_init(&acct_gather_profile_timer[i].notify, NULL);
		slurm_mutex_init(&acct_gather_profile_timer[i].notify_mutex);

		switch (i) {
		case PROFILE_ENERGY:
			if (!(profile & ACCT_GATHER_PROFILE_ENERGY))
				break;
			_set_freq(i, freq, freq_def);

			acct_gather_energy_startpoll(
				acct_gather_profile_timer[i].freq);
			break;
		case PROFILE_TASK:
			/* Always set up the task profiling (always first)
			   since it is used to control memory
			   consumption and such.  It will check the
			   profile setting inside its plugin.
			*/
			_set_freq(i, freq, freq_def);

			jobacct_gather_startpoll(
				acct_gather_profile_timer[i].freq);

			break;
		case PROFILE_FILESYSTEM:
			if (!(profile & ACCT_GATHER_PROFILE_LUSTRE))
				break;
			_set_freq(i, freq, freq_def);

			acct_gather_filesystem_startpoll(
				acct_gather_profile_timer[i].freq);
			break;
		case PROFILE_NETWORK:
			if (!(profile & ACCT_GATHER_PROFILE_NETWORK))
				break;
			_set_freq(i, freq, freq_def);

			acct_gather_infiniband_startpoll(
				acct_gather_profile_timer[i].freq);
			break;
		default:
			fatal("Unhandled profile option %d please update "
			      "slurm_acct_gather_profile.c "
			      "(acct_gather_profile_startpoll)", i);
		}
	}

	/* create polling thread */
	slurm_attr_init(&attr);
	if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED))
		error("pthread_attr_setdetachstate error %m");

	if  (pthread_create(&timer_thread_id, &attr,
			    &_timer_thread, NULL)) {
		debug("acct_gather_profile_startpoll failed to create "
		      "_timer_thread: %m");
	} else
		debug3("acct_gather_profile_startpoll dynamic logging enabled");
	slurm_attr_destroy(&attr);

	return retval;
}
Example #7
File: slurmdbd.c Project: lindenb/slurm
/* main - slurmctld main function, start various threads and process RPCs */
int main(int argc, char *argv[])
{
	pthread_attr_t thread_attr;
	char node_name[128];
	void *db_conn = NULL;
	assoc_init_args_t assoc_init_arg;

	_init_config();
	log_init(argv[0], log_opts, LOG_DAEMON, NULL);
	if (read_slurmdbd_conf())
		exit(1);
	_parse_commandline(argc, argv);
	_update_logging(true);
	_update_nice();

	if (slurm_auth_init(NULL) != SLURM_SUCCESS) {
		fatal("Unable to initialize %s authentication plugin",
		      slurmdbd_conf->auth_type);
	}
	if (slurm_acct_storage_init(NULL) != SLURM_SUCCESS) {
		fatal("Unable to initialize %s accounting storage plugin",
		      slurmdbd_conf->storage_type);
	}
	_kill_old_slurmdbd();
	if (foreground == 0)
		_daemonize();

	/*
	 * Need to create pidfile here in case we setuid() below
	 * (init_pidfile() exits if it can't initialize pid file).
	 * On Linux we also need to make this setuid job explicitly
	 * able to write a core dump.
	 * This also has to happen after daemon(), which closes all fd's,
	 * so we keep the write lock of the pidfile.
	 */
	_init_pidfile();
	_become_slurm_user();
	if (foreground == 0)
		_set_work_dir();
	log_config();

#ifdef PR_SET_DUMPABLE
	if (prctl(PR_SET_DUMPABLE, 1) < 0)
		debug ("Unable to set dumpable to 1");
#endif /* PR_SET_DUMPABLE */

	if (xsignal_block(dbd_sigarray) < 0)
		error("Unable to block signals");

	/* Create attached thread for signal handling */
	slurm_attr_init(&thread_attr);
	if (pthread_create(&signal_handler_thread, &thread_attr,
			   _signal_handler, NULL))
		fatal("pthread_create %m");
	slurm_attr_destroy(&thread_attr);

	registered_clusters = list_create(NULL);

	slurm_attr_init(&thread_attr);
	if (pthread_create(&commit_handler_thread, &thread_attr,
			   _commit_handler, NULL))
		fatal("pthread_create %m");
	slurm_attr_destroy(&thread_attr);

	memset(&assoc_init_arg, 0, sizeof(assoc_init_args_t));

	/* If we are tracking wckeys we need to cache them;
	   if we aren't, only cache the users and QOS */
	assoc_init_arg.cache_level = ASSOC_MGR_CACHE_USER | ASSOC_MGR_CACHE_QOS;
	if (slurmdbd_conf->track_wckey)
		assoc_init_arg.cache_level |= ASSOC_MGR_CACHE_WCKEY;

	db_conn = acct_storage_g_get_connection(NULL, 0, true, NULL);
	if (assoc_mgr_init(db_conn, &assoc_init_arg, errno) == SLURM_ERROR) {
		error("Problem getting cache of data");
		acct_storage_g_close_connection(&db_conn);
		goto end_it;
	}

	if (gethostname_short(node_name, sizeof(node_name)))
		fatal("getnodename: %m");

	while (1) {
		if (slurmdbd_conf->dbd_backup &&
		    (!strcmp(node_name, slurmdbd_conf->dbd_backup) ||
		     !strcmp(slurmdbd_conf->dbd_backup, "localhost"))) {
			info("slurmdbd running in background mode");
			have_control = false;
			backup = true;
			/* make sure any locks are released */
			acct_storage_g_commit(db_conn, 1);
			run_dbd_backup();
			if (!shutdown_time)
				assoc_mgr_refresh_lists(db_conn);
		} else if (slurmdbd_conf->dbd_host &&
			   (!strcmp(slurmdbd_conf->dbd_host, node_name) ||
			    !strcmp(slurmdbd_conf->dbd_host, "localhost"))) {
			backup = false;
			have_control = true;
		} else {
			fatal("This host not configured to run SlurmDBD "
			      "(%s != %s | (backup) %s)",
			      node_name, slurmdbd_conf->dbd_host,
			      slurmdbd_conf->dbd_backup);
		}

		if (!shutdown_time) {
			/* Create attached thread to process incoming RPCs */
			slurm_attr_init(&thread_attr);
			if (pthread_create(&rpc_handler_thread, &thread_attr,
					   rpc_mgr, NULL))
				fatal("pthread_create error %m");
			slurm_attr_destroy(&thread_attr);
		}

		if (!shutdown_time) {
			/* Create attached thread to do usage rollup */
			slurm_attr_init(&thread_attr);
			if (pthread_create(&rollup_handler_thread,
					   &thread_attr,
					   _rollup_handler, db_conn))
				fatal("pthread_create error %m");
			slurm_attr_destroy(&thread_attr);
		}

		/* Daemon is fully operational here */
		if (!shutdown_time || primary_resumed) {
			shutdown_time = 0;
			info("slurmdbd version %s started",
			     SLURM_VERSION_STRING);
			if (backup)
				run_dbd_backup();
		}

		_request_registrations(db_conn);
		acct_storage_g_commit(db_conn, 1);

		/* this is only run if not backup */
		if (rollup_handler_thread)
			pthread_join(rollup_handler_thread, NULL);
		if (rpc_handler_thread)
			pthread_join(rpc_handler_thread, NULL);

		if (backup && primary_resumed) {
			shutdown_time = 0;
			info("Backup has given up control");
		}

		if (shutdown_time)
			break;
	}
	/* Daemon termination handled here */

end_it:

	if (signal_handler_thread)
		pthread_join(signal_handler_thread, NULL);
	if (commit_handler_thread)
		pthread_join(commit_handler_thread, NULL);

	acct_storage_g_commit(db_conn, 1);
	acct_storage_g_close_connection(&db_conn);

	if (slurmdbd_conf->pid_file &&
	    (unlink(slurmdbd_conf->pid_file) < 0)) {
		verbose("Unable to remove pidfile '%s': %m",
			slurmdbd_conf->pid_file);
	}

	FREE_NULL_LIST(registered_clusters);

	assoc_mgr_fini(NULL);
	slurm_acct_storage_fini();
	slurm_auth_fini();
	log_fini();
	free_slurmdbd_conf();
	exit(0);
}
Example #8
File: allocate.c Project: HPCNow/slurm
static int _fed_job_will_run(job_desc_msg_t *req,
			     will_run_response_msg_t **will_run_resp,
			     slurmdb_federation_rec_t *fed)
{
	List resp_msg_list;
	int pthread_count = 0, i;
	pthread_t *load_thread = 0;
	load_willrun_req_struct_t *load_args;
	pthread_attr_t load_attr;
	ListIterator iter;
	will_run_response_msg_t *earliest_resp = NULL;
	load_willrun_resp_struct_t *tmp_resp;
	slurmdb_cluster_rec_t *cluster;

	xassert(req);
	xassert(will_run_resp);

	slurm_attr_init(&load_attr);

	*will_run_resp = NULL;

	/* Spawn one pthread per cluster to collect job information */
	resp_msg_list = list_create(NULL);
	load_thread = xmalloc(sizeof(pthread_t) *
			      list_count(fed->cluster_list));
	iter = list_iterator_create(fed->cluster_list);
	while ((cluster = (slurmdb_cluster_rec_t *)list_next(iter))) {
		int retries = 0;
		if ((cluster->control_host == NULL) ||
		    (cluster->control_host[0] == '\0'))
			continue;	/* Cluster down */

		load_args = xmalloc(sizeof(load_willrun_req_struct_t));
		load_args->cluster       = cluster;
		load_args->req           = req;
		load_args->resp_msg_list = resp_msg_list;
		while (pthread_create(&load_thread[pthread_count], &load_attr,
				      _load_willrun_thread, (void *)load_args)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			usleep(10000);	/* sleep and retry */
		}
		pthread_count++;
	}
	list_iterator_destroy(iter);
	slurm_attr_destroy(&load_attr);

	/* Wait for all pthreads to complete */
	for (i = 0; i < pthread_count; i++)
		pthread_join(load_thread[i], NULL);
	xfree(load_thread);

	iter = list_iterator_create(resp_msg_list);
	while ((tmp_resp = (load_willrun_resp_struct_t *)list_next(iter))) {
		if (!tmp_resp->willrun_resp_msg)
			slurm_seterrno(tmp_resp->rc);
		else if ((!earliest_resp) ||
			 (tmp_resp->willrun_resp_msg->start_time <
			  earliest_resp->start_time)) {
			slurm_free_will_run_response_msg(earliest_resp);
			earliest_resp = tmp_resp->willrun_resp_msg;
			tmp_resp->willrun_resp_msg = NULL;
		}

		slurm_free_will_run_response_msg(tmp_resp->willrun_resp_msg);
		xfree(tmp_resp);
	}
	list_iterator_destroy(iter);
	FREE_NULL_LIST(resp_msg_list);

	*will_run_resp = earliest_resp;

	if (!earliest_resp)
		return SLURM_FAILURE;

	return SLURM_SUCCESS;
}
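
This example spawns one joinable thread per cluster and joins them all before walking the response list. A stripped-down sketch of that spawn-then-join shape, with a made-up worker (collect_one) standing in for the per-cluster request:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical worker: fills its own slot in a shared results array. */
static void *collect_one(void *arg)
{
	int *slot = arg;
	*slot = 42;			/* stand-in for a per-cluster request */
	return NULL;
}

/* Spawn n joinable threads, join every one, then read the results. */
static void collect_all(int n)
{
	pthread_t *tids = calloc(n, sizeof(pthread_t));
	int *results = calloc(n, sizeof(int));
	int i, started = 0;

	if (!tids || !results) {
		free(tids);
		free(results);
		return;
	}
	for (i = 0; i < n; i++) {
		if (pthread_create(&tids[i], NULL, collect_one, &results[i]))
			break;		/* join only the threads that started */
		started++;
	}
	for (i = 0; i < started; i++)
		pthread_join(tids[i], NULL);
	for (i = 0; i < started; i++)
		printf("worker %d -> %d\n", i, results[i]);
	free(tids);
	free(results);
}
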
Example #9
/* block_state_mutex should be unlocked before calling this */
extern int free_block_list(uint32_t job_id, List track_list,
			   bool destroy, bool wait)
{
	bg_record_t *bg_record = NULL;
	int retries;
	ListIterator itr = NULL;
	bg_free_block_list_t *bg_free_list;
	pthread_attr_t attr_agent;
	pthread_t thread_agent;

	if (!track_list || !list_count(track_list))
		return SLURM_SUCCESS;

	bg_free_list = xmalloc(sizeof(bg_free_block_list_t));
	bg_free_list->track_list = list_create(NULL);
	bg_free_list->destroy = destroy;
	bg_free_list->job_id = job_id;

	slurm_mutex_lock(&block_state_mutex);
	list_transfer(bg_free_list->track_list, track_list);
	itr = list_iterator_create(bg_free_list->track_list);
	while ((bg_record = list_next(itr))) {
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was already destroyed %p", bg_record);
			continue;
		}
		bg_record->free_cnt++;

		if (bg_record->job_ptr
		    && !IS_JOB_FINISHED(bg_record->job_ptr)) {
			info("We are freeing a block (%s) that has job %u(%u).",
			     bg_record->bg_block_id,
			     bg_record->job_ptr->job_id,
			     bg_record->job_running);
			/* This is not thread safe if called from
			   bg_job_place.c anywhere from within
			   submit_job() or at startup. */
			slurm_mutex_unlock(&block_state_mutex);
			bg_requeue_job(bg_record->job_ptr->job_id, 0);
			slurm_mutex_lock(&block_state_mutex);
		}
		if (remove_from_bg_list(bg_lists->job_running, bg_record)
		    == SLURM_SUCCESS)
			num_unused_cpus += bg_record->cpu_cnt;
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (wait) {
		/* Track_freeing_blocks waits until the list is done
		   and frees the memory of bg_free_list.
		*/
		_track_freeing_blocks(bg_free_list);
		return SLURM_SUCCESS;
	}

	/* _track_freeing_blocks handles cleanup */
	slurm_attr_init(&attr_agent);
	if (pthread_attr_setdetachstate(&attr_agent, PTHREAD_CREATE_DETACHED))
		error("pthread_attr_setdetachstate error %m");
	retries = 0;
	while (pthread_create(&thread_agent, &attr_agent,
			      _track_freeing_blocks,
			      bg_free_list)) {
		error("pthread_create error %m");
		if (++retries > MAX_PTHREAD_RETRIES)
			fatal("Can't create "
			      "pthread");
		/* sleep and retry */
		usleep(1000);
	}
	slurm_attr_destroy(&attr_agent);
	return SLURM_SUCCESS;
}
Example #10
/*
 * init() is called when the plugin is loaded, before any other functions
 * are called.  Put global initialization here.
 */
int init ( void )
{
	pthread_attr_t thread_attr;
	char *temp = NULL;

	/* This means we aren't running from the controller so skip setup. */
	if (cluster_cpus == NO_VAL)
		return SLURM_SUCCESS;

	_internal_setup();

	/* Check to see if we are running a supported accounting plugin */
	temp = slurm_get_accounting_storage_type();
	if (strcasecmp(temp, "accounting_storage/slurmdbd")
	    && strcasecmp(temp, "accounting_storage/mysql")) {
		error("You are not running a supported "
		      "accounting_storage plugin\n(%s).\n"
		      "Fairshare can only be calculated with either "
		      "'accounting_storage/slurmdbd' "
		      "or 'accounting_storage/mysql' enabled.  "
		      "If you want multifactor priority without fairshare "
		      "ignore this message.",
		      temp);
		calc_fairshare = 0;
		weight_fs = 0;
	} else if (assoc_mgr_root_assoc) {
		if (!cluster_cpus)
			fatal("We need to have a cluster cpu count "
			      "before we can init the priority/multifactor "
			      "plugin");
		assoc_mgr_root_assoc->usage->usage_efctv = 1.0;
		slurm_attr_init(&thread_attr);
		if (pthread_create(&decay_handler_thread, &thread_attr,
				   _decay_thread, NULL))
			fatal("pthread_create error %m");

		/* This is here to join the decay thread so we don't core
		   dump if in the sleep, since there is no other place to join
		   we have to create another thread to do it.
		*/
		slurm_attr_init(&thread_attr);
		if (pthread_create(&cleanup_handler_thread, &thread_attr,
				   _cleanup_thread, NULL))
			fatal("pthread_create error %m");

		slurm_attr_destroy(&thread_attr);
	} else {
		if (weight_fs)
			fatal("It appears you don't have any association "
			      "data from your database.  "
			      "The priority/multifactor plugin requires "
			      "this information to run correctly.  Please "
			      "check your database connection and try again.");

		calc_fairshare = 0;
	}

	xfree(temp);

	debug("%s loaded", plugin_name);
	return SLURM_SUCCESS;
}
Example #11
/*
 * basil_request - issue BASIL request and parse response
 * @bp:	method-dependent parse data to guide the parsing process
 *
 * Returns 0 if ok, a negative %basil_error otherwise.
 */
int basil_request(struct basil_parse_data *bp)
{
	int to_child, from_child;
	int ec, i, rc = -BE_UNKNOWN;
	FILE *apbasil;
	pid_t pid = -1;
	pthread_t thread;
	pthread_attr_t attr;
	int time_it_out = 1;
	DEF_TIMERS;

	if (log_sel == -1)
		_init_log_config();

	if (!cray_conf->apbasil) {
		error("No alps client defined");
		return 0;
	}

	if ((cray_conf->apbasil_timeout == 0) ||
	    (cray_conf->apbasil_timeout == (uint16_t) NO_VAL)) {
		debug2("No ApbasilTimeout configured (%u)",
		       cray_conf->apbasil_timeout);
		time_it_out = 0;
	} else {
		slurm_attr_init(&attr);
		pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
	}

	assert(bp->version < BV_MAX);
	assert(bp->method > BM_none && bp->method < BM_MAX);

	START_TIMER;
	for (i = 0; ((i < 10) && (pid < 0)); i++) {
		if (i)
			usleep(100000);
		pid = popen2(cray_conf->apbasil, &to_child, &from_child, true);
	}
	if (pid < 0)
		fatal("popen2(\"%s\", ...)", cray_conf->apbasil);

	if (time_it_out) {
		pthread_create(&thread, &attr, _timer_func, (void*)&pid);
	}

	/* write out request */
	apbasil = fdopen(to_child, "w");
	if (apbasil == NULL)
		fatal("fdopen(): %s", strerror(errno));
	setlinebuf(apbasil);

	_write_xml(apbasil, "<?xml version=\"1.0\"?>\n"
		   "<BasilRequest protocol=\"%s\" method=\"%s\" ",
		   bv_names[bp->version], bm_names[bp->method]);

	switch (bp->method) {
	case BM_engine:
		_write_xml(apbasil, "type=\"ENGINE\"/>");
		break;
	case BM_inventory:
		_write_xml(apbasil, "type=\"INVENTORY\"/>");
		break;
	case BM_reserve:
		_write_xml(apbasil, ">\n");
		_rsvn_write_reserve_xml(apbasil, bp->mdata.res, bp->version);
		break;
	case BM_confirm:
		if (bp->version == BV_1_0 && *bp->mdata.res->batch_id != '\0')
			_write_xml(apbasil, "job_name=\"%s\" ",
				   bp->mdata.res->batch_id);
		_write_xml(apbasil, "reservation_id=\"%u\" %s=\"%llu\"/>\n",
			   bp->mdata.res->rsvn_id,
			   bp->version >= BV_3_1 ? "pagg_id" : "admin_cookie",
			   (unsigned long long)bp->mdata.res->pagg_id);
		break;
	case BM_release:
		_write_xml(apbasil, "reservation_id=\"%u\"/>\n",
			   bp->mdata.res->rsvn_id);
		break;
	case BM_switch:
	{
		char *suspend = bp->mdata.res->suspended ? "OUT" : "IN";
		_write_xml(apbasil, ">\n");
		_write_xml(apbasil, " <ReservationArray>\n");
		_write_xml(apbasil, "  <Reservation reservation_id=\"%u\" "
			   "action=\"%s\"/>\n",
			   bp->mdata.res->rsvn_id, suspend);
		_write_xml(apbasil, " </ReservationArray>\n");
		_write_xml(apbasil, "</BasilRequest>\n");
	}
		break;
	default: /* ignore BM_none, BM_MAX, and BM_UNKNOWN covered above */
		break;
	}

	if (fclose(apbasil) < 0)	/* also closes to_child */
		error("fclose(apbasil): %s", strerror(errno));

	rc = parse_basil(bp, from_child);
	ec = wait_for_child(pid);

	if (time_it_out) {
		slurm_attr_destroy(&attr);
		debug2("Killing the timer thread.");
		pthread_mutex_lock(&timer_lock);
		pthread_cond_broadcast(&timer_cond);
		pthread_mutex_unlock(&timer_lock);
	}

	END_TIMER;
	if (ec) {
		error("%s child process for BASIL %s method exited with %d",
		      cray_conf->apbasil, bm_names[bp->method], ec);
	} else if (DELTA_TIMER > 5000000) {	/* 5 seconds limit */
		info("%s child process for BASIL %s method time %s",
		     cray_conf->apbasil, bm_names[bp->method], TIME_STR);
	}

	return rc;
}
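
The ApbasilTimeout handling above arms a detached timer thread before the apbasil child runs and disarms it afterwards by broadcasting timer_cond. A sketch of that watchdog shape, assuming the parent sets a done flag under the same mutex before broadcasting; every name here is hypothetical, not the actual _timer_func internals:

#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <sys/types.h>
#include <time.h>

static pthread_mutex_t watchdog_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  watchdog_cond = PTHREAD_COND_INITIALIZER;
static int child_done = 0;	/* set under watchdog_lock once the child exits */

struct watchdog_arg {
	pid_t pid;		/* child to signal if it runs too long */
	int   timeout_sec;
};

/* Detached watchdog: wake on broadcast or timeout; kill only on timeout. */
static void *watchdog_thread(void *x)
{
	struct watchdog_arg *w = x;
	struct timespec ts;
	int rc = 0;

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec += w->timeout_sec;

	pthread_mutex_lock(&watchdog_lock);
	while (!child_done && rc != ETIMEDOUT)
		rc = pthread_cond_timedwait(&watchdog_cond, &watchdog_lock, &ts);
	if (!child_done && rc == ETIMEDOUT)
		kill(w->pid, SIGKILL);	/* the child overran its time limit */
	pthread_mutex_unlock(&watchdog_lock);
	return NULL;
}
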
Example #12
static void *_agent(void *x)
{
	struct agent_arg *args = (struct agent_arg *) x;
	kvs_comm_set_t *kvs_set;
	struct msg_arg *msg_args;
	struct kvs_hosts *kvs_host_list;
	int i, j, kvs_set_cnt = 0, host_cnt, pmi_fanout = 32;
	int msg_sent = 0, max_forward = 0;
	char *tmp, *fanout_off_host;
	pthread_t msg_id;
	pthread_attr_t attr;
	DEF_TIMERS;

	tmp = getenv("PMI_FANOUT");
	if (tmp) {
		pmi_fanout = atoi(tmp);
		if (pmi_fanout < 1)
			pmi_fanout = 32;
	}
	fanout_off_host = getenv("PMI_FANOUT_OFF_HOST");

	/* only send one message to each host,
	 * build table of the ports on each host */
	START_TIMER;
	slurm_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
	kvs_set = xmalloc(sizeof(kvs_comm_set_t) * args->barrier_xmit_cnt);
	for (i=0; i<args->barrier_xmit_cnt; i++) {
		if (args->barrier_xmit_ptr[i].port == 0)
			continue;	/* already sent message to host */
		kvs_host_list = xmalloc(sizeof(struct kvs_hosts) * pmi_fanout);
		host_cnt = 0;

		/* This code enables key-pair forwarding between
		 * tasks. First task on the node gets the key-pairs
		 * with host/port information for all other tasks on
		 * that node it should forward the information to. */
		for (j=(i+1); j<args->barrier_xmit_cnt; j++) {
			if (args->barrier_xmit_ptr[j].port == 0)
				continue;	/* already sent message */
			if ((fanout_off_host == NULL) &&
			    strcmp(args->barrier_xmit_ptr[i].hostname,
				   args->barrier_xmit_ptr[j].hostname))
				continue;	/* another host */
			kvs_host_list[host_cnt].task_id = 0; /* not avail */
			kvs_host_list[host_cnt].port =
					args->barrier_xmit_ptr[j].port;
			kvs_host_list[host_cnt].hostname =
					args->barrier_xmit_ptr[j].hostname;
			args->barrier_xmit_ptr[j].port = 0;/* don't reissue */
			host_cnt++;
			if (host_cnt >= pmi_fanout)
				break;
		}

		msg_sent++;
		max_forward = MAX(host_cnt, max_forward);

		slurm_mutex_lock(&agent_mutex);
		while (agent_cnt >= agent_max_cnt)
			pthread_cond_wait(&agent_cond, &agent_mutex);
		agent_cnt++;
		slurm_mutex_unlock(&agent_mutex);

		msg_args = xmalloc(sizeof(struct msg_arg));
		msg_args->bar_ptr = &args->barrier_xmit_ptr[i];
		msg_args->kvs_ptr = &kvs_set[kvs_set_cnt];
		kvs_set[kvs_set_cnt].host_cnt      = host_cnt;
		kvs_set[kvs_set_cnt].kvs_host_ptr  = kvs_host_list;
		kvs_set[kvs_set_cnt].kvs_comm_recs = args->kvs_xmit_cnt;
		kvs_set[kvs_set_cnt].kvs_comm_ptr  = args->kvs_xmit_ptr;
		kvs_set_cnt++;
		if (agent_max_cnt == 1) {
			/* TotalView slows down a great deal for
			 * pthread_create() calls, so just send the
			 * messages inline when TotalView is in use
			 * or for some other reason we only want
			 * one pthread. */
			_msg_thread((void *) msg_args);
		} else if (pthread_create(&msg_id, &attr, _msg_thread,
				(void *) msg_args)) {
			fatal("pthread_create: %m");
		}
	}

	verbose("Sent KVS info to %d nodes, up to %d tasks per node",
		msg_sent, (max_forward+1));

	/* wait for completion of all outgoing messages */
	slurm_mutex_lock(&agent_mutex);
	while (agent_cnt > 0)
		pthread_cond_wait(&agent_cond, &agent_mutex);
	slurm_mutex_unlock(&agent_mutex);
	slurm_attr_destroy(&attr);

	/* Release allocated memory */
	for (i=0; i<kvs_set_cnt; i++)
		xfree(kvs_set[i].kvs_host_ptr);
	xfree(kvs_set);
	for (i=0; i<args->barrier_xmit_cnt; i++)
		xfree(args->barrier_xmit_ptr[i].hostname);
	xfree(args->barrier_xmit_ptr);
	for (i=0; i<args->kvs_xmit_cnt; i++) {
		for (j=0; j<args->kvs_xmit_ptr[i]->kvs_cnt; j++) {
			xfree(args->kvs_xmit_ptr[i]->kvs_keys[j]);
			xfree(args->kvs_xmit_ptr[i]->kvs_values[j]);
		}
		xfree(args->kvs_xmit_ptr[i]->kvs_keys);
		xfree(args->kvs_xmit_ptr[i]->kvs_values);
		xfree(args->kvs_xmit_ptr[i]->kvs_name);
		xfree(args->kvs_xmit_ptr[i]);
	}
	xfree(args->kvs_xmit_ptr);
	xfree(args);

	END_TIMER;
	debug("kvs_xmit time %ld usec", DELTA_TIMER);
	return NULL;
}
Example #13
static int _load_fed_parts(slurm_msg_t *req_msg,
			   partition_info_msg_t **part_info_msg_pptr,
			   uint16_t show_flags, char *cluster_name,
			   slurmdb_federation_rec_t *fed)
{
	int cluster_inx = 0, i;
	load_part_resp_struct_t *part_resp;
	partition_info_msg_t *orig_msg = NULL, *new_msg = NULL;
	uint32_t new_rec_cnt;
	slurmdb_cluster_rec_t *cluster;
	ListIterator iter;
	pthread_attr_t load_attr;
	int pthread_count = 0;
	pthread_t *load_thread = 0;
	load_part_req_struct_t *load_args;
	List resp_msg_list;

	*part_info_msg_pptr = NULL;

	/* Spawn one pthread per cluster to collect partition information */
	resp_msg_list = list_create(NULL);
	load_thread = xmalloc(sizeof(pthread_t) *
			      list_count(fed->cluster_list));
	iter = list_iterator_create(fed->cluster_list);
	while ((cluster = (slurmdb_cluster_rec_t *) list_next(iter))) {
		int retries = 0;
		if ((cluster->control_host == NULL) ||
		    (cluster->control_host[0] == '\0'))
			continue;	/* Cluster down */

		load_args = xmalloc(sizeof(load_part_req_struct_t));
		load_args->cluster = cluster;
		load_args->cluster_inx = cluster_inx++;
		load_args->req_msg = req_msg;
		load_args->resp_msg_list = resp_msg_list;
		load_args->show_flags = show_flags;
		slurm_attr_init(&load_attr);
		if (pthread_attr_setdetachstate(&load_attr,
						PTHREAD_CREATE_JOINABLE))
			error("pthread_attr_setdetachstate error %m");
		while (pthread_create(&load_thread[pthread_count], &load_attr,
				      _load_part_thread, (void *) load_args)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			usleep(10000);	/* sleep and retry */
		}
		pthread_count++;
		slurm_attr_destroy(&load_attr);

	}
	list_iterator_destroy(iter);

	/* Wait for all pthreads to complete */
	for (i = 0; i < pthread_count; i++)
		pthread_join(load_thread[i], NULL);
	xfree(load_thread);

	/* Maintain a consistent cluster/node ordering */
	list_sort(resp_msg_list, _sort_by_cluster_inx);

	/* Merge the responses into a single response message */
	iter = list_iterator_create(resp_msg_list);
	while ((part_resp = (load_part_resp_struct_t *) list_next(iter))) {
		new_msg = part_resp->new_msg;
		if (!orig_msg) {
			orig_msg = new_msg;
			*part_info_msg_pptr = orig_msg;
		} else {
			/* Merge the partition records */
			orig_msg->last_update = MIN(orig_msg->last_update,
						    new_msg->last_update);
			new_rec_cnt = orig_msg->record_count +
				      new_msg->record_count;
			if (new_msg->record_count) {
				orig_msg->partition_array =
					xrealloc(orig_msg->partition_array,
						 sizeof(partition_info_t) *
						 new_rec_cnt);
				(void) memcpy(orig_msg->partition_array +
					      orig_msg->record_count,
					      new_msg->partition_array,
					      sizeof(partition_info_t) *
					      new_msg->record_count);
				orig_msg->record_count = new_rec_cnt;
			}
			xfree(new_msg->partition_array);
			xfree(new_msg);
		}
		xfree(part_resp);
	}
	list_iterator_destroy(iter);
	FREE_NULL_LIST(resp_msg_list);

	if (!orig_msg)
		slurm_seterrno_ret(SLURM_ERROR);

	return SLURM_PROTOCOL_SUCCESS;
}
Example #14
static void _xlate_before(char *depend, uint32_t submit_uid, uint32_t my_job_id)
{
	uint32_t job_id;
	char *last_ptr = NULL, *new_dep = NULL, *tok, *type;
	struct job_record *job_ptr;
	pthread_attr_t attr;
	pthread_t dep_thread;

	tok = strtok_r(depend, ":", &last_ptr);
	if (!xstrcmp(tok, "before"))
		type = "after";
	else if (!xstrcmp(tok, "beforeany"))
		type = "afterany";
	else if (!xstrcmp(tok, "beforenotok"))
		type = "afternotok";
	else if (!xstrcmp(tok, "beforeok"))
		type = "afterok";
	else {
		info("%s: discarding invalid job dependency option %s",
		     plugin_type, tok);
		return;
	}

	/* NOTE: We are updating a job record here in order to implement
	 * the depend=before option. We are doing so without the write lock
	 * on the job record, but using a local mutex to prevent multiple
	 * updates on the same job when multiple jobs satisfying the dependency
	 * are being processed at the same time (all with read locks). The
	 * job read lock will prevent anyone else from getting a job write
	 * lock and using a job write lock causes serious performance problems
	 * for slow job_submit plugins. Not an ideal solution, but the best
	 * option that we see. */
	slurm_mutex_lock(&depend_mutex);
	tok = strtok_r(NULL, ":", &last_ptr);
	while (tok) {
		job_id = atoi(tok);
		job_ptr = find_job_record(job_id);
		if (!job_ptr) {
			info("%s: discarding invalid job dependency before %s",
			     plugin_type, tok);
		} else if ((submit_uid != job_ptr->user_id) &&
			   !validate_super_user(submit_uid)) {
			error("%s: Security violation: uid %u trying to alter "
			      "job %u belonging to uid %u", 
			      plugin_type, submit_uid, job_ptr->job_id,
			      job_ptr->user_id);
		} else if ((!IS_JOB_PENDING(job_ptr)) ||
			   (job_ptr->details == NULL)) {
			info("%s: discarding job before dependency on "
			     "non-pending job %u",
			     plugin_type, job_ptr->job_id);
		} else {
			if (job_ptr->details->dependency) {
				xstrcat(new_dep, job_ptr->details->dependency);
				xstrcat(new_dep, ",");
			}
			xstrfmtcat(new_dep, "%s:%u", type, my_job_id);
			xfree(job_ptr->details->dependency);
			job_ptr->details->dependency = new_dep;
			new_dep = NULL;
			_decr_depend_cnt(job_ptr);

			slurm_attr_init(&attr);
			pthread_attr_setdetachstate(&attr,
						    PTHREAD_CREATE_DETACHED);
			pthread_create(&dep_thread, &attr, _dep_agent, job_ptr);
			slurm_attr_destroy(&attr);
		}
		tok = strtok_r(NULL, ":", &last_ptr);
	}
	slurm_mutex_unlock(&depend_mutex);
}
Example #15
File: capmc_suspend.c Project: A1ve5/slurm
int main(int argc, char *argv[])
{
	log_options_t log_opts = LOG_OPTS_INITIALIZER;
	hostlist_t hl = NULL;
	char *node_name;
	pthread_attr_t attr_work;
	pthread_t thread_work = 0;

	xstrfmtcat(prog_name, "%s[%u]", argv[0], (uint32_t) getpid());
	_read_config();
	log_opts.stderr_level = LOG_LEVEL_QUIET;
	log_opts.syslog_level = LOG_LEVEL_QUIET;
	if (slurm_get_debug_flags() & DEBUG_FLAG_NODE_FEATURES)
		log_opts.logfile_level += 3;
	(void) log_init(argv[0], log_opts, LOG_DAEMON, log_file);

	/* Attempt to shutdown all nodes in a single capmc call,
	 * attempt to shutdown individual nodes only if that fails. */
	if (_update_all_nodes(argv[1]) != 0) {
		if ((hl = hostlist_create(argv[1])) == NULL) {
			error("%s: Invalid hostlist (%s)", prog_name, argv[1]);
			exit(2);
		}
		while ((node_name = hostlist_pop(hl))) {
			slurm_mutex_lock(&thread_cnt_mutex);
			while (1) {
				if (thread_cnt <= MAX_THREADS) {
					thread_cnt++;
					break;
				} else {   /* wait for state change and retry */
					pthread_cond_wait(&thread_cnt_cond,
							  &thread_cnt_mutex);
				}
			}
			slurm_mutex_unlock(&thread_cnt_mutex);

			slurm_attr_init(&attr_work);
			(void) pthread_attr_setdetachstate
				(&attr_work, PTHREAD_CREATE_DETACHED);
			if (pthread_create(&thread_work, &attr_work,
					    _node_update, (void *) node_name)) {
				_node_update((void *) node_name);
			}
			slurm_attr_destroy(&attr_work);
		}
		hostlist_destroy(hl);
	}

	/* Wait for work threads to complete */
	slurm_mutex_lock(&thread_cnt_mutex);
	while (1) {
		if (thread_cnt == 0)
			break;
		else	/* wait for state change and retry */
			pthread_cond_wait(&thread_cnt_cond, &thread_cnt_mutex);
	}
	slurm_mutex_unlock(&thread_cnt_mutex);

	xfree(prog_name);
	exit(0);
}
Example #16
/*
 * _build_sinfo_data - make a sinfo_data entry for each unique node
 *	configuration and add it to the sinfo_list for later printing.
 * sinfo_list IN/OUT - list of unique sinfo_data records to report
 * partition_msg IN - partition info message
 * node_msg IN - node info message
 * RET zero or error code
 */
static int _build_sinfo_data(List sinfo_list,
			     partition_info_msg_t *partition_msg,
			     node_info_msg_t *node_msg)
{
	pthread_attr_t attr_sinfo;
	pthread_t thread_sinfo;
	build_part_info_t *build_struct_ptr;
	node_info_t *node_ptr = NULL;
	partition_info_t *part_ptr = NULL;
	int j;

	g_node_scaling = node_msg->node_scaling;

	/* by default every partition is shown, even if no nodes */
	if ((!params.node_flag) && params.match_flags.partition_flag) {
		part_ptr = partition_msg->partition_array;
		for (j=0; j<partition_msg->record_count; j++, part_ptr++) {
			if ((!params.partition) ||
			    (_strcmp(params.partition, part_ptr->name) == 0)) {
				list_append(sinfo_list, _create_sinfo(
						    part_ptr, (uint16_t) j,
						    NULL,
						    node_msg->node_scaling));
			}
		}
	}

	if (params.filtering) {
		for (j = 0; j < node_msg->record_count; j++) {
			node_ptr = &(node_msg->node_array[j]);
			if (node_ptr->name && _filter_out(node_ptr))
				xfree(node_ptr->name);
		}
	}

	/* make sinfo_list entries for every node in every partition */
	for (j=0; j<partition_msg->record_count; j++, part_ptr++) {
		part_ptr = &(partition_msg->partition_array[j]);

		if (params.filtering && params.partition &&
		    _strcmp(part_ptr->name, params.partition))
			continue;

		if (node_msg->record_count == 1) { /* node_name_single */
			int pos = -1;
			uint16_t subgrp_size = 0;
			hostlist_t hl;

			node_ptr = &(node_msg->node_array[0]);
			if ((node_ptr->name == NULL) ||
			    (part_ptr->nodes == NULL))
				continue;
			hl = hostlist_create(part_ptr->nodes);
			pos = hostlist_find(hl, node_msg->node_array[0].name);
			hostlist_destroy(hl);
			if (pos < 0)
				continue;
			if (select_g_select_nodeinfo_get(
				   node_ptr->select_nodeinfo,
				   SELECT_NODEDATA_SUBGRP_SIZE,
				   0,
				   &subgrp_size) == SLURM_SUCCESS
			    && subgrp_size) {
				_handle_subgrps(sinfo_list,
						(uint16_t) j,
						part_ptr,
						node_ptr,
						node_msg->
						node_scaling);
			} else {
				_insert_node_ptr(sinfo_list,
						 (uint16_t) j,
						 part_ptr,
						 node_ptr,
						 node_msg->
						 node_scaling);
			}
			continue;
		}

		/* Process each partition using a separate thread */
		build_struct_ptr = xmalloc(sizeof(build_part_info_t));
		build_struct_ptr->node_msg   = node_msg;
		build_struct_ptr->part_num   = (uint16_t) j;
		build_struct_ptr->part_ptr   = part_ptr;
		build_struct_ptr->sinfo_list = sinfo_list;

		slurm_mutex_lock(&sinfo_cnt_mutex);
		sinfo_cnt++;
		slurm_mutex_unlock(&sinfo_cnt_mutex);

		slurm_attr_init(&attr_sinfo);
		if (pthread_attr_setdetachstate
		    (&attr_sinfo, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");
		while (pthread_create(&thread_sinfo, &attr_sinfo,
				      _build_part_info,
				      (void *) build_struct_ptr)) {
			error("pthread_create error %m");
			usleep(10000);	/* sleep and retry */
		}
		slurm_attr_destroy(&attr_sinfo);
	}

	slurm_mutex_lock(&sinfo_cnt_mutex);
	while (sinfo_cnt) {
		pthread_cond_wait(&sinfo_cnt_cond, &sinfo_cnt_mutex);
	}
	slurm_mutex_unlock(&sinfo_cnt_mutex);

	_sort_hostlist(sinfo_list);
	return SLURM_SUCCESS;
}
Example #17
/* Process incoming RPCs. Meant to execute as a pthread */
extern void *rpc_mgr(void *no_data)
{
	pthread_attr_t thread_attr_rpc_req;
	slurm_fd_t sockfd, newsockfd;
	int i, retry_cnt, sigarray[] = {SIGUSR1, 0};
	slurm_addr_t cli_addr;
	slurmdbd_conn_t *conn_arg = NULL;

	slurm_mutex_lock(&thread_count_lock);
	master_thread_id = pthread_self();
	slurm_mutex_unlock(&thread_count_lock);

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	/* threads to process individual RPCs are detached */
	slurm_attr_init(&thread_attr_rpc_req);
	if (pthread_attr_setdetachstate
	    (&thread_attr_rpc_req, PTHREAD_CREATE_DETACHED))
		fatal("pthread_attr_setdetachstate %m");

	/* initialize port for RPCs */
	if ((sockfd = slurm_init_msg_engine_port(get_dbd_port()))
	    == SLURM_SOCKET_ERROR)
		fatal("slurm_init_msg_engine_port error %m");

	/* Prepare to catch SIGUSR1 to interrupt accept().
	 * This signal is generated by the slurmdbd signal
	 * handler thread upon receipt of SIGABRT, SIGINT,
	 * or SIGTERM. That thread does all processing of
	 * all signals. */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sigarray);

	/*
	 * Process incoming RPCs until told to shutdown
	 */
	while ((i = _wait_for_server_thread()) >= 0) {
		/*
		 * accept needed for stream implementation is a no-op in
		 * message implementation that just passes sockfd to newsockfd
		 */
		if ((newsockfd = slurm_accept_msg_conn(sockfd,
						       &cli_addr)) ==
		    SLURM_SOCKET_ERROR) {
			_free_server_thread((pthread_t) 0);
			if (errno != EINTR)
				error("slurm_accept_msg_conn: %m");
			continue;
		}
		fd_set_nonblocking(newsockfd);

		conn_arg = xmalloc(sizeof(slurmdbd_conn_t));
		conn_arg->newsockfd = newsockfd;
		slurm_get_ip_str(&cli_addr, &conn_arg->orig_port,
				 conn_arg->ip, sizeof(conn_arg->ip));
		retry_cnt = 0;
		while (pthread_create(&slave_thread_id[i],
				      &thread_attr_rpc_req,
				      _service_connection,
				      (void *) conn_arg)) {
			if (retry_cnt > 0) {
				error("pthread_create failure, "
				      "aborting RPC: %m");
				close(newsockfd);
				break;
			}
			error("pthread_create failure: %m");
			retry_cnt++;
			usleep(1000);	/* retry in 1 msec */
		}
	}

	debug3("rpc_mgr shutting down");
	slurm_attr_destroy(&thread_attr_rpc_req);
	(void) slurm_shutdown_msg_engine(sockfd);
	_wait_for_thread_fini();
	pthread_exit((void *) 0);
	return NULL;
}
Example #18
File: mpich1_p4.c Project: VURM/slurm
mpi_plugin_client_state_t *
p_mpi_hook_client_prelaunch(mpi_plugin_client_info_t *job, char ***env)
{
	struct sockaddr_in sin;
	pthread_attr_t attr;
	socklen_t len = sizeof(sin);
	short port1, port2;

	debug("Using mpi/mpich1_p4");
	if ((p4_fd1 = socket(PF_INET, SOCK_DGRAM, 0)) < 0) {
		error("socket: %m");
		return NULL;
	}
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = PF_INET;
	if (bind(p4_fd1, (struct sockaddr *) &sin, len) < 0) {
		error("bind: %m");
		return NULL;
	}
	if (getsockname(p4_fd1, (struct sockaddr *) &sin, &len) < 0) {
		error("getsockname: %m");
		return NULL;
	}
	port1 = ntohs(sin.sin_port);

	if ((p4_fd2 = socket(PF_INET, SOCK_STREAM, 0)) < 0) {
		error("socket: %m");
		return NULL;
	}
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = PF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	if (bind(p4_fd2, (struct sockaddr *) &sin, len) < 0) {
		error("bind: %m");
		return NULL;
	}
	if (listen(p4_fd2, 64) < 0)
		error("listen: %m");
	if (getsockname(p4_fd2, (struct sockaddr *) &sin, &len) < 0) {
		error("getsockname: %m");
		return NULL;
	}
	port2 = ntohs(sin.sin_port);

	if (pipe(shutdown_pipe) < 0) {
		error ("pipe: %m");
		return (NULL);
	}
	shutdown_complete = false;
	shutdown_timeout = 5;
	slurm_mutex_init(&shutdown_lock);
	pthread_cond_init(&shutdown_cond, NULL);

	/* Process messages in a separate thread */
	slurm_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
	if (pthread_create(&p4_tid, &attr, &mpich1_thr, NULL)) {
		error("pthread_create: %m");
		slurm_attr_destroy(&attr);
		return NULL;
	}
	slurm_attr_destroy(&attr);
	env_array_overwrite_fmt(env, "SLURM_MPICH_PORT1", "%hu", port1);
	env_array_overwrite_fmt(env, "SLURM_MPICH_PORT2", "%hu", port2);
	debug("mpich_p4 plugin listening on fd=%d,%d ports=%d,%d",
		p4_fd1, p4_fd2, port1, port2);

	/* only return NULL on error */
	return (void *)0xdeadbeef;
}
Example #19
File: req.c Project: VURM/slurm
static int
_msg_socket_accept(eio_obj_t *obj, List objs)
{
	slurmd_job_t *job = (slurmd_job_t *)obj->arg;
	int fd;
	struct sockaddr_un addr;
	int len = sizeof(addr);
	struct request_params *param = NULL;
	pthread_attr_t attr;
	pthread_t id;
	int retries = 0;

	debug3("Called _msg_socket_accept");

	while ((fd = accept(obj->fd, (struct sockaddr *)&addr,
			    (socklen_t *)&len)) < 0) {
		if (errno == EINTR)
			continue;
		if (errno == EAGAIN
		    || errno == ECONNABORTED
		    || errno == EWOULDBLOCK) {
			return SLURM_SUCCESS;
		}
		error("Error on msg accept socket: %m");
		obj->shutdown = true;
		return SLURM_SUCCESS;
	}

	pthread_mutex_lock(&message_lock);
	message_connections++;
	pthread_mutex_unlock(&message_lock);

	fd_set_close_on_exec(fd);
	fd_set_blocking(fd);

	slurm_attr_init(&attr);
	if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) != 0) {
		error("Unable to set detachstate on attr: %m");
		slurm_attr_destroy(&attr);
		close(fd);
		return SLURM_ERROR;
	}

	param = xmalloc(sizeof(struct request_params));
	param->fd = fd;
	param->job = job;
	while (pthread_create(&id, &attr, &_handle_accept, (void *)param)) {
		error("stepd_api message engine pthread_create: %m");
		if (++retries > MAX_RETRIES) {
			error("running handle_accept without "
			      "starting a thread stepd will be "
			      "unresponsive until done");
			_handle_accept((void *)param);
			info("stepd should be responsive now");
			break;
		}
		usleep(10);	/* sleep and retry */
	}

	slurm_attr_destroy(&attr);
	param = NULL;

	debug3("Leaving _msg_socket_accept");
	return SLURM_SUCCESS;
}
Example #20
extern int as_mysql_roll_usage(mysql_conn_t *mysql_conn,
			       time_t sent_start, time_t sent_end,
			       uint16_t archive_data)
{
	int rc = SLURM_SUCCESS;
	int rolledup = 0;
	char *cluster_name = NULL;
	ListIterator itr;
	pthread_mutex_t rolledup_lock = PTHREAD_MUTEX_INITIALIZER;
	pthread_cond_t rolledup_cond;
	//DEF_TIMERS;

	if (check_connection(mysql_conn) != SLURM_SUCCESS)
		return ESLURM_DB_CONNECTION;

	slurm_mutex_lock(&usage_rollup_lock);

	slurm_mutex_init(&rolledup_lock);
	pthread_cond_init(&rolledup_cond, NULL);

	//START_TIMER;
	slurm_mutex_lock(&as_mysql_cluster_list_lock);
	itr = list_iterator_create(as_mysql_cluster_list);
	while ((cluster_name = list_next(itr))) {
		pthread_t rollup_tid;
		pthread_attr_t rollup_attr;
		local_rollup_t *local_rollup = xmalloc(sizeof(local_rollup_t));

		local_rollup->archive_data = archive_data;
		local_rollup->cluster_name = cluster_name;

		local_rollup->mysql_conn = mysql_conn;
		local_rollup->rc = &rc;
		local_rollup->rolledup = &rolledup;
		local_rollup->rolledup_lock = &rolledup_lock;
		local_rollup->rolledup_cond = &rolledup_cond;

		local_rollup->sent_end = sent_end;
		local_rollup->sent_start = sent_start;

		/* _cluster_rollup_usage is responsible for freeing
		   this local_rollup */
		/* If you have many jobs in your system the
		 * _cluster_rollup_usage call takes up a bunch of time
		 * and all the while the as_mysql_cluster_list_lock is
		 * locked.  If a slurmctld is starting up while this
		 * is locked it will hang waiting to get information
		 * from the DBD.  So threading this makes a lot of
		 * sense.  While it only buys a very small victory in
		 * terms of speed, having the
		 * as_mysql_cluster_list_lock lock unlock in a timely
		 * fashion buys a bunch on systems with lots
		 * (millions) of jobs.
		 */
		slurm_attr_init(&rollup_attr);
		if (pthread_create(&rollup_tid, &rollup_attr,
				   _cluster_rollup_usage,
				   (void *)local_rollup))
			fatal("pthread_create: %m");
		slurm_attr_destroy(&rollup_attr);
	}
	slurm_mutex_lock(&rolledup_lock);
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&as_mysql_cluster_list_lock);

	while (rolledup < list_count(as_mysql_cluster_list)) {
		pthread_cond_wait(&rolledup_cond, &rolledup_lock);
		debug2("Got %d rolled up", rolledup);
	}
	slurm_mutex_unlock(&rolledup_lock);
	debug2("Everything rolled up");
	slurm_mutex_destroy(&rolledup_lock);
	pthread_cond_destroy(&rolledup_cond);
	/* END_TIMER; */
	/* info("total time was %s", TIME_STR); */

	slurm_mutex_unlock(&usage_rollup_lock);

	return rc;
}
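
The rolledup counter and rolledup_cond above form a completion latch: every rollup thread bumps the counter and signals, and the caller waits until the count reaches the number of clusters. A generic sketch of that latch (struct and function names are invented for illustration):

#include <pthread.h>

struct latch {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	int done;		/* how many workers have finished */
	int total;		/* how many workers we are waiting for */
};

/* Worker side: count yourself as done and wake the waiter. */
static void latch_count_down(struct latch *l)
{
	pthread_mutex_lock(&l->lock);
	l->done++;
	pthread_cond_signal(&l->cond);
	pthread_mutex_unlock(&l->lock);
}

/* Waiter side: block until every worker has counted down. */
static void latch_wait(struct latch *l)
{
	pthread_mutex_lock(&l->lock);
	while (l->done < l->total)
		pthread_cond_wait(&l->cond, &l->lock);
	pthread_mutex_unlock(&l->lock);
}
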
Example #21
File: mpichmx.c Project: HPCNow/slurm
extern gmpi_state_t *
gmpi_thr_create(const mpi_plugin_client_info_t *job, char ***env)
{
	uint16_t port;
	pthread_attr_t attr;
	gmpi_state_t *st = NULL;

	st = gmpi_state_create(job);

	/*
	 * It is possible for one to modify the mpirun command in
	 * MPICH-GM distribution so that it calls srun, instead of
	 * rsh, for remote process invocations.  In that case, we
	 * should not override envs nor open the master port.
	 */
	if (getenv("GMPI_PORT"))
		return st;

	if (net_stream_listen (&st->fd, &port) < 0) {
		error ("Unable to create GMPI listen port: %m");
		gmpi_state_destroy(st);
		return NULL;
	}

	/*
	 * Accept in a separate thread.
	 */
	slurm_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
	if (pthread_create(&st->tid, &attr, &_gmpi_thr, (void *)st)) {
		slurm_attr_destroy(&attr);
		gmpi_state_destroy(st);
		return NULL;
	}
	slurm_attr_destroy(&attr);

	env_array_overwrite_fmt(env, "GMPI_PORT",  "%hu", port);
	env_array_overwrite_fmt(env, "GMPI_MAGIC", "%u", job->jobid);
	env_array_overwrite_fmt(env, "GMPI_NP",    "%d",
				job->step_layout->task_cnt);
	env_array_overwrite_fmt(env, "GMPI_SHMEM", "1");
	/* FIXME for multi-board config. */
	env_array_overwrite_fmt(env, "GMPI_BOARD", "-1");


	/* For new MX version */
	env_array_overwrite_fmt(env, "MXMPI_PORT",  "%hu", port);
	env_array_overwrite_fmt(env, "MXMPI_MAGIC", "%u", job->jobid);
	env_array_overwrite_fmt(env, "MXMPI_NP",    "%d",
				job->step_layout->task_cnt);
	/* FIXME for multi-board config. */
	env_array_overwrite_fmt(env, "MXMPI_BOARD", "-1");


	/* for MACOSX to override default malloc */
	env_array_overwrite_fmt(env, "DYLD_FORCE_FLAT_NAMESPACE", "1");


	debug("Started GMPI master thread (%lu)", (unsigned long) st->tid);

	return st;
}
Example #22
File: forward.c Project: IFCA/slurm
/*
 * forward_msg        - logic to forward a message which has been received and
 *                      accumulate the return codes from processes getting
 *                      the forwarded message
 *
 * IN: forward_struct - forward_struct_t *   - holds information about message
 *                                             that needs to be forwarded to
 *                                             child processes
 * IN: header         - header_t             - header from message that came in
 *                                             needing to be forwarded.
 * RET: SLURM_SUCCESS - int
 */
extern int forward_msg(forward_struct_t *forward_struct,
		       header_t *header)
{
	int j = 0;
	int retries = 0;
	forward_msg_t *forward_msg = NULL;
	int thr_count = 0;
	int *span = set_span(header->forward.cnt, 0);
	hostlist_t hl = NULL;
	hostlist_t forward_hl = NULL;
	char *name = NULL;

	if (!forward_struct->ret_list) {
		error("didn't get a ret_list from forward_struct");
		xfree(span);
		return SLURM_ERROR;
	}
	hl = hostlist_create(header->forward.nodelist);
	hostlist_uniq(hl);

	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		char *buf = NULL;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		forward_msg = &forward_struct->forward_msg[thr_count];
		forward_msg->ret_list = forward_struct->ret_list;

		forward_msg->timeout = forward_struct->timeout;

		if (forward_msg->timeout <= 0) {
			/* convert secs to msec */
			forward_msg->timeout  = slurm_get_msg_timeout() * 1000;
		}

		forward_msg->notify = &forward_struct->notify;
		forward_msg->forward_mutex = &forward_struct->forward_mutex;
		forward_msg->buf_len = forward_struct->buf_len;
		forward_msg->buf = forward_struct->buf;

		memcpy(&forward_msg->header.orig_addr,
		       &header->orig_addr,
		       sizeof(slurm_addr_t));

		forward_msg->header.version = header->version;
		forward_msg->header.flags = header->flags;
		forward_msg->header.msg_type = header->msg_type;
		forward_msg->header.body_length = header->body_length;
		forward_msg->header.ret_list = NULL;
		forward_msg->header.ret_cnt = 0;

		forward_hl = hostlist_create(name);
		free(name);
		for(j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push(forward_hl, name);
			free(name);
		}

		buf = hostlist_ranged_string_xmalloc(forward_hl);
		hostlist_destroy(forward_hl);
		forward_init(&forward_msg->header.forward, NULL);
		forward_msg->header.forward.nodelist = buf;
		while (pthread_create(&thread_agent, &attr_agent,
				     _forward_thread,
				     (void *)forward_msg)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
		thr_count++;
	}
	hostlist_destroy(hl);
	xfree(span);
	return SLURM_SUCCESS;
}
Example #23
/*
 * The remainder of this file implements the standard SLURM checkpoint API.
 */
extern int slurm_ckpt_op (uint32_t job_id, uint32_t step_id,
			  struct step_record *step_ptr, uint16_t op,
			  uint16_t data, char *image_dir, time_t * event_time,
			  uint32_t *error_code, char **error_msg )
{
	int rc = SLURM_SUCCESS;
	struct check_job_info *check_ptr;
	uint16_t done_sig = 0;
	struct job_record *job_ptr;
	struct node_record *node_ptr;
	pthread_attr_t attr;
	pthread_t ckpt_agent_tid = 0;
	char *nodelist;
	struct ckpt_req *req_ptr;

	/* job/step checked already */
	job_ptr = find_job_record(job_id);
	if (!job_ptr)
		return ESLURM_INVALID_JOB_ID;
	if (step_id == SLURM_BATCH_SCRIPT) {
		check_ptr = (struct check_job_info *)job_ptr->check_job;
		node_ptr = find_first_node_record(job_ptr->node_bitmap);
		nodelist = node_ptr->name;
	} else {
		step_ptr = find_step_record(job_ptr, step_id);
		if (!step_ptr)
			return ESLURM_INVALID_JOB_ID;
		check_ptr = (struct check_job_info *)step_ptr->check_job;
		nodelist = step_ptr->step_layout->node_list;
	}
	xassert(check_ptr);

	switch (op) {
	case CHECK_ABLE:
		if (check_ptr->disabled)
			rc = ESLURM_DISABLED;
		else {
			*event_time = check_ptr->time_stamp;
			rc = SLURM_SUCCESS;
		}
		break;
	case CHECK_DISABLE:
		check_ptr->disabled++;
		break;
	case CHECK_ENABLE:
		check_ptr->disabled--;
		break;
	case CHECK_REQUEUE:
		if (step_id != SLURM_BATCH_SCRIPT) {
			rc = ESLURM_NOT_SUPPORTED;
			break;
		}
		/* no break */
	case CHECK_VACATE:
		done_sig = SIGTERM;
		/* no break */
	case CHECK_CREATE:
		if (check_ptr->disabled) {
			rc = ESLURM_DISABLED;
			break;
		}
		if (check_ptr->time_stamp != 0) {
			rc = EALREADY;
			break;
		}

		check_ptr->time_stamp = time(NULL);
		check_ptr->error_code = 0;
		xfree(check_ptr->error_msg);

		req_ptr = xmalloc(sizeof(struct ckpt_req));
		if (!req_ptr) {
			rc = ENOMEM;
			break;
		}
		req_ptr->gid = job_ptr->group_id;
		req_ptr->uid = job_ptr->user_id;
		req_ptr->job_id = job_id;
		req_ptr->step_id = step_id;
		req_ptr->begin_time = check_ptr->time_stamp;
		req_ptr->wait = data;
		req_ptr->image_dir = xstrdup(image_dir);
		req_ptr->nodelist = xstrdup(nodelist);
		req_ptr->sig_done = done_sig;
		req_ptr->op = op;

		slurm_attr_init(&attr);
		if (pthread_attr_setdetachstate(&attr,
						PTHREAD_CREATE_DETACHED)) {
			error("pthread_attr_setdetachstate: %m");
			rc = errno;
			break;
		}

		if (pthread_create(&ckpt_agent_tid, &attr, _ckpt_agent_thr,
				   req_ptr)) {
			error("pthread_create: %m");
			rc = errno;
			break;
		}
		slurm_attr_destroy(&attr);

		break;

	case CHECK_RESTART:
		if (step_id != SLURM_BATCH_SCRIPT) {
			rc = ESLURM_NOT_SUPPORTED;
			break;
		}
		/* create a batch job from saved desc */
		rc = ESLURM_NOT_SUPPORTED;
		/* TODO: save job script */
		break;

	case CHECK_ERROR:
		xassert(error_code);
		xassert(error_msg);
		*error_code = check_ptr->error_code;
		xfree(*error_msg);
		*error_msg = xstrdup(check_ptr->error_msg);
		break;
	default:
		error("Invalid checkpoint operation: %d", op);
		rc = EINVAL;
	}

	return rc;
}
Example #24
File: forward.c Project: IFCA/slurm
/*
 * start_msg_tree  - logic to begin the forward tree and
 *                   accumulate the return codes from processes getting
 *                   the forwarded message
 *
 * IN: hl          - hostlist_t   - list of every node to send message to
 * IN: msg         - slurm_msg_t  - message to send.
 * IN: timeout     - int          - how long to wait in milliseconds.
 * RET List 	   - List containing the responses of the children
 *		     (if any) we forwarded the message to. List
 *		     containing type (ret_data_info_t).
 */
extern List start_msg_tree(hostlist_t hl, slurm_msg_t *msg, int timeout)
{
	int *span = NULL;
	fwd_tree_t *fwd_tree = NULL;
	pthread_mutex_t tree_mutex;
	pthread_cond_t notify;
	int j = 0, count = 0;
	List ret_list = NULL;
	char *name = NULL;
	int thr_count = 0;
	int host_count = 0;

	xassert(hl);
	xassert(msg);

	hostlist_uniq(hl);
	host_count = hostlist_count(hl);

	span = set_span(host_count, 0);

	slurm_mutex_init(&tree_mutex);
	pthread_cond_init(&notify, NULL);

	ret_list = list_create(destroy_data_info);

	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		fwd_tree->orig_msg = msg;
		fwd_tree->ret_list = ret_list;
		fwd_tree->timeout = timeout;
		fwd_tree->notify = &notify;
		fwd_tree->p_thr_count = &thr_count;
		fwd_tree->tree_mutex = &tree_mutex;

		if (fwd_tree->timeout <= 0) {
			/* convert secs to msec */
			fwd_tree->timeout  = slurm_get_msg_timeout() * 1000;
		}

		fwd_tree->tree_hl = hostlist_create(name);
		free(name);
		for (j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push(fwd_tree->tree_hl, name);
			free(name);
		}

		/*
		 * Lock and increase the thread counter; we need this to
		 * protect the start_msg_tree waiting loop, which was
		 * originally designed around a "while (count < host_count)"
		 * loop. In cases where a fwd thread could not get all the
		 * return codes from its children, that waiting loop would
		 * deadlock.
		 */
		slurm_mutex_lock(&tree_mutex);
		thr_count++;
		slurm_mutex_unlock(&tree_mutex);

		while (pthread_create(&thread_agent, &attr_agent,
				      _fwd_tree_thread, (void *)fwd_tree)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);

	}
	xfree(span);

	slurm_mutex_lock(&tree_mutex);

	count = list_count(ret_list);
	debug2("Tree head got back %d looking for %d", count, host_count);
	while (thr_count > 0) {
		pthread_cond_wait(&notify, &tree_mutex);
		count = list_count(ret_list);
		debug2("Tree head got back %d", count);
	}
	xassert(count >= host_count);	/* Tree head did not get all responses,
					 * but no more active fwd threads!*/
	slurm_mutex_unlock(&tree_mutex);

	slurm_mutex_destroy(&tree_mutex);
	pthread_cond_destroy(&notify);

	return ret_list;
}
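The waiting loop above blocks on the number of live forwarding threads rather than on the number of responses collected, so the tree head cannot hang when some children never answer. Below is a minimal, standalone sketch of that "count live workers, not results" pattern with a mutex/condition pair; the worker body and the simulated missing responses are hypothetical, and only the synchronization mirrors the code above.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define NWORKERS 4

static pthread_mutex_t tree_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  notify     = PTHREAD_COND_INITIALIZER;
static int thr_count;		/* forwarding threads still running */
static int responses;		/* results actually collected */

static void *fwd_worker(void *arg)
{
	int got_answer = (((long) arg) % 2) == 0; /* pretend some children never reply */

	sleep(1);
	pthread_mutex_lock(&tree_mutex);
	if (got_answer)
		responses++;
	thr_count--;			/* always decrement, answer or not */
	pthread_cond_signal(&notify);
	pthread_mutex_unlock(&tree_mutex);
	return NULL;
}

int main(void)
{
	pthread_t tid[NWORKERS];
	long i;

	for (i = 0; i < NWORKERS; i++) {
		pthread_mutex_lock(&tree_mutex);
		thr_count++;		/* count the thread before it can signal */
		pthread_mutex_unlock(&tree_mutex);
		pthread_create(&tid[i], NULL, fwd_worker, (void *) i);
	}

	pthread_mutex_lock(&tree_mutex);
	while (thr_count > 0)		/* wait on live threads, not on responses */
		pthread_cond_wait(&notify, &tree_mutex);
	pthread_mutex_unlock(&tree_mutex);

	printf("all workers done, %d/%d responses\n", responses, NWORKERS);

	for (i = 0; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}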
Example #25
0
File: backup.c Project: bingzhang/slurm
/* run_backup - this is the backup controller; it runs in standby
 *	mode, assuming control when the primary controller stops responding */
void run_backup(slurm_trigger_callbacks_t *callbacks)
{
	int i;
	uint32_t trigger_type;
	time_t last_ping = 0;
	pthread_attr_t thread_attr_sig, thread_attr_rpc;
	slurmctld_lock_t config_read_lock = {
		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	slurmctld_lock_t config_write_lock = {
		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };

	info("slurmctld running in background mode");
	takeover = false;
	last_controller_response = time(NULL);

	/* default: don't resume if shutdown */
	slurmctld_config.resume_backup = false;
	if (xsignal_block(backup_sigarray) < 0)
		error("Unable to block signals");

	/*
	 * create attached thread to process RPCs
	 */
	slurm_attr_init(&thread_attr_rpc);
	while (pthread_create(&slurmctld_config.thread_id_rpc,
			      &thread_attr_rpc, _background_rpc_mgr, NULL)) {
		error("pthread_create error %m");
		sleep(1);
	}
	slurm_attr_destroy(&thread_attr_rpc);

	/*
	 * create attached thread for signal handling
	 */
	slurm_attr_init(&thread_attr_sig);
	while (pthread_create(&slurmctld_config.thread_id_sig,
			      &thread_attr_sig, _background_signal_hand,
			      NULL)) {
		error("pthread_create %m");
		sleep(1);
	}
	slurm_attr_destroy(&thread_attr_sig);
	trigger_type = TRIGGER_TYPE_BU_CTLD_RES_OP;
	_trigger_slurmctld_event(trigger_type);

	for (i = 0; ((i < 5) && (slurmctld_config.shutdown_time == 0)); i++) {
		sleep(1);       /* Give the primary slurmctld set-up time */
	}

	/* repeatedly ping ControlMachine */
	while (slurmctld_config.shutdown_time == 0) {
		sleep(1);
		/* Lock of slurmctld_conf below not important */
		if (slurmctld_conf.slurmctld_timeout &&
		    (takeover == false) &&
		    (difftime(time(NULL), last_ping) <
		     (slurmctld_conf.slurmctld_timeout / 3)))
			continue;

		last_ping = time(NULL);
		if (_ping_controller() == 0)
			last_controller_response = time(NULL);
		else if (takeover) {
			/* in takeover mode, take control as soon as */
			/* the primary no longer responds */
			break;
		} else {
			uint32_t timeout;
			lock_slurmctld(config_read_lock);
			timeout = slurmctld_conf.slurmctld_timeout;
			unlock_slurmctld(config_read_lock);

			if (difftime(time(NULL), last_controller_response) >
			    timeout) {
				break;
			}
		}
	}

	if (slurmctld_config.shutdown_time != 0) {
		/* Since pidfile is created as user root (its owner is
		 *   changed to SlurmUser) SlurmUser may not be able to
		 *   remove it, so this is not necessarily an error.
		 * No longer need slurmctld_conf lock after above join. */
		if (unlink(slurmctld_conf.slurmctld_pidfile) < 0)
			verbose("Unable to remove pidfile '%s': %m",
				slurmctld_conf.slurmctld_pidfile);

		info("BackupController terminating");
		pthread_join(slurmctld_config.thread_id_sig, NULL);
		log_fini();
		if (dump_core)
			abort();
		else
			exit(0);
	}

	lock_slurmctld(config_read_lock);
	error("ControlMachine %s not responding, "
		"BackupController %s taking over",
		slurmctld_conf.control_machine,
		slurmctld_conf.backup_controller);
	unlock_slurmctld(config_read_lock);

	backup_slurmctld_restart();
	trigger_primary_ctld_fail();
	trigger_backup_ctld_as_ctrl();

	pthread_kill(slurmctld_config.thread_id_sig, SIGTERM);
	pthread_join(slurmctld_config.thread_id_sig, NULL);
	pthread_join(slurmctld_config.thread_id_rpc, NULL);

	/* The job list needs to be freed before we run
	 * ctld_assoc_mgr_init, it should be empty here in the first place.
	 */
	lock_slurmctld(config_write_lock);
	job_fini();
	init_job_conf();
	unlock_slurmctld(config_write_lock);

	ctld_assoc_mgr_init(callbacks);

	/* clear old state and read new state */
	lock_slurmctld(config_write_lock);
	if (switch_g_restore(slurmctld_conf.state_save_location, true)) {
		error("failed to restore switch state");
		abort();
	}
	if (read_slurm_conf(2, false)) {	/* Recover all state */
		error("Unable to recover slurm state");
		abort();
	}
	slurmctld_config.shutdown_time = (time_t) 0;
	unlock_slurmctld(config_write_lock);
	select_g_select_nodeinfo_set_all();

	return;
}
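run_backup() above pings the primary at most every SlurmctldTimeout/3 seconds and takes over once the primary has been silent for the full timeout. The following is a small, self-contained sketch of that watchdog cadence, assuming a hypothetical ping_primary() stub in place of Slurm's _ping_controller().

#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* hypothetical stand-in for _ping_controller(): returns 0 on success */
static int ping_primary(void)
{
	static int calls;
	return (++calls < 3) ? 0 : -1;	/* pretend the primary dies after a while */
}

int main(void)
{
	const double timeout = 9.0;	/* stands in for slurmctld_timeout */
	time_t last_ping = 0;
	time_t last_response = time(NULL);

	while (1) {
		sleep(1);

		/* don't ping more often than timeout / 3 */
		if (difftime(time(NULL), last_ping) < timeout / 3)
			continue;

		last_ping = time(NULL);
		if (ping_primary() == 0) {
			last_response = time(NULL);
		} else if (difftime(time(NULL), last_response) > timeout) {
			printf("primary silent for %.0f s, taking over\n",
			       timeout);
			break;
		}
	}
	return 0;
}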
Example #26
0
File: agent.c Project: alepharchives/slurm
/*
 * agent - party responsible for transmitting a common RPC in parallel
 *	across a set of nodes. Use agent_queue_request() if immediate
 *	execution is not essential.
 * IN pointer to agent_arg_t, which is xfree'd (including hostlist
 *	and msg_args) upon completion
 * RET always NULL (function format just for use as pthread)
 */
void *agent(void *args)
{
	int i, delay, rc, retries = 0;
	pthread_attr_t attr_wdog;
	pthread_t thread_wdog;
	agent_arg_t *agent_arg_ptr = args;
	agent_info_t *agent_info_ptr = NULL;
	thd_t *thread_ptr;
	task_info_t *task_specific_ptr;
	time_t begin_time;

#if 0
	info("Agent_cnt is %d of %d with msg_type %d",
	     agent_cnt, MAX_AGENT_CNT, agent_arg_ptr->msg_type);
#endif
	slurm_mutex_lock(&agent_cnt_mutex);
	if (!wiki2_sched_test) {
		char *sched_type = slurm_get_sched_type();
		if (strcmp(sched_type, "sched/wiki2") == 0)
			wiki2_sched = true;
		xfree(sched_type);
		wiki2_sched_test = true;
	}

	while (1) {
		if (slurmctld_config.shutdown_time ||
		    (agent_cnt < MAX_AGENT_CNT)) {
			agent_cnt++;
			break;
		} else {	/* wait for state change and retry */
			pthread_cond_wait(&agent_cnt_cond, &agent_cnt_mutex);
		}
	}
	slurm_mutex_unlock(&agent_cnt_mutex);
	if (slurmctld_config.shutdown_time)
		goto cleanup;

	/* basic argument value tests */
	begin_time = time(NULL);
	if (_valid_agent_arg(agent_arg_ptr))
		goto cleanup;

	/* initialize the agent data structures */
	agent_info_ptr = _make_agent_info(agent_arg_ptr);
	thread_ptr = agent_info_ptr->thread_struct;

	/* start the watchdog thread */
	slurm_attr_init(&attr_wdog);
	if (pthread_attr_setdetachstate
	    (&attr_wdog, PTHREAD_CREATE_JOINABLE))
		error("pthread_attr_setdetachstate error %m");
	while (pthread_create(&thread_wdog, &attr_wdog, _wdog,
				(void *) agent_info_ptr)) {
		error("pthread_create error %m");
		if (++retries > MAX_RETRIES)
			fatal("Can't create pthread");
		usleep(10000);	/* sleep and retry */
	}
	slurm_attr_destroy(&attr_wdog);
#if 	AGENT_THREAD_COUNT < 1
	fatal("AGENT_THREAD_COUNT value is invalid");
#endif
	debug2("got %d threads to send out",agent_info_ptr->thread_count);
	/* start all the other threads (up to AGENT_THREAD_COUNT active) */
	for (i = 0; i < agent_info_ptr->thread_count; i++) {

		/* wait until "room" for another thread */
		slurm_mutex_lock(&agent_info_ptr->thread_mutex);
		while (agent_info_ptr->threads_active >=
		       AGENT_THREAD_COUNT) {
			pthread_cond_wait(&agent_info_ptr->thread_cond,
					  &agent_info_ptr->thread_mutex);
		}

		/* create thread specific data, NOTE: freed from
		 *      _thread_per_group_rpc() */
		task_specific_ptr = _make_task_data(agent_info_ptr, i);

		slurm_attr_init(&thread_ptr[i].attr);
		if (pthread_attr_setdetachstate(&thread_ptr[i].attr,
						PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");
		while ((rc = pthread_create(&thread_ptr[i].thread,
					    &thread_ptr[i].attr,
					    _thread_per_group_rpc,
					    (void *) task_specific_ptr))) {
			error("pthread_create error %m");
			if (agent_info_ptr->threads_active)
				pthread_cond_wait(&agent_info_ptr->
						  thread_cond,
						  &agent_info_ptr->
						  thread_mutex);
			else {
				slurm_mutex_unlock(&agent_info_ptr->
						     thread_mutex);
				usleep(10000);	/* sleep and retry */
				slurm_mutex_lock(&agent_info_ptr->
						   thread_mutex);
			}
		}
		slurm_attr_destroy(&thread_ptr[i].attr);
		agent_info_ptr->threads_active++;
		slurm_mutex_unlock(&agent_info_ptr->thread_mutex);
	}

	/* wait for termination of remaining threads */
	pthread_join(thread_wdog, NULL);
	delay = (int) difftime(time(NULL), begin_time);
	if (delay > (slurm_get_msg_timeout() * 2)) {
		info("agent msg_type=%u ran for %d seconds",
			agent_arg_ptr->msg_type,  delay);
	}
	slurm_mutex_lock(&agent_info_ptr->thread_mutex);
	while (agent_info_ptr->threads_active != 0) {
		pthread_cond_wait(&agent_info_ptr->thread_cond,
				&agent_info_ptr->thread_mutex);
	}
	slurm_mutex_unlock(&agent_info_ptr->thread_mutex);

      cleanup:
	_purge_agent_args(agent_arg_ptr);

	if (agent_info_ptr) {
		xfree(agent_info_ptr->thread_struct);
		xfree(agent_info_ptr);
	}
	slurm_mutex_lock(&agent_cnt_mutex);

	if (agent_cnt > 0)
		agent_cnt--;
	else {
		error("agent_cnt underflow");
		agent_cnt = 0;
	}

	if (agent_cnt && agent_cnt < MAX_AGENT_CNT)
		agent_retry(RPC_RETRY_INTERVAL, true);

	pthread_cond_broadcast(&agent_cnt_cond);
	slurm_mutex_unlock(&agent_cnt_mutex);

	return NULL;
}
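agent() above throttles its fan-out: it keeps at most AGENT_THREAD_COUNT detached RPC threads active and sleeps on a condition variable until one of them finishes. Below is a minimal POSIX-only sketch of that bounded-concurrency launcher; MAX_ACTIVE, TOTAL_TASKS and per_task_rpc() are hypothetical stand-ins for the Slurm structures.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define MAX_ACTIVE   2		/* stands in for AGENT_THREAD_COUNT */
#define TOTAL_TASKS  6

static pthread_mutex_t thread_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  thread_cond  = PTHREAD_COND_INITIALIZER;
static int threads_active;

static void *per_task_rpc(void *arg)
{
	long task = (long) arg;

	sleep(1);			/* pretend to send an RPC */
	printf("task %ld done\n", task);

	pthread_mutex_lock(&thread_mutex);
	threads_active--;
	pthread_cond_signal(&thread_cond);	/* wake the launcher or the final wait */
	pthread_mutex_unlock(&thread_mutex);
	return NULL;
}

int main(void)
{
	pthread_attr_t attr;
	pthread_t tid;
	long i;

	pthread_attr_init(&attr);
	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);

	for (i = 0; i < TOTAL_TASKS; i++) {
		/* wait until there is "room" for another worker */
		pthread_mutex_lock(&thread_mutex);
		while (threads_active >= MAX_ACTIVE)
			pthread_cond_wait(&thread_cond, &thread_mutex);
		threads_active++;
		pthread_mutex_unlock(&thread_mutex);

		pthread_create(&tid, &attr, per_task_rpc, (void *) i);
	}

	/* wait for the stragglers, as agent() does after the launch loop */
	pthread_mutex_lock(&thread_mutex);
	while (threads_active > 0)
		pthread_cond_wait(&thread_cond, &thread_mutex);
	pthread_mutex_unlock(&thread_mutex);

	pthread_attr_destroy(&attr);
	return 0;
}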
Example #27
0
File: forward.c Project: artpol84/slurm
static void _start_msg_tree_internal(hostlist_t hl, hostlist_t* sp_hl,
				     fwd_tree_t *fwd_tree_in,
				     int hl_count)
{
	int j;
	fwd_tree_t *fwd_tree;

	xassert((hl || sp_hl) && !(hl && sp_hl));
	xassert(fwd_tree_in);
	xassert(fwd_tree_in->p_thr_count);
	xassert(fwd_tree_in->tree_mutex);
	xassert(fwd_tree_in->notify);
	xassert(fwd_tree_in->ret_list);

	if (hl)
		xassert(hl_count == hostlist_count(hl));

	if (fwd_tree_in->timeout <= 0)
		/* convert secs to msec */
		fwd_tree_in->timeout  = slurm_get_msg_timeout() * 1000;

	for (j = 0; j < hl_count; j++) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		memcpy(fwd_tree, fwd_tree_in, sizeof(fwd_tree_t));

		if (sp_hl) {
			fwd_tree->tree_hl = sp_hl[j];
			sp_hl[j] = NULL;
		} else if (hl) {
			char *name = hostlist_shift(hl);
			fwd_tree->tree_hl = hostlist_create(name);
			free(name);
		}

		/*
		 * Lock and increase the thread counter; we need this to
		 * protect the start_msg_tree waiting loop, which was
		 * originally designed around a "while (count < host_count)"
		 * loop. In cases where a fwd thread could not get all the
		 * return codes from its children, that waiting loop would
		 * deadlock.
		 */
		slurm_mutex_lock(fwd_tree->tree_mutex);
		(*fwd_tree->p_thr_count)++;
		slurm_mutex_unlock(fwd_tree->tree_mutex);

		while (pthread_create(&thread_agent, &attr_agent,
				      _fwd_tree_thread, (void *)fwd_tree)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			usleep(100000);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);

	}
}
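Both tree-forwarding variants hand each spawned thread a sub-hostlist whose width comes from set_span(); the span computation itself is not part of these excerpts. The sketch below shows one simple, illustrative way to split a host array into fixed-width groups for a forwarding tree; it is not the actual set_span() algorithm.

#include <stdio.h>

/*
 * Split "count" hosts into groups of width "fanout"; each group leader
 * would receive the message plus the list of hosts it forwards to.
 * Illustrative only: Slurm's set_span() computes its widths differently.
 */
static void plan_tree(const char **hosts, int count, int fanout)
{
	int i, j;

	for (i = 0; i < count; i += fanout) {
		printf("leader %s forwards to:", hosts[i]);
		for (j = i + 1; j < i + fanout && j < count; j++)
			printf(" %s", hosts[j]);
		printf("\n");
	}
}

int main(void)
{
	const char *hosts[] = { "n1", "n2", "n3", "n4", "n5", "n6", "n7" };

	plan_tree(hosts, 7, 3);
	return 0;
}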
Example #28
0
extern void acct_gather_energy_p_conf_set(s_p_hashtbl_t *tbl)
{
	char *tmp_char;

	/* Set initial values */
	reset_slurm_ipmi_conf(&slurm_ipmi_conf);

	if (tbl) {
		/* ipmi initialisation parameters */
		s_p_get_uint32(&slurm_ipmi_conf.driver_type,
			       "EnergyIPMIDriverType", tbl);
		s_p_get_uint32(&slurm_ipmi_conf.disable_auto_probe,
			       "EnergyIPMIDisableAutoProbe", tbl);
		s_p_get_uint32(&slurm_ipmi_conf.driver_address,
			       "EnergyIPMIDriverAddress", tbl);
		s_p_get_uint32(&slurm_ipmi_conf.register_spacing,
			       "EnergyIPMIRegisterSpacing", tbl);

		s_p_get_string(&slurm_ipmi_conf.driver_device,
			       "EnergyIPMIDriverDevice", tbl);

		s_p_get_uint32(&slurm_ipmi_conf.protocol_version,
			       "EnergyIPMIProtocolVersion", tbl);

		if (!s_p_get_string(&slurm_ipmi_conf.username,
				    "EnergyIPMIUsername", tbl))
			slurm_ipmi_conf.username = xstrdup(DEFAULT_IPMI_USER);

		s_p_get_string(&slurm_ipmi_conf.password,
			       "EnergyIPMIPassword", tbl);
		if (!slurm_ipmi_conf.password)
			slurm_ipmi_conf.password = xstrdup("foopassword");

		s_p_get_uint32(&slurm_ipmi_conf.privilege_level,
			       "EnergyIPMIPrivilegeLevel", tbl);
		s_p_get_uint32(&slurm_ipmi_conf.authentication_type,
			       "EnergyIPMIAuthenticationType", tbl);
		s_p_get_uint32(&slurm_ipmi_conf.cipher_suite_id,
			       "EnergyIPMICipherSuiteId", tbl);
		s_p_get_uint32(&slurm_ipmi_conf.session_timeout,
			       "EnergyIPMISessionTimeout", tbl);
		s_p_get_uint32(&slurm_ipmi_conf.retransmission_timeout,
			       "EnergyIPMIRetransmissionTimeout", tbl);
		s_p_get_uint32(&slurm_ipmi_conf.workaround_flags,
			       "EnergyIPMIWorkaroundFlags", tbl);

		if (!s_p_get_boolean(&slurm_ipmi_conf.reread_sdr_cache,
				     "EnergyIPMIRereadSdrCache", tbl))
			slurm_ipmi_conf.reread_sdr_cache = false;
		if (!s_p_get_boolean(&slurm_ipmi_conf.
				     ignore_non_interpretable_sensors,
				     "EnergyIPMIIgnoreNonInterpretableSensors",
				     tbl))
			slurm_ipmi_conf.ignore_non_interpretable_sensors =
				false;
		if (!s_p_get_boolean(&slurm_ipmi_conf.bridge_sensors,
				     "EnergyIPMIBridgeSensors", tbl))
			slurm_ipmi_conf.bridge_sensors = false;
		if (!s_p_get_boolean(&slurm_ipmi_conf.interpret_oem_data,
				     "EnergyIPMIInterpretOemData", tbl))
			slurm_ipmi_conf.interpret_oem_data = false;
		if (!s_p_get_boolean(&slurm_ipmi_conf.shared_sensors,
				     "EnergyIPMISharedSensors", tbl))
			slurm_ipmi_conf.shared_sensors = false;
		if (!s_p_get_boolean(&slurm_ipmi_conf.discrete_reading,
				     "EnergyIPMIDiscreteReading", tbl))
			slurm_ipmi_conf.discrete_reading = false;
		if (!s_p_get_boolean(&slurm_ipmi_conf.ignore_scanning_disabled,
				     "EnergyIPMIIgnoreScanningDisabled", tbl))
			slurm_ipmi_conf.ignore_scanning_disabled = false;
		if (!s_p_get_boolean(&slurm_ipmi_conf.assume_bmc_owner,
				     "EnergyIPMIAssumeBmcOwner", tbl))
			slurm_ipmi_conf.assume_bmc_owner = false;
		if (!s_p_get_boolean(&slurm_ipmi_conf.entity_sensor_names,
				     "EnergyIPMIEntitySensorNames", tbl))
			slurm_ipmi_conf.entity_sensor_names = false;

		s_p_get_uint32(&slurm_ipmi_conf.freq,
			       "EnergyIPMIFrequency", tbl);

		if ((int)slurm_ipmi_conf.freq <= 0)
			fatal("EnergyIPMIFrequency must be a positive integer "
			      "in acct_gather.conf.");

		if (!s_p_get_boolean(&(slurm_ipmi_conf.adjustment),
				     "EnergyIPMICalcAdjustment", tbl))
			slurm_ipmi_conf.adjustment = false;

		s_p_get_uint32(&slurm_ipmi_conf.power_sensor_num,
			       "EnergyIPMIPowerSensor", tbl);

		s_p_get_uint32(&slurm_ipmi_conf.timeout,
			       "EnergyIPMITimeout", tbl);

		if (s_p_get_string(&tmp_char, "EnergyIPMIVariable", tbl)) {
			if (!strcmp(tmp_char, "Temp"))
				slurm_ipmi_conf.variable =
					IPMI_MONITORING_SENSOR_TYPE_TEMPERATURE;
			xfree(tmp_char);
		}
	}

	if (!_run_in_daemon())
		return;

	if (!flag_init) {
		local_energy = acct_gather_energy_alloc();
		local_energy->consumed_energy = 0;
		local_energy->base_consumed_energy = 0;
		local_energy->base_watts = 0;
		flag_init = true;
		if (_is_thread_launcher()) {
			pthread_attr_t attr;
			slurm_attr_init(&attr);
			if (pthread_create(&thread_ipmi_id_launcher, &attr,
					   &_thread_launcher, NULL)) {
				//if (pthread_create(... (void *)arg)) {
				debug("energy accounting failed to create "
				      "_thread_launcher thread: %m");
			}
			slurm_attr_destroy(&attr);
			if (debug_flags & DEBUG_FLAG_ENERGY)
				info("%s thread launched", plugin_name);
		} else
			_get_joules_task(0);
	}

	verbose("%s loaded", plugin_name);
}
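acct_gather_energy_p_conf_set() above pulls its settings from the EnergyIPMI* keys of acct_gather.conf. A hedged example of such a file is shown below; every key name appears in the parsing code above, but the values are purely illustrative and not recommendations.

# acct_gather.conf (illustrative values only)
EnergyIPMIFrequency=30
EnergyIPMIUsername=monitor
EnergyIPMIPassword=changeme
EnergyIPMICalcAdjustment=yes
EnergyIPMIPowerSensor=1280
EnergyIPMITimeout=10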
Example #29
0
File: bg_core.c Project: fafik23/slurm
/* block_state_mutex should be unlocked before calling this */
extern void free_block_list(uint32_t job_id, List track_list,
			    bool destroy, bool wait)
{
	bg_record_t *bg_record = NULL;
	int retries;
	ListIterator itr = NULL;
	bg_free_block_list_t *bg_free_list;
	pthread_attr_t attr_agent;
	pthread_t thread_agent;
	List kill_job_list = NULL;
	kill_job_struct_t *freeit;

	if (!track_list || !list_count(track_list))
		return;

	bg_free_list = xmalloc(sizeof(bg_free_block_list_t));
	bg_free_list->track_list = list_create(NULL);
	bg_free_list->destroy = destroy;
	bg_free_list->job_id = job_id;

	slurm_mutex_lock(&block_state_mutex);
	list_transfer(bg_free_list->track_list, track_list);
	itr = list_iterator_create(bg_free_list->track_list);
	while ((bg_record = list_next(itr))) {
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was already destroyed %p", bg_record);
			continue;
		}
		bg_record->free_cnt++;

		/* just so we don't overwrite a different thread that
		   wants this block destroyed */
		if (destroy && !bg_record->destroy)
			bg_record->destroy = destroy;

		if (destroy && (bg_record->state & BG_BLOCK_ERROR_FLAG))
			resume_block(bg_record);

		/* This means we are wanting this block free so we can
		   run this job on it, so it is ok to have the job
		   remain here.  Only checking for jobs should go
		   below this.
		*/
		if (bg_record->modifying) {
			debug("free_block_list: Just FYI, we are "
			      "freeing a block (%s) that "
			      "has at least one pending job.",
			      bg_record->bg_block_id);
			continue;
		}

		if (bg_record->job_ptr
		    && !IS_JOB_FINISHED(bg_record->job_ptr)) {
			info("We are freeing a block (%s) that "
			     "has job %u(%u).",
			     bg_record->bg_block_id,
			     bg_record->job_ptr->job_id,
			     bg_record->job_running);
			if (!kill_job_list)
				kill_job_list =
					bg_status_create_kill_job_list();
			freeit = xmalloc(sizeof(kill_job_struct_t));
			freeit->jobid = bg_record->job_ptr->job_id;
			list_push(kill_job_list, freeit);
		} else if (bg_record->job_list
			   && list_count(bg_record->job_list)) {
			struct job_record *job_ptr;
			ListIterator itr;

			if (!kill_job_list)
				kill_job_list =
					bg_status_create_kill_job_list();
			info("We are freeing a block (%s) that has at "
			     "least 1 job.",
			     bg_record->bg_block_id);
			itr = list_iterator_create(bg_record->job_list);
			while ((job_ptr = list_next(itr))) {
				if ((job_ptr->magic != JOB_MAGIC)
				    || IS_JOB_FINISHED(job_ptr))
					continue;
				freeit = xmalloc(sizeof(kill_job_struct_t));
				freeit->jobid = job_ptr->job_id;
				list_push(kill_job_list, freeit);
			}
			list_iterator_destroy(itr);
		}
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (kill_job_list) {
		bg_status_process_kill_job_list(kill_job_list, JOB_FAILED, 0);
		FREE_NULL_LIST(kill_job_list);
	}

	if (wait) {
		/* Track_freeing_blocks waits until the list is done
		   and frees the memory of bg_free_list.
		*/
		_track_freeing_blocks(bg_free_list);
		return;
	}

	/* _track_freeing_blocks handles cleanup */
	slurm_attr_init(&attr_agent);
	if (pthread_attr_setdetachstate(&attr_agent, PTHREAD_CREATE_DETACHED))
		error("pthread_attr_setdetachstate error %m");
	retries = 0;
	while (pthread_create(&thread_agent, &attr_agent,
			      _track_freeing_blocks,
			      bg_free_list)) {
		error("pthread_create error %m");
		if (++retries > MAX_PTHREAD_RETRIES)
			fatal("Can't create pthread");
		/* sleep and retry */
		usleep(1000);
	}
	slurm_attr_destroy(&attr_agent);
	return;
}
Example #30
0
File: forward.c Project: Poshi/slurm
/*
 * start_msg_tree  - logic to begin the forward tree and
 *                   accumulate the return codes from processes getting
 *                   the forwarded message
 *
 * IN: hl          - hostlist_t   - list of every node to send message to
 * IN: msg         - slurm_msg_t  - message to send.
 * IN: timeout     - int          - how long to wait in milliseconds.
 * RET List 	   - List containing the responses of the children
 *		     (if any) we forwarded the message to. List
 *		     containing type (ret_data_info_t).
 */
extern List start_msg_tree(hostlist_t hl, slurm_msg_t *msg, int timeout)
{
	int *span = NULL;
	fwd_tree_t *fwd_tree = NULL;
	pthread_mutex_t tree_mutex;
	pthread_cond_t notify;
	int j = 0, count = 0;
	List ret_list = NULL;
	char *name = NULL;
	int thr_count = 0;
	int host_count = 0;

	xassert(hl);
	xassert(msg);

	hostlist_uniq(hl);
	host_count = hostlist_count(hl);

	span = set_span(host_count, 0);

	slurm_mutex_init(&tree_mutex);
	pthread_cond_init(&notify, NULL);

	ret_list = list_create(destroy_data_info);

	while ((name = hostlist_shift(hl))) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		fwd_tree->orig_msg = msg;
		fwd_tree->ret_list = ret_list;
		fwd_tree->timeout = timeout;
		fwd_tree->notify = &notify;
		fwd_tree->tree_mutex = &tree_mutex;

		if (fwd_tree->timeout <= 0) {
			/* convert secs to msec */
			fwd_tree->timeout  = slurm_get_msg_timeout() * 1000;
		}

		fwd_tree->tree_hl = hostlist_create(name);
		free(name);
		for (j = 0; j < span[thr_count]; j++) {
			name = hostlist_shift(hl);
			if (!name)
				break;
			hostlist_push(fwd_tree->tree_hl, name);
			free(name);
		}

		while (pthread_create(&thread_agent, &attr_agent,
				      _fwd_tree_thread, (void *)fwd_tree)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			sleep(1);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
		thr_count++;
	}
	xfree(span);

	slurm_mutex_lock(&tree_mutex);

	count = list_count(ret_list);
	debug2("Tree head got back %d looking for %d", count, host_count);
	while ((count < host_count)) {
		pthread_cond_wait(&notify, &tree_mutex);
		count = list_count(ret_list);
		debug2("Tree head got back %d", count);
	}
	debug2("Tree head got them all");
	slurm_mutex_unlock(&tree_mutex);

	slurm_mutex_destroy(&tree_mutex);
	pthread_cond_destroy(&notify);

	return ret_list;
}