extern int acct_gather_interconnect_startpoll(uint32_t frequency)
{
	int retval = SLURM_SUCCESS;

	if (acct_gather_interconnect_init() < 0)
		return SLURM_ERROR;

	if (!acct_shutdown) {
		error("%s: poll already started!", __func__);
		return retval;
	}

	acct_shutdown = false;

	freq = frequency;

	if (frequency == 0) {   /* don't want dynamic monitoring? */
		debug2("%s: dynamic logging disabled", __func__);
		return retval;
	}

	/* create polling thread */
	slurm_thread_create(&watch_node_thread_id, &_watch_node, NULL);

	debug3("%s: dynamic logging enabled", __func__);

	return retval;
}
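
The function above shows a recurring pattern in these examples: refuse to start a second poller, record the sampling frequency, then hand the periodic work to a background thread. A minimal standalone sketch of the same pattern with plain pthreads (the _watch_loop helper and the globals are hypothetical stand-ins, not Slurm API):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_t watch_tid;
static bool poll_running = false;	/* plays the role of !acct_shutdown */
static unsigned poll_freq = 0;

/* hypothetical poll loop standing in for _watch_node() */
static void *_watch_loop(void *arg)
{
	(void) arg;
	while (poll_running) {
		/* ... sample interconnect counters here ... */
		sleep(poll_freq);
	}
	return NULL;
}

int start_poll(unsigned frequency)
{
	if (poll_running) {
		fprintf(stderr, "poll already started!\n");
		return 0;
	}
	poll_running = true;
	poll_freq = frequency;

	if (frequency == 0)	/* dynamic monitoring disabled */
		return 0;

	return pthread_create(&watch_tid, NULL, _watch_loop, NULL);
}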
Example #2
File: msg_aggr.c  Project: supermanue/slurm
extern void msg_aggr_sender_init(char *host, uint16_t port, uint64_t window,
				 uint64_t max_msg_cnt)
{
	if (msg_collection.running || (max_msg_cnt <= 1))
		return;

	memset(&msg_collection, 0, sizeof(msg_collection_type_t));

	slurm_mutex_init(&msg_collection.aggr_mutex);
	slurm_mutex_init(&msg_collection.mutex);

	slurm_mutex_lock(&msg_collection.mutex);
	slurm_mutex_lock(&msg_collection.aggr_mutex);
	slurm_cond_init(&msg_collection.cond, NULL);
	slurm_set_addr(&msg_collection.node_addr, port, host);
	msg_collection.window = window;
	msg_collection.max_msg_cnt = max_msg_cnt;
	msg_collection.msg_aggr_list = list_create(_msg_aggr_free);
	msg_collection.msg_list = list_create(slurm_free_comp_msg_list);
	msg_collection.max_msgs = false;
	msg_collection.debug_flags = slurm_get_debug_flags();
	slurm_mutex_unlock(&msg_collection.aggr_mutex);
	slurm_mutex_unlock(&msg_collection.mutex);

	slurm_thread_create(&msg_collection.thread_id,
			    &_msg_aggregation_sender, NULL);
}
Example #3
extern void slurm_persist_conn_recv_thread_init(slurm_persist_conn_t *persist_conn,
						int thread_loc, void *arg)
{
	persist_service_conn_t *service_conn;

	if (thread_loc < 0)
		thread_loc = slurm_persist_conn_wait_for_thread_loc();
	if (thread_loc < 0)
		return;

	service_conn = xmalloc(sizeof(persist_service_conn_t));

	slurm_mutex_lock(&thread_count_lock);
	persist_service_conn[thread_loc] = service_conn;
	slurm_mutex_unlock(&thread_count_lock);

	service_conn->arg = arg;
	service_conn->conn = persist_conn;
	service_conn->thread_loc = thread_loc;

	persist_conn->timeout = 0; /* If this isn't zero we won't wait
				      forever like we want to. */

	//_service_connection(service_conn);
	slurm_thread_create(&persist_service_conn[thread_loc]->thread_id,
			    _service_connection, service_conn);
}
Example #4
/*
 * init() is called when the plugin is loaded, before any other functions
 * are called. Put global initialization here.
 */
extern int init(void)
{
	jobslist = list_create(_jobslist_del);
	slurm_thread_create(&job_handler_thread, _process_jobs, NULL);
	slurm_mutex_lock(&pend_jobs_lock);
	(void) _load_pending_jobs();
	slurm_mutex_unlock(&pend_jobs_lock);

	return SLURM_SUCCESS;
}
Example #5
extern int proctrack_p_create(stepd_step_rec_t *job)
{
	DEF_TIMERS;
	START_TIMER;

	if (!libjob_handle)
		init();

	if (!job->cont_id) {
		/* The Cray job library creates the container off the
		   process that calls job_create(), so we don't want to
		   call it from the main process: the container would
		   then include every thread the main process spawns,
		   and there is no way to safely track which pids need
		   to be removed when removing the parent.  Spawning a
		   thread makes job_create() attach the container to
		   that thread instead of the main process.  Once we
		   have added a process we can end the thread, which
		   removes its pid from the container automatically.
		   Empty containers are not valid.
		*/
		slurm_mutex_lock(&thread_mutex);
		if (threadid) {
			debug("Had a thread already 0x%08lx", threadid);
			slurm_mutex_lock(&notify_mutex);
			slurm_cond_wait(&notify, &notify_mutex);
			slurm_mutex_unlock(&notify_mutex);
			debug("Last thread done 0x%08lx", threadid);
		}
		/* We have to lock the notify_mutex here since the
		   thread could signal before we start waiting for it. */
		slurm_mutex_lock(&notify_mutex);
		slurm_thread_create(&threadid, _create_container_thread, job);
		slurm_cond_wait(&notify, &notify_mutex);
		slurm_mutex_unlock(&notify_mutex);
		slurm_mutex_unlock(&thread_mutex);
		if (job->cont_id != (jid_t)-1)
			debug("proctrack_p_create: created jid "
			      "0x%08lx thread 0x%08lx",
			      job->cont_id, threadid);
	} else
		error("proctrack_p_create: already have a cont_id");

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);

	return SLURM_SUCCESS;
}
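
The comment above explains why the container must be created from a short-lived helper thread, and the notify_mutex is taken before that thread is spawned so the helper cannot signal completion before the parent is waiting. That handshake is a general pthread pattern; a minimal sketch under those assumptions (names are hypothetical):

#include <pthread.h>

static pthread_mutex_t notify_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t notify_cond = PTHREAD_COND_INITIALIZER;
static int work_done = 0;

static void *_one_shot_worker(void *arg)
{
	/* ... work that must run in its own thread ... */
	(void) arg;
	pthread_mutex_lock(&notify_lock);
	work_done = 1;
	pthread_cond_signal(&notify_cond);
	pthread_mutex_unlock(&notify_lock);
	return NULL;
}

void run_and_wait(void *arg)
{
	pthread_t tid;

	/* Lock before creating the thread so its signal cannot fire
	 * before we are waiting on the condition. */
	pthread_mutex_lock(&notify_lock);
	work_done = 0;
	pthread_create(&tid, NULL, _one_shot_worker, arg);
	while (!work_done)	/* guards against spurious wakeups */
		pthread_cond_wait(&notify_cond, &notify_lock);
	pthread_mutex_unlock(&notify_lock);
	pthread_join(tid, NULL);
}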
Example #6
File: gang.c  Project: supermanue/slurm
static void _spawn_timeslicer_thread(void)
{
	slurm_mutex_lock( &thread_flag_mutex );
	if (thread_running) {
		error("timeslicer thread already running, not starting "
		      "another");
		slurm_mutex_unlock(&thread_flag_mutex);
		return;
	}

	slurm_thread_create(&timeslicer_thread_id, _timeslicer_thread, NULL);
	thread_running = true;
	slurm_mutex_unlock(&thread_flag_mutex);
}
Example #7
File: slurmdbd_agent.c  Project: mej/slurm
static void _create_agent(void)
{
	/* this needs to be set because the agent thread will do
	   nothing if the connection was closed and then opened again */
	slurmdbd_shutdown = 0;

	if (agent_list == NULL) {
		agent_list = list_create(slurmdbd_free_buffer);
		_load_dbd_state();
	}

	if (agent_tid == 0) {
		slurm_thread_create(&agent_tid, _agent, NULL);
	}
}
Example #8
int init( void )
{
	if (slurmctld_config.scheduling_disabled)
		return SLURM_SUCCESS;

	verbose( "sched: Backfill scheduler plugin loaded" );

	slurm_mutex_lock( &thread_flag_mutex );
	if ( backfill_thread ) {
		debug2( "Backfill thread already running, not starting "
			"another" );
		slurm_mutex_unlock( &thread_flag_mutex );
		return SLURM_ERROR;
	}

	/* since we do a join on this later we don't make it detached */
	slurm_thread_create(&backfill_thread, backfill_agent, NULL);

	slurm_mutex_unlock( &thread_flag_mutex );

	return SLURM_SUCCESS;
}
Example #9
static void *_thread_launcher(void *no_data)
{
	//what would arg contain? frequency, socket?
	struct timeval tvnow;
	struct timespec abs;

	slurm_thread_create(&thread_ipmi_id_run, _thread_ipmi_run, NULL);

	/* setup timer */
	gettimeofday(&tvnow, NULL);
	abs.tv_sec = tvnow.tv_sec + slurm_ipmi_conf.timeout;
	abs.tv_nsec = tvnow.tv_usec * 1000;

	slurm_mutex_lock(&launch_mutex);
	slurm_cond_timedwait(&launch_cond, &launch_mutex, &abs);
	slurm_mutex_unlock(&launch_mutex);

	if (!flag_thread_started) {
		error("%s threads failed to start in a timely manner",
		      plugin_name);

		flag_energy_accounting_shutdown = true;

		/*
		 * It is a known issue that IPMI calls can hang, so cancel
		 * the thread if we must.
		 */
		pthread_cancel(thread_ipmi_id_run);

		/*
		 * Unlock just to be safe, since we could have canceled the
		 * thread while it held the lock.
		 */
		slurm_mutex_unlock(&ipmi_mutex);
	}

	return NULL;
}
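
The launcher above gives the worker a bounded amount of time to report that it started: it builds an absolute deadline from gettimeofday() and waits on a condition with slurm_cond_timedwait(). A standalone sketch of the same timed-startup check using plain pthreads (the flag and function names are hypothetical):

#include <pthread.h>
#include <stdbool.h>
#include <sys/time.h>

static pthread_mutex_t launch_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t launch_cond = PTHREAD_COND_INITIALIZER;
static bool worker_started = false;

static void *_worker(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&launch_lock);
	worker_started = true;
	pthread_cond_signal(&launch_cond);
	pthread_mutex_unlock(&launch_lock);
	/* ... main worker loop ... */
	return NULL;
}

/* Returns true if the worker signalled startup within timeout_sec. */
bool launch_with_timeout(pthread_t *tid, long timeout_sec)
{
	struct timeval now;
	struct timespec deadline;

	pthread_create(tid, NULL, _worker, NULL);

	gettimeofday(&now, NULL);
	deadline.tv_sec = now.tv_sec + timeout_sec;
	deadline.tv_nsec = now.tv_usec * 1000;

	pthread_mutex_lock(&launch_lock);
	while (!worker_started &&
	       pthread_cond_timedwait(&launch_cond, &launch_lock,
				      &deadline) == 0)
		;	/* keep waiting until signalled or timed out */
	pthread_mutex_unlock(&launch_lock);

	return worker_started;
}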
Example #10
static int _fed_job_will_run(job_desc_msg_t *req,
			     will_run_response_msg_t **will_run_resp,
			     slurmdb_federation_rec_t *fed)
{
	List resp_msg_list;
	int pthread_count = 0, i;
	pthread_t *load_thread = 0;
	load_willrun_req_struct_t *load_args;
	ListIterator iter;
	will_run_response_msg_t *earliest_resp = NULL;
	load_willrun_resp_struct_t *tmp_resp;
	slurmdb_cluster_rec_t *cluster;
	List req_clusters = NULL;

	xassert(req);
	xassert(will_run_resp);

	*will_run_resp = NULL;

	/*
	 * If a subset of clusters was specified then only do a will_run to
	 * those clusters, otherwise check all clusters in the federation.
	 */
	if (req->clusters && xstrcasecmp(req->clusters, "all")) {
		req_clusters = list_create(slurm_destroy_char);
		slurm_addto_char_list(req_clusters, req->clusters);
	}

	/* Spawn one pthread per cluster to collect job information */
	resp_msg_list = list_create(NULL);
	load_thread = xmalloc(sizeof(pthread_t) *
			      list_count(fed->cluster_list));
	iter = list_iterator_create(fed->cluster_list);
	while ((cluster = (slurmdb_cluster_rec_t *)list_next(iter))) {
		if ((cluster->control_host == NULL) ||
		    (cluster->control_host[0] == '\0'))
			continue;	/* Cluster down */

		if (req_clusters &&
		    !list_find_first(req_clusters, slurm_find_char_in_list,
				     cluster->name))
			continue;

		load_args = xmalloc(sizeof(load_willrun_req_struct_t));
		load_args->cluster       = cluster;
		load_args->req           = req;
		load_args->resp_msg_list = resp_msg_list;
		slurm_thread_create(&load_thread[pthread_count],
				    _load_willrun_thread, load_args);
		pthread_count++;
	}
	list_iterator_destroy(iter);
	FREE_NULL_LIST(req_clusters);

	/* Wait for all pthreads to complete */
	for (i = 0; i < pthread_count; i++)
		pthread_join(load_thread[i], NULL);
	xfree(load_thread);

	iter = list_iterator_create(resp_msg_list);
	while ((tmp_resp = (load_willrun_resp_struct_t *)list_next(iter))) {
		if (!tmp_resp->willrun_resp_msg)
			slurm_seterrno(tmp_resp->rc);
		else if ((!earliest_resp) ||
			 (tmp_resp->willrun_resp_msg->start_time <
			  earliest_resp->start_time)) {
			slurm_free_will_run_response_msg(earliest_resp);
			earliest_resp = tmp_resp->willrun_resp_msg;
			tmp_resp->willrun_resp_msg = NULL;
		}

		slurm_free_will_run_response_msg(tmp_resp->willrun_resp_msg);
		xfree(tmp_resp);
	}
	list_iterator_destroy(iter);
	FREE_NULL_LIST(resp_msg_list);

	*will_run_resp = earliest_resp;

	if (!earliest_resp)
		return SLURM_ERROR;

	return SLURM_SUCCESS;
}
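
Both this function and _load_fed_nodes() in the next example use the same fan-out pattern: allocate one pthread_t per cluster, spawn a worker for each reachable cluster, then join them all before merging the collected responses. A minimal sketch of that pattern with plain pthreads and a hypothetical per-worker argument type (error handling trimmed for brevity):

#include <pthread.h>
#include <stdlib.h>

typedef struct {
	int index;		/* which cluster this worker queries */
	/* ... request / response fields ... */
} worker_arg_t;

static void *_query_worker(void *x)
{
	worker_arg_t *arg = x;
	/* ... contact cluster arg->index and record its response ... */
	free(arg);
	return NULL;
}

void fan_out_and_join(int ncluster)
{
	pthread_t *tids = calloc(ncluster, sizeof(pthread_t));
	int spawned = 0, i;

	for (i = 0; i < ncluster; i++) {
		worker_arg_t *arg = malloc(sizeof(*arg));
		arg->index = i;
		if (pthread_create(&tids[spawned], NULL,
				   _query_worker, arg) == 0)
			spawned++;
		else
			free(arg);
	}

	/* Wait for every worker before merging the responses. */
	for (i = 0; i < spawned; i++)
		pthread_join(tids[i], NULL);
	free(tids);
}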
Example #11
File: node_info.c  Project: SchedMD/slurm
static int _load_fed_nodes(slurm_msg_t *req_msg,
			   node_info_msg_t **node_info_msg_pptr,
			   uint16_t show_flags, char *cluster_name,
			   slurmdb_federation_rec_t *fed)
{
	int cluster_inx = 0, i;
	load_node_resp_struct_t *node_resp;
	node_info_msg_t *orig_msg = NULL, *new_msg = NULL;
	uint32_t new_rec_cnt;
	slurmdb_cluster_rec_t *cluster;
	ListIterator iter;
	int pthread_count = 0;
	pthread_t *load_thread = 0;
	load_node_req_struct_t *load_args;
	List resp_msg_list;

	*node_info_msg_pptr = NULL;

	/* Spawn one pthread per cluster to collect node information */
	resp_msg_list = list_create(NULL);
	load_thread = xmalloc(sizeof(pthread_t) *
			      list_count(fed->cluster_list));
	iter = list_iterator_create(fed->cluster_list);
	while ((cluster = (slurmdb_cluster_rec_t *) list_next(iter))) {
		if ((cluster->control_host == NULL) ||
		    (cluster->control_host[0] == '\0'))
			continue;	/* Cluster down */

		load_args = xmalloc(sizeof(load_node_req_struct_t));
		load_args->cluster = cluster;
		load_args->cluster_inx = cluster_inx++;
		load_args->req_msg = req_msg;
		load_args->resp_msg_list = resp_msg_list;
		load_args->show_flags = show_flags;
		slurm_thread_create(&load_thread[pthread_count],
				    _load_node_thread, load_args);
		pthread_count++;
	}
	list_iterator_destroy(iter);

	/* Wait for all pthreads to complete */
	for (i = 0; i < pthread_count; i++)
		pthread_join(load_thread[i], NULL);
	xfree(load_thread);

	/* Maintain a consistent cluster/node ordering */
	list_sort(resp_msg_list, _sort_by_cluster_inx);

	/* Merge the responses into a single response message */
	iter = list_iterator_create(resp_msg_list);
	while ((node_resp = (load_node_resp_struct_t *) list_next(iter))) {
		new_msg = node_resp->new_msg;
		if (!orig_msg) {
			orig_msg = new_msg;
			*node_info_msg_pptr = orig_msg;
		} else {
			/* Merge the node records */
			orig_msg->last_update = MIN(orig_msg->last_update,
						    new_msg->last_update);
			new_rec_cnt = orig_msg->record_count +
				      new_msg->record_count;
			if (new_msg->record_count) {
				orig_msg->node_array =
					xrealloc(orig_msg->node_array,
						 sizeof(node_info_t) *
						 new_rec_cnt);
				(void) memcpy(orig_msg->node_array +
					      orig_msg->record_count,
					      new_msg->node_array,
					      sizeof(node_info_t) *
					      new_msg->record_count);
				orig_msg->record_count = new_rec_cnt;
			}
			xfree(new_msg->node_array);
			xfree(new_msg);
		}
		xfree(node_resp);
	}
	list_iterator_destroy(iter);
	FREE_NULL_LIST(resp_msg_list);

	if (!orig_msg)
		slurm_seterrno_ret(SLURM_ERROR);

	return SLURM_SUCCESS;
}
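
The merge step above grows the accumulated node array with xrealloc() and appends the new records with memcpy(). The same append can be written against the standard allocator; a minimal sketch (this helper is hypothetical, not part of Slurm):

#include <stdlib.h>
#include <string.h>

/* Append src[0..src_cnt) to *dst, which currently holds dst_cnt
 * records of rec_size bytes.  Returns the new record count, or
 * dst_cnt unchanged if the reallocation fails. */
size_t append_records(void **dst, size_t dst_cnt,
		      const void *src, size_t src_cnt, size_t rec_size)
{
	void *grown;

	if (!src_cnt)
		return dst_cnt;

	grown = realloc(*dst, (dst_cnt + src_cnt) * rec_size);
	if (!grown)
		return dst_cnt;

	memcpy((char *) grown + dst_cnt * rec_size, src,
	       src_cnt * rec_size);
	*dst = grown;
	return dst_cnt + src_cnt;
}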