Exemple #1
0
extern int proctrack_p_create(stepd_step_rec_t *job)
{
	pthread_attr_t attr;
	DEF_TIMERS;
	START_TIMER;

	if (!libjob_handle)
		init();

	if (!job->cont_id) {
		/* Since the cray job lib will create the container
		   off the process calling job_create we don't want to
		   call it from the main process since it will include
		   all the threads the main process spawns and there
		   is no way to safely track which pids need to be
		   removed when removing the parent.  It turns out
		   spawning a thread will make the job_create create
		   the container off that process instead of the main
		   process.  Once we have added a process we can end
		   the thread which will remove the pid from the
		   container automatically.  Empty containers are not
		   valid.
		*/
		slurm_mutex_lock(&thread_mutex);
		if (threadid) {
			debug("Had a thread already 0x%08lx", threadid);
			slurm_mutex_lock(&notify_mutex);
			slurm_cond_wait(&notify, &notify_mutex);
			slurm_mutex_unlock(&notify_mutex);
			debug("Last thread done 0x%08lx", threadid);
		}
		/* We have to lock the notify_mutex here since the
		   thread could possibly signal things before we
		   started waiting for it.

		*/
		slurm_mutex_lock(&notify_mutex);
		pthread_attr_init(&attr);
		pthread_create(&threadid, &attr, _create_container_thread, job);
		slurm_cond_wait(&notify, &notify_mutex);
		slurm_mutex_unlock(&notify_mutex);
		slurm_mutex_unlock(&thread_mutex);
		if (job->cont_id != (jid_t)-1)
			debug("proctrack_p_create: created jid "
			      "0x%08lx thread 0x%08lx",
			      job->cont_id, threadid);
	} else
		error("proctrack_p_create: already have a cont_id");

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);

	return SLURM_SUCCESS;
}
Exemple #2
0
static int _signal_job_by_str(void)
{
	job_cancel_info_t *cancel_info;
	int err, i, rc = 0;
	pthread_t dummy;

	slurm_attr_init(&attr);
	if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED))
		error("pthread_attr_setdetachstate error %m");
	slurm_mutex_init(&num_active_threads_lock);
	slurm_cond_init(&num_active_threads_cond, NULL);

	for (i = 0; opt.job_list[i]; i++) {
		cancel_info = (job_cancel_info_t *)
			xmalloc(sizeof(job_cancel_info_t));
		cancel_info->job_id_str = xstrdup(opt.job_list[i]);
		cancel_info->rc      = &rc;
		cancel_info->sig     = opt.signal;
		cancel_info->num_active_threads = &num_active_threads;
		cancel_info->num_active_threads_lock =
			&num_active_threads_lock;
		cancel_info->num_active_threads_cond =
			&num_active_threads_cond;

		slurm_mutex_lock(&num_active_threads_lock);
		num_active_threads++;
		while (num_active_threads > MAX_THREADS) {
			slurm_cond_wait(&num_active_threads_cond,
					&num_active_threads_lock);
		}
		slurm_mutex_unlock(&num_active_threads_lock);

		err = pthread_create(&dummy, &attr, _cancel_job_id,cancel_info);
		if (err)	/* Run in-line if thread create fails */
			_cancel_job_id(cancel_info);
	}

	/* Wait all spawned threads to finish */
	slurm_mutex_lock( &num_active_threads_lock );
	while (num_active_threads > 0) {
		slurm_cond_wait(&num_active_threads_cond,
				&num_active_threads_lock);
	}
	slurm_mutex_unlock(&num_active_threads_lock);

	slurm_attr_destroy(&attr);

	return rc;
}
static void *_watch_node(void *arg)
{
	int type = PROFILE_NETWORK;

#if HAVE_SYS_PRCTL_H
	if (prctl(PR_SET_NAME, "acctg_ib", NULL, NULL, NULL) < 0) {
		error("%s: cannot set my name to %s %m", __func__, "acctg_ib");
	}
#endif

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	while (init_run && acct_gather_profile_test()) {
		/* Do this until shutdown is requested */
		slurm_mutex_lock(&g_context_lock);
		(*(ops.node_update))();
		slurm_mutex_unlock(&g_context_lock);

		slurm_mutex_lock(&acct_gather_profile_timer[type].notify_mutex);
		slurm_cond_wait(
			&acct_gather_profile_timer[type].notify,
			&acct_gather_profile_timer[type].notify_mutex);
		slurm_mutex_unlock(&acct_gather_profile_timer[type].
				   notify_mutex);
	}

	return NULL;
}
Exemple #4
0
static void *_create_container_thread(void *args)
{
	stepd_step_rec_t *job = (stepd_step_rec_t *)args;

	job->cont_id = (uint64_t)job_create(0, job->uid, 0);

	/* Signal the container_create we are done */
	slurm_mutex_lock(&notify_mutex);

	/* We need to signal failure or not */
	slurm_cond_signal(&notify);

	/* Don't unlock the notify_mutex here, wait, it is not needed
	 * and can cause deadlock if done. */

	if (job->cont_id == (jid_t)-1)
		error("Failed to create job container: %m");
	else
		/* Wait around for something else to be added and then exit
		   when that takes place.
		*/
		slurm_cond_wait(&notify, &notify_mutex);

	slurm_mutex_unlock(&notify_mutex);

	return NULL;
}
Exemple #5
0
/*
 * Thread function that executes a script
 */
static void * _script_agent (void *args)
{
	while (1) {
		struct jobcomp_info *job;

		slurm_mutex_lock(&comp_list_mutex);

		if (list_is_empty(comp_list) && !agent_exit)
			slurm_cond_wait(&comp_list_cond, &comp_list_mutex);

		/*
		 * It is safe to unlock list mutex here. List has its
		 *  own internal mutex that protects the comp_list itself
		 */
		slurm_mutex_unlock(&comp_list_mutex);

		if ((job = list_pop(comp_list))) {
			_jobcomp_exec_child (script, job);
			_jobcomp_info_destroy (job);
		}

		/*
		 *  Exit if flag is set and we have no more entries to log
		 */
		if (agent_exit && list_is_empty (comp_list))
			break;
	}

	return NULL;
}
static void *_watch_node(void *arg)
{
	int i;

#if HAVE_SYS_PRCTL_H
	if (prctl(PR_SET_NAME, "acctg_intrcnt", NULL, NULL, NULL) < 0) {
		error("%s: cannot set my name to %s %m", __func__, "acctg_ib");
	}
#endif

	while (init_run && acct_gather_profile_test()) {
		/* Do this until shutdown is requested */
		slurm_mutex_lock(&g_context_lock);
		for (i = 0; i < g_context_num; i++) {
			if (!g_context[i])
				continue;
			(*(ops[i].node_update))();
		}
		slurm_mutex_unlock(&g_context_lock);

		slurm_mutex_lock(&profile_timer->notify_mutex);
		slurm_cond_wait(&profile_timer->notify,
				&profile_timer->notify_mutex);
		slurm_mutex_unlock(&profile_timer->notify_mutex);
	}

	return NULL;
}
/* _wr_wrlock - Issue a write lock on the specified data type */
static bool _wr_wrlock(lock_datatype_t datatype, bool wait_lock)
{
	bool success = true;

	slurm_mutex_lock(&locks_mutex);
	slurmctld_locks.entity[write_wait_lock(datatype)]++;

	while (1) {
		if ((slurmctld_locks.entity[read_lock(datatype)] == 0) &&
		    (slurmctld_locks.entity[write_lock(datatype)] == 0)) {
			slurmctld_locks.entity[write_lock(datatype)]++;
			slurmctld_locks.entity[write_wait_lock(datatype)]--;
			slurmctld_locks.entity[write_cnt_lock(datatype)]++;
			break;
		} else if (!wait_lock) {
			slurmctld_locks.entity[write_wait_lock(datatype)]--;
			success = false;
			break;
		} else {	/* wait for state change and retry */
			slurm_cond_wait(&locks_cond, &locks_mutex);
			if (kill_thread)
				pthread_exit(NULL);
		}
	}
	slurm_mutex_unlock(&locks_mutex);
	return success;
}
Exemple #8
0
extern void forward_wait(slurm_msg_t * msg)
{
	int count = 0;

	/* wait for all the other messages on the tree under us */
	if (msg->forward_struct) {
		debug2("looking for %d", msg->forward_struct->fwd_cnt);
		slurm_mutex_lock(&msg->forward_struct->forward_mutex);
		count = 0;
		if (msg->ret_list != NULL)
			count = list_count(msg->ret_list);

		debug2("Got back %d", count);
		while ((count < msg->forward_struct->fwd_cnt)) {
			slurm_cond_wait(&msg->forward_struct->notify,
					&msg->forward_struct->forward_mutex);

			if (msg->ret_list != NULL) {
				count = list_count(msg->ret_list);
			}
			debug2("Got back %d", count);
		}
		debug2("Got them all");
		slurm_mutex_unlock(&msg->forward_struct->forward_mutex);
		destroy_forward_struct(msg->forward_struct);
		msg->forward_struct = NULL;
	}
	return;
}
Exemple #9
0
/*
 * start_msg_tree  - logic to begin the forward tree and
 *                   accumulate the return codes from processes getting the
 *                   the forwarded message
 *
 * IN: hl          - hostlist_t   - list of every node to send message to
 * IN: msg         - slurm_msg_t  - message to send.
 * IN: timeout     - int          - how long to wait in milliseconds.
 * RET List 	   - List containing the responses of the children
 *		     (if any) we forwarded the message to. List
 *		     containing type (ret_data_info_t).
 */
extern List start_msg_tree(hostlist_t hl, slurm_msg_t *msg, int timeout)
{
	fwd_tree_t fwd_tree;
	pthread_mutex_t tree_mutex;
	pthread_cond_t notify;
	int count = 0;
	List ret_list = NULL;
	int thr_count = 0;
	int host_count = 0;
	hostlist_t* sp_hl;
	int hl_count = 0;

	xassert(hl);
	xassert(msg);

	hostlist_uniq(hl);
	host_count = hostlist_count(hl);

	if (route_g_split_hostlist(hl, &sp_hl, &hl_count,
				   msg->forward.tree_width)) {
		error("unable to split forward hostlist");
		return NULL;
	}
	slurm_mutex_init(&tree_mutex);
	slurm_cond_init(&notify, NULL);

	ret_list = list_create(destroy_data_info);

	memset(&fwd_tree, 0, sizeof(fwd_tree));
	fwd_tree.orig_msg = msg;
	fwd_tree.ret_list = ret_list;
	fwd_tree.timeout = timeout;
	fwd_tree.notify = &notify;
	fwd_tree.p_thr_count = &thr_count;
	fwd_tree.tree_mutex = &tree_mutex;

	_start_msg_tree_internal(NULL, sp_hl, &fwd_tree, hl_count);

	xfree(sp_hl);

	slurm_mutex_lock(&tree_mutex);

	count = list_count(ret_list);
	debug2("Tree head got back %d looking for %d", count, host_count);
	while (thr_count > 0) {
		slurm_cond_wait(&notify, &tree_mutex);
		count = list_count(ret_list);
		debug2("Tree head got back %d", count);
	}
	xassert(count >= host_count);	/* Tree head did not get all responses,
					 * but no more active fwd threads!*/
	slurm_mutex_unlock(&tree_mutex);

	slurm_mutex_destroy(&tree_mutex);
	slurm_cond_destroy(&notify);

	return ret_list;
}
Exemple #10
0
/* Wait for the job to completely start before trying to suspend it. */
static void _wait_for_job_init(stepd_step_rec_t *job)
{
	slurm_mutex_lock(&job->state_mutex);
	while (1) {
		if (job->state != SLURMSTEPD_STEP_STARTING) {
			slurm_mutex_unlock(&job->state_mutex);
			break;
		}
		slurm_cond_wait(&job->state_cond, &job->state_mutex);
	}
}
Exemple #11
0
static void *_launch_one_app(void *data)
{
	static pthread_mutex_t launch_mutex = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  launch_cond  = PTHREAD_COND_INITIALIZER;
	static bool            launch_begin = false;
	static bool            launch_fini  = false;
	_launch_app_data_t *opts = (_launch_app_data_t *) data;
	opt_t *opt_local = opts->opt_local;
	srun_job_t *job  = opts->job;
	bool got_alloc   = opts->got_alloc;
	slurm_step_io_fds_t cio_fds = SLURM_STEP_IO_FDS_INITIALIZER;
	slurm_step_launch_callbacks_t step_callbacks;

	memset(&step_callbacks, 0, sizeof(step_callbacks));
	step_callbacks.step_signal = launch_g_fwd_signal;

	/*
	 * Run pre-launch once for entire pack job
	 */
	slurm_mutex_lock(&launch_mutex);
	if (!launch_begin) {
		launch_begin = true;
		slurm_mutex_unlock(&launch_mutex);

		pre_launch_srun_job(job, 0, 1, opt_local);

		slurm_mutex_lock(&launch_mutex);
		launch_fini = true;
		slurm_cond_broadcast(&launch_cond);
	} else {
		while (!launch_fini)
			slurm_cond_wait(&launch_cond, &launch_mutex);
	}
	slurm_mutex_unlock(&launch_mutex);

relaunch:
	launch_common_set_stdio_fds(job, &cio_fds, opt_local);

	if (!launch_g_step_launch(job, &cio_fds, &global_rc, &step_callbacks,
				  opt_local)) {
		if (launch_g_step_wait(job, got_alloc, opt_local) == -1)
			goto relaunch;
	}

	if (opts->step_mutex) {
		slurm_mutex_lock(opts->step_mutex);
		(*opts->step_cnt)--;
		slurm_cond_broadcast(opts->step_cond);
		slurm_mutex_unlock(opts->step_mutex);
	}
	xfree(data);
	return NULL;
}
Exemple #12
0
/* Report if node power saving is enabled */
extern bool power_save_test(void)
{
	bool rc;

	slurm_mutex_lock(&power_mutex);
	while (!power_save_config) {
		slurm_cond_wait(&power_cond, &power_mutex);
	}
	rc = power_save_enabled;
	slurm_mutex_unlock(&power_mutex);

	return rc;
}
Exemple #13
0
/* _cancel_jobs - filter then cancel jobs or job steps per request */
static int _cancel_jobs(int filter_cnt)
{
	int rc = 0;

	slurm_attr_init(&attr);
	if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED))
		error("pthread_attr_setdetachstate error %m");

	slurm_mutex_init(&num_active_threads_lock);
	slurm_cond_init(&num_active_threads_cond, NULL);

	_cancel_jobs_by_state(JOB_PENDING, filter_cnt, &rc);
	/* Wait for any cancel of pending jobs to complete before starting
	 * cancellation of running jobs so that we don't have a race condition
	 * with pending jobs getting scheduled while running jobs are also
	 * being cancelled. */
	slurm_mutex_lock( &num_active_threads_lock );
	while (num_active_threads > 0) {
		slurm_cond_wait(&num_active_threads_cond,
				&num_active_threads_lock);
	}
	slurm_mutex_unlock(&num_active_threads_lock);

	_cancel_jobs_by_state(JOB_END, filter_cnt, &rc);
	/* Wait for any spawned threads that have not finished */
	slurm_mutex_lock( &num_active_threads_lock );
	while (num_active_threads > 0) {
		slurm_cond_wait(&num_active_threads_cond,
				&num_active_threads_lock);
	}
	slurm_mutex_unlock(&num_active_threads_lock);

	slurm_attr_destroy(&attr);
	slurm_mutex_destroy(&num_active_threads_lock);
	slurm_cond_destroy(&num_active_threads_cond);

	return rc;
}
Exemple #14
0
/* Increment thread_count and don't return until its value is no larger
 *	than MAX_THREAD_COUNT,
 * RET index of free index in persist_service_conn or -1 to exit */
extern int slurm_persist_conn_wait_for_thread_loc(void)
{
	bool print_it = true;
	int i, rc = -1;

	slurm_mutex_lock(&thread_count_lock);
	while (1) {
		if (shutdown_time)
			break;

		if (thread_count < MAX_THREAD_COUNT) {
			thread_count++;
			for (i=0; i<MAX_THREAD_COUNT; i++) {
				if (persist_service_conn[i])
					continue;
				rc = i;
				break;
			}
			if (rc == -1) {
				/* thread_count and persist_thread_id
				 * out of sync */
				fatal("No free persist_thread_id");
			}
			break;
		} else {
			/* wait for state change and retry,
			 * just a delay and not an error.
			 * This can happen when the epilog completes
			 * on a bunch of nodes at the same time, which
			 * can easily happen for highly parallel jobs. */
			if (print_it) {
				static time_t last_print_time = 0;
				time_t now = time(NULL);
				if (difftime(now, last_print_time) > 2) {
					verbose("thread_count over "
						"limit (%d), waiting",
						thread_count);
					last_print_time = now;
				}
				print_it = false;
			}
			slurm_cond_wait(&thread_count_cond, &thread_count_lock);
		}
	}
	slurm_mutex_unlock(&thread_count_lock);
	return rc;
}
Exemple #15
0
/* _wr_rdlock - Issue a read lock on the specified data type
 *	Wait until there are no write locks AND
 *	no pending write locks (write_wait_lock == 0)
 *
 *	NOTE: Always favoring write locks can result in starvation for
 *	read locks. To prevent this, read locks were permitted to be satisified
 *	after 10 consecutive write locks. This prevented starvation, but
 *	deadlock has been observed with some values for the count. */
static bool _wr_rdlock(lock_datatype_t datatype, bool wait_lock)
{
	bool success = true;

	slurm_mutex_lock(&locks_mutex);
	while (1) {
#if 1
		if ((slurmctld_locks.entity[write_lock(datatype)] == 0) &&
		    (slurmctld_locks.entity[write_wait_lock(datatype)] == 0)) {
#else
		/* SEE NOTE ABOVE */
		if ((slurmctld_locks.entity[write_lock(datatype)] == 0) &&
		    ((slurmctld_locks.entity[write_wait_lock(datatype)] == 0) ||
		     (slurmctld_locks.entity[write_cnt_lock(datatype)] > 10))) {
#endif
			slurmctld_locks.entity[read_lock(datatype)]++;
			slurmctld_locks.entity[write_cnt_lock(datatype)] = 0;
			break;
		} else if (!wait_lock) {
			success = false;
			break;
		} else {	/* wait for state change and retry */
			slurm_cond_wait(&locks_cond, &locks_mutex);
			if (kill_thread)
				pthread_exit(NULL);
		}
	}
	slurm_mutex_unlock(&locks_mutex);
	return success;
}

/* _wr_rdunlock - Issue a read unlock on the specified data type */
static void _wr_rdunlock(lock_datatype_t datatype)
{
	slurm_mutex_lock(&locks_mutex);
	slurmctld_locks.entity[read_lock(datatype)]--;
	slurm_cond_broadcast(&locks_cond);
	slurm_mutex_unlock(&locks_mutex);
}
Exemple #16
0
static void *_watch_tasks(void *arg)
{
	int type = PROFILE_TASK;

#if HAVE_SYS_PRCTL_H
	if (prctl(PR_SET_NAME, "acctg", NULL, NULL, NULL) < 0) {
		error("%s: cannot set my name to %s %m", __func__, "acctg");
	}
#endif

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	/* Give chance for processes to spawn before starting
	 * the polling. This should largely eliminate the
	 * the chance of having /proc open when the tasks are
	 * spawned, which would prevent a valid checkpoint/restart
	 * with some systems */
	_task_sleep(1);
	while (_init_run_test() && !_jobacct_shutdown_test() &&
	       acct_gather_profile_test()) {
		/* Do this until shutdown is requested */
		slurm_mutex_lock(&acct_gather_profile_timer[type].notify_mutex);
		slurm_cond_wait(
			&acct_gather_profile_timer[type].notify,
			&acct_gather_profile_timer[type].notify_mutex);
		slurm_mutex_unlock(&acct_gather_profile_timer[type].
				   notify_mutex);
		slurm_mutex_lock(&g_context_lock);
		/* The initial poll is done after the last task is added */
		_poll_data(1);
		slurm_mutex_unlock(&g_context_lock);

	}
	return NULL;
}
Exemple #17
0
static void _cancel_jobid_by_state(uint32_t job_state, int filter_cnt, int *rc)
{
	job_cancel_info_t *cancel_info;
	job_info_t *job_ptr;
	pthread_t dummy;
	int err, i, j;

	if (opt.job_cnt == 0)
		return;

	for (j = 0; j < opt.job_cnt; j++) {
		if (opt.job_id[j] == 0)
			continue;
		if ((job_state == JOB_PENDING) && !opt.job_pend[j])
			continue;

		job_ptr = job_buffer_ptr->job_array;
		for (i = 0; i < job_buffer_ptr->record_count; i++, job_ptr++) {
			if (IS_JOB_FINISHED(job_ptr))
				job_ptr->job_id = 0;
			if (job_ptr->job_id == 0)
				continue;
			if ((opt.step_id[j] != SLURM_BATCH_SCRIPT) &&
			    IS_JOB_PENDING(job_ptr)) {
				/* User specified #.# for step, but the job ID
				 * may be job array leader with part of job
				 * array running with other tasks pending */
				continue;
			}

			opt.job_found[j] = false;
			if (opt.array_id[j] == NO_VAL) {
				if ((opt.job_id[j] == job_ptr->job_id) ||
				    ((opt.job_id[j] == job_ptr->array_job_id) &&
				     (opt.step_id[j] == SLURM_BATCH_SCRIPT))) {
					opt.job_found[j] = true;
				}
			} else if (opt.array_id[j] == INFINITE) {
				if (opt.job_id[j] == job_ptr->array_job_id) {
					opt.job_found[j] = true;
				}
			} else if (opt.job_id[j] != job_ptr->array_job_id) {
				continue;
			} else if (_is_task_in_job(job_ptr, opt.array_id[j])) {
				opt.job_found[j] = true;
			}

			if (!opt.job_found[j])
				continue;

			if (opt.interactive &&
			    (_confirmation(job_ptr, opt.step_id[j]) == 0)) {
				job_ptr->job_id = 0;	/* Don't check again */
				continue;
			}

			slurm_mutex_lock(&num_active_threads_lock);
			num_active_threads++;
			while (num_active_threads > MAX_THREADS) {
				slurm_cond_wait(&num_active_threads_cond,
						&num_active_threads_lock);
			}
			slurm_mutex_unlock(&num_active_threads_lock);

			cancel_info = (job_cancel_info_t *)
				      xmalloc(sizeof(job_cancel_info_t));
			cancel_info->rc      = rc;
			cancel_info->sig     = opt.signal;
			cancel_info->num_active_threads = &num_active_threads;
			cancel_info->num_active_threads_lock =
					&num_active_threads_lock;
			cancel_info->num_active_threads_cond =
					&num_active_threads_cond;
			if (opt.step_id[j] == SLURM_BATCH_SCRIPT) {
				cancel_info->job_id_str =
					_build_jobid_str(job_ptr);
				err = pthread_create(&dummy, &attr,
						     _cancel_job_id,
						     cancel_info);
				if (err)  /* Run in-line as needed */
					_cancel_job_id(cancel_info);
				job_ptr->job_id = 0;
			} else {
				cancel_info->job_id = job_ptr->job_id;
				cancel_info->step_id = opt.step_id[j];
				err = pthread_create(&dummy, &attr,
						     _cancel_step_id,
						     cancel_info);
				if (err)  /* Run in-line as needed */
					_cancel_step_id(cancel_info);
			}

			if (opt.interactive) {
				/* Print any error message for first job before
				 * starting confirmation of next job */
				slurm_mutex_lock(&num_active_threads_lock);
				while (num_active_threads > 0) {
					slurm_cond_wait(&num_active_threads_cond,
							&num_active_threads_lock);
				}
				slurm_mutex_unlock(&num_active_threads_lock);
			}
		}
	}
}
Exemple #18
0
static void _launch_app(srun_job_t *job, List srun_job_list, bool got_alloc)
{
	ListIterator opt_iter, job_iter;
	opt_t *opt_local = NULL;
	_launch_app_data_t *opts;
	int total_ntasks = 0, total_nnodes = 0, step_cnt = 0, node_offset = 0;
	pthread_mutex_t step_mutex = PTHREAD_MUTEX_INITIALIZER;
	pthread_cond_t step_cond   = PTHREAD_COND_INITIALIZER;
	srun_job_t *first_job = NULL;
	char *launch_type, *pack_node_list = NULL;
	bool need_mpir = false;
	uint16_t *tmp_task_cnt = NULL, *pack_task_cnts = NULL;
	uint32_t **tmp_tids = NULL, **pack_tids = NULL;

	launch_type = slurm_get_launch_type();
	if (launch_type && strstr(launch_type, "slurm"))
		need_mpir = true;
	xfree(launch_type);

	if (srun_job_list) {
		int pack_step_cnt = list_count(srun_job_list);
		first_job = (srun_job_t *) list_peek(srun_job_list);
		if (!opt_list) {
			if (first_job)
				fini_srun(first_job, got_alloc, &global_rc, 0);
			fatal("%s: have srun_job_list, but no opt_list",
			      __func__);
		}

		job_iter = list_iterator_create(srun_job_list);
		while ((job = (srun_job_t *) list_next(job_iter))) {
			char *node_list = NULL;
			int i, node_inx;
			total_ntasks += job->ntasks;
			total_nnodes += job->nhosts;

			xrealloc(pack_task_cnts, sizeof(uint16_t)*total_nnodes);
			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_TASKS,
						  &tmp_task_cnt);
			if (!tmp_task_cnt) {
				fatal("%s: job %u has NULL task array",
				      __func__, job->jobid);
				break;	/* To eliminate CLANG error */
			}
			memcpy(pack_task_cnts + node_offset, tmp_task_cnt,
			       sizeof(uint16_t) * job->nhosts);

			xrealloc(pack_tids, sizeof(uint32_t *) * total_nnodes);
			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_TIDS,
						  &tmp_tids);
			if (!tmp_tids) {
				fatal("%s: job %u has NULL task ID array",
				      __func__, job->jobid);
				break;	/* To eliminate CLANG error */
			}
			for (node_inx = 0; node_inx < job->nhosts; node_inx++) {
				uint32_t *node_tids;
				node_tids = xmalloc(sizeof(uint32_t) *
						    tmp_task_cnt[node_inx]);
				for (i = 0; i < tmp_task_cnt[node_inx]; i++) {
					node_tids[i] = tmp_tids[node_inx][i] +
						       job->pack_task_offset;
				}
				pack_tids[node_offset + node_inx] =
					node_tids;
			}

			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_NODE_LIST,
						  &node_list);
			if (!node_list) {
				fatal("%s: job %u has NULL hostname",
				      __func__, job->jobid);
			}
			if (pack_node_list)
				xstrfmtcat(pack_node_list, ",%s", node_list);
			else
				pack_node_list = xstrdup(node_list);
			xfree(node_list);
			node_offset += job->nhosts;
		}
		list_iterator_reset(job_iter);
		_reorder_pack_recs(&pack_node_list, &pack_task_cnts,
				   &pack_tids, total_nnodes);

		if (need_mpir)
			mpir_init(total_ntasks);

		opt_iter = list_iterator_create(opt_list);
		while ((opt_local = (opt_t *) list_next(opt_iter))) {
			job = (srun_job_t *) list_next(job_iter);
			if (!job) {
				slurm_mutex_lock(&step_mutex);
				while (step_cnt > 0)
					slurm_cond_wait(&step_cond,&step_mutex);
				slurm_mutex_unlock(&step_mutex);
				if (first_job) {
					fini_srun(first_job, got_alloc,
						  &global_rc, 0);
				}
				fatal("%s: job allocation count does not match request count (%d != %d)",
				      __func__, list_count(srun_job_list),
				      list_count(opt_list));
				break;	/* To eliminate CLANG error */
			}

			slurm_mutex_lock(&step_mutex);
			step_cnt++;
			slurm_mutex_unlock(&step_mutex);
			job->pack_node_list = xstrdup(pack_node_list);
			if ((pack_step_cnt > 1) && pack_task_cnts) {
				xassert(node_offset == job->pack_nnodes);
				job->pack_task_cnts = xmalloc(sizeof(uint16_t) *
							      job->pack_nnodes);
				memcpy(job->pack_task_cnts, pack_task_cnts,
				       sizeof(uint16_t) * job->pack_nnodes);
				job->pack_tids = xmalloc(sizeof(uint32_t *) *
							 job->pack_nnodes);
				memcpy(job->pack_tids, pack_tids,
				       sizeof(uint32_t *) * job->pack_nnodes);
			}
			opts = xmalloc(sizeof(_launch_app_data_t));
			opts->got_alloc   = got_alloc;
			opts->job         = job;
			opts->opt_local   = opt_local;
			opts->step_cond   = &step_cond;
			opts->step_cnt    = &step_cnt;
			opts->step_mutex  = &step_mutex;
			opt_local->pack_step_cnt = pack_step_cnt;

			slurm_thread_create_detached(NULL, _launch_one_app,
						     opts);
		}
		xfree(pack_node_list);
		xfree(pack_task_cnts);
		list_iterator_destroy(job_iter);
		list_iterator_destroy(opt_iter);
		slurm_mutex_lock(&step_mutex);
		while (step_cnt > 0)
			slurm_cond_wait(&step_cond, &step_mutex);
		slurm_mutex_unlock(&step_mutex);

		if (first_job)
			fini_srun(first_job, got_alloc, &global_rc, 0);
	} else {
		if (need_mpir)
			mpir_init(job->ntasks);
		opts = xmalloc(sizeof(_launch_app_data_t));
		opts->got_alloc   = got_alloc;
		opts->job         = job;
		opts->opt_local   = &opt;
		opt.pack_step_cnt = 1;
		_launch_one_app(opts);
		fini_srun(job, got_alloc, &global_rc, 0);
	}
}
int
main(int argc, char **argv)
{
	int debug_level, sig, srun_fd;
	struct sigaction sa;
	log_options_t logopt = LOG_OPTS_STDERR_ONLY;
	struct sockaddr_un ca;
	unsigned int ca_len = sizeof(ca);

	atexit(remove_listen_socket);

	/* copied from srun */
	slurm_conf_init(NULL);
	debug_level = _slurm_debug_env_val();
	logopt.stderr_level += debug_level;
	log_init(xbasename(argv[0]), logopt, 0, NULL);

	if (init_srun_argv(argc, argv)) {
		fatal("failed to initialize arguments for running srun");
	}

	if ((cr_id = cr_init()) < 0) {
		fatal("failed to initialize libcr: %s", cr_strerror(errno));
	}
	(void)cr_register_callback(cr_callback, NULL, CR_THREAD_CONTEXT);

	/* forward signals. copied from cr_restart */
	sa.sa_sigaction = signal_child;
	sa.sa_flags = SA_RESTART | SA_NODEFER | SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	for (sig = 0;  sig < _NSIG; sig ++) {
		if (sig == SIGSTOP ||
		    sig == SIGKILL ||
		    sig == SIGCHLD)
			continue;
		sigaction(sig, &sa, NULL);
	}
	sa.sa_sigaction = on_child_exit;
	sa.sa_flags = SA_RESTART | SA_SIGINFO | SA_NOCLDSTOP;
	sigaction(SIGCHLD, &sa, NULL);

	cr_enter_cs(cr_id); /* BEGIN CS: avoid race condition of whether srun is forked */
	if ( fork_exec_srun() ) {
		fatal("failed fork/exec/wait srun");
	}
	cr_leave_cs(cr_id); /* END CS */

	while (1) {
		slurm_mutex_lock(&step_launch_mutex);
		while (step_launched) {
			/* just avoid busy waiting */
			slurm_cond_wait(&step_launch_cond, &step_launch_mutex);
		}
		slurm_mutex_unlock(&step_launch_mutex);

		if (_wait_for_srun_connect() < 0)
			continue;

		cr_enter_cs(cr_id); /* BEGIN CS: checkpoint(callback) will be delayed */

		srun_fd = accept(listen_fd, (struct sockaddr*)&ca, &ca_len);
		if (srun_fd < 0) {
			/* restarted before enter CS. socket will not be restored */
			if (errno == EBADF) {
				cr_leave_cs(cr_id);
				continue;
			} else {
				fatal("failed to accept socket: %m");
			}
		}

		_read_info_from_srun(srun_fd);
		close(srun_fd);

		step_launched = 1;
		debug2("step launched");

		cr_leave_cs(cr_id); /* END CS */
	}

	return 0;
}
Exemple #20
0
/* Checkpoint processing pthread
 * Never returns, but is cancelled on plugin termiantion */
static void *_ckpt_agent_thr(void *arg)
{
	struct ckpt_req *req = (struct ckpt_req *)arg;
	int rc;
	/* Locks: write job */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
	struct job_record *job_ptr;
	struct step_record *step_ptr;
	struct check_job_info *check_ptr;

	/* only perform ckpt operation of ONE JOB */
	slurm_mutex_lock(&ckpt_agent_mutex);
	while (ckpt_agent_jobid && ckpt_agent_jobid != req->job_id) {
		slurm_cond_wait(&ckpt_agent_cond, &ckpt_agent_mutex);
	}
	ckpt_agent_jobid = req->job_id;
	ckpt_agent_count ++;
	slurm_mutex_unlock(&ckpt_agent_mutex);

	debug3("checkpoint/blcr: sending checkpoint tasks request %u to %u.%u",
	       req->op, req->job_id, req->step_id);

	rc = checkpoint_tasks(req->job_id, req->step_id, req->begin_time,
			      req->image_dir, req->wait, req->nodelist);
	if (rc != SLURM_SUCCESS) {
		error("checkpoint/blcr: error on checkpoint request %u to "
		      "%u.%u: %s", req->op, req->job_id, req->step_id,
		      slurm_strerror(rc));
	}
	if (req->op == CHECK_REQUEUE)
		_requeue_when_finished(req->job_id);

	lock_slurmctld(job_write_lock);
	job_ptr = find_job_record(req->job_id);
	if (!job_ptr) {
		error("_ckpt_agent_thr: job finished");
		goto out;
	}
	if (req->step_id == SLURM_BATCH_SCRIPT) {	/* batch job */
		check_ptr = (struct check_job_info *)job_ptr->check_job;
	} else {
		step_ptr = find_step_record(job_ptr, req->step_id);
		if (! step_ptr) {
			error("_ckpt_agent_thr: step finished");
			goto out;
		}
		check_ptr = (struct check_job_info *)step_ptr->check_job;
	}
	check_ptr->time_stamp = 0;
	check_ptr->error_code = rc;
	if (check_ptr->error_code != SLURM_SUCCESS)
		check_ptr->error_msg = xstrdup(slurm_strerror(rc));

 out:

	if (req->sig_done) {
		_send_sig(req->job_id, req->step_id, req->sig_done,
			  req->nodelist);
	}
	unlock_slurmctld(job_write_lock);

	_on_ckpt_complete(req->gid, req->uid, req->job_id, req->step_id,
			  req->image_dir, rc);

	slurm_mutex_lock(&ckpt_agent_mutex);
	ckpt_agent_count --;
	if (ckpt_agent_count == 0) {
		ckpt_agent_jobid = 0;
		slurm_cond_broadcast(&ckpt_agent_cond);
	}
	slurm_mutex_unlock(&ckpt_agent_mutex);
	_ckpt_req_free(req);
	return NULL;
}
Exemple #21
0
static void *_agent(void *x)
{
	int cnt, rc;
	Buf buffer;
	struct timespec abs_time;
	static time_t fail_time = 0;
	int sigarray[] = {SIGUSR1, 0};
	slurmdbd_msg_t list_req;
	dbd_list_msg_t list_msg;

	list_req.msg_type = DBD_SEND_MULT_MSG;
	list_req.data = &list_msg;
	memset(&list_msg, 0, sizeof(dbd_list_msg_t));
	/* DEF_TIMERS; */

	/* Prepare to catch SIGUSR1 to interrupt pending
	 * I/O and terminate in a timely fashion. */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sigarray);

	while (*slurmdbd_conn->shutdown == 0) {
		/* START_TIMER; */
		slurm_mutex_lock(&slurmdbd_lock);
		if (halt_agent)
			slurm_cond_wait(&slurmdbd_cond, &slurmdbd_lock);

		if ((slurmdbd_conn->fd < 0) &&
		    (difftime(time(NULL), fail_time) >= 10)) {
			/* The connection to Slurm DBD is not open */
			_open_slurmdbd_conn(1);
			if (slurmdbd_conn->fd < 0)
				fail_time = time(NULL);
		}

		slurm_mutex_lock(&agent_lock);
		if (agent_list && slurmdbd_conn->fd)
			cnt = list_count(agent_list);
		else
			cnt = 0;
		if ((cnt == 0) || (slurmdbd_conn->fd < 0) ||
		    (fail_time && (difftime(time(NULL), fail_time) < 10))) {
			slurm_mutex_unlock(&slurmdbd_lock);
			abs_time.tv_sec  = time(NULL) + 10;
			abs_time.tv_nsec = 0;
			slurm_cond_timedwait(&agent_cond, &agent_lock,
					     &abs_time);
			slurm_mutex_unlock(&agent_lock);
			continue;
		} else if ((cnt > 0) && ((cnt % 100) == 0))
			info("slurmdbd: agent queue size %u", cnt);
		/* Leave item on the queue until processing complete */
		if (agent_list) {
			int handle_agent_count = 1000;
			if (cnt > handle_agent_count) {
				int agent_count = 0;
				ListIterator agent_itr =
					list_iterator_create(agent_list);
				list_msg.my_list = list_create(NULL);
				while ((buffer = list_next(agent_itr))) {
					list_enqueue(list_msg.my_list, buffer);
					agent_count++;
					if (agent_count > handle_agent_count)
						break;
				}
				list_iterator_destroy(agent_itr);
				buffer = pack_slurmdbd_msg(
					&list_req, SLURM_PROTOCOL_VERSION);
			} else if (cnt > 1) {
				list_msg.my_list = agent_list;
				buffer = pack_slurmdbd_msg(
					&list_req, SLURM_PROTOCOL_VERSION);
			} else
				buffer = (Buf) list_peek(agent_list);
		} else
			buffer = NULL;
		slurm_mutex_unlock(&agent_lock);
		if (buffer == NULL) {
			slurm_mutex_unlock(&slurmdbd_lock);

			slurm_mutex_lock(&assoc_cache_mutex);
			if (slurmdbd_conn->fd >= 0 && running_cache)
				slurm_cond_signal(&assoc_cache_cond);
			slurm_mutex_unlock(&assoc_cache_mutex);

			continue;
		}

		/* NOTE: agent_lock is clear here, so we can add more
		 * requests to the queue while waiting for this RPC to
		 * complete. */
		rc = slurm_persist_send_msg(slurmdbd_conn, buffer);
		if (rc != SLURM_SUCCESS) {
			if (*slurmdbd_conn->shutdown) {
				slurm_mutex_unlock(&slurmdbd_lock);
				break;
			}
			error("slurmdbd: Failure sending message: %d: %m", rc);
		} else if (list_msg.my_list) {
			rc = _handle_mult_rc_ret();
		} else {
			rc = _get_return_code();
			if (rc == EAGAIN) {
				if (*slurmdbd_conn->shutdown) {
					slurm_mutex_unlock(&slurmdbd_lock);
					break;
				}
				error("slurmdbd: Failure with "
				      "message need to resend: %d: %m", rc);
			}
		}
		slurm_mutex_unlock(&slurmdbd_lock);
		slurm_mutex_lock(&assoc_cache_mutex);
		if (slurmdbd_conn->fd >= 0 && running_cache)
			slurm_cond_signal(&assoc_cache_cond);
		slurm_mutex_unlock(&assoc_cache_mutex);

		slurm_mutex_lock(&agent_lock);
		if (agent_list && (rc == SLURM_SUCCESS)) {
			/*
			 * If we sent a mult_msg we just need to free buffer,
			 * we don't need to requeue, just mark list_msg.my_list
			 * as NULL as that is the sign we sent a mult_msg.
			 */
			if (list_msg.my_list) {
				if (list_msg.my_list != agent_list)
					FREE_NULL_LIST(list_msg.my_list);
				list_msg.my_list = NULL;
			} else
				buffer = (Buf) list_dequeue(agent_list);

			free_buf(buffer);
			fail_time = 0;
		} else {
			/* We need to free a mult_msg even on failure */
			if (list_msg.my_list) {
				if (list_msg.my_list != agent_list)
					FREE_NULL_LIST(list_msg.my_list);
				list_msg.my_list = NULL;
				free_buf(buffer);
			}

			fail_time = time(NULL);
		}
		slurm_mutex_unlock(&agent_lock);
		/* END_TIMER; */
		/* info("at the end with %s", TIME_STR); */
	}

	slurm_mutex_lock(&agent_lock);
	_save_dbd_state();
	FREE_NULL_LIST(agent_list);
	slurm_mutex_unlock(&agent_lock);
	return NULL;
}
Exemple #22
0
/*
 * _msg_aggregation_sender()
 *
 *  Start and terminate message collection windows.
 *  Send collected msgs to next collector node or final destination
 *  at window expiration.
 */
static void * _msg_aggregation_sender(void *arg)
{
	struct timeval now;
	struct timespec timeout;
	slurm_msg_t msg;
	composite_msg_t cmp;

	msg_collection.running = 1;

	slurm_mutex_lock(&msg_collection.mutex);

	while (msg_collection.running) {
		/* Wait for a new msg to be collected */
		slurm_cond_wait(&msg_collection.cond, &msg_collection.mutex);


		if (!msg_collection.running &&
		    !list_count(msg_collection.msg_list))
			break;

		/* A msg has been collected; start new window */
		gettimeofday(&now, NULL);
		timeout.tv_sec = now.tv_sec + (msg_collection.window / 1000);
		timeout.tv_nsec = (now.tv_usec * 1000) +
			(1000000 * (msg_collection.window % 1000));
		timeout.tv_sec += timeout.tv_nsec / 1000000000;
		timeout.tv_nsec %= 1000000000;

		slurm_cond_timedwait(&msg_collection.cond,
				     &msg_collection.mutex, &timeout);

		if (!msg_collection.running &&
		    !list_count(msg_collection.msg_list))
			break;

		msg_collection.max_msgs = true;

		/* Msg collection window has expired and message collection
		 * is suspended; now build and send composite msg */
		memset(&msg, 0, sizeof(slurm_msg_t));
		memset(&cmp, 0, sizeof(composite_msg_t));

		memcpy(&cmp.sender, &msg_collection.node_addr,
		       sizeof(slurm_addr_t));
		cmp.msg_list = msg_collection.msg_list;

		msg_collection.msg_list =
			list_create(slurm_free_comp_msg_list);
		msg_collection.max_msgs = false;

		slurm_msg_t_init(&msg);
		msg.msg_type = MESSAGE_COMPOSITE;
		msg.protocol_version = SLURM_PROTOCOL_VERSION;
		msg.data = &cmp;
		if (_send_to_next_collector(&msg) != SLURM_SUCCESS) {
			error("_msg_aggregation_engine: Unable to send "
			      "composite msg: %m");
		}
		FREE_NULL_LIST(cmp.msg_list);

		/* Resume message collection */
		slurm_cond_broadcast(&msg_collection.cond);
	}

	slurm_mutex_unlock(&msg_collection.mutex);
	return NULL;
}
Exemple #23
0
extern void msg_aggr_add_msg(slurm_msg_t *msg, bool wait,
			     void (*resp_callback) (slurm_msg_t *msg))
{
	int count;
	static uint16_t msg_index = 1;
	static uint32_t wait_count = 0;

	if (!msg_collection.running)
		return;

	slurm_mutex_lock(&msg_collection.mutex);
	if (msg_collection.max_msgs == true) {
		slurm_cond_wait(&msg_collection.cond, &msg_collection.mutex);
	}

	msg->msg_index = msg_index++;

	/* Add msg to message collection */
	list_append(msg_collection.msg_list, msg);

	count = list_count(msg_collection.msg_list);


	/* First msg in collection; initiate new window */
	if (count == 1)
		slurm_cond_signal(&msg_collection.cond);

	/* Max msgs reached; terminate window */
	if (count >= msg_collection.max_msg_cnt) {
		msg_collection.max_msgs = true;
		slurm_cond_signal(&msg_collection.cond);
	}
	slurm_mutex_unlock(&msg_collection.mutex);

	if (wait) {
		msg_aggr_t *msg_aggr = xmalloc(sizeof(msg_aggr_t));
		uint16_t        msg_timeout;
		struct timeval  now;
		struct timespec timeout;

		msg_aggr->msg_index = msg->msg_index;
		msg_aggr->resp_callback = resp_callback;
		slurm_cond_init(&msg_aggr->wait_cond, NULL);

		slurm_mutex_lock(&msg_collection.aggr_mutex);
		list_append(msg_collection.msg_aggr_list, msg_aggr);

		msg_timeout = slurm_get_msg_timeout();
		gettimeofday(&now, NULL);
		timeout.tv_sec = now.tv_sec + msg_timeout;
		timeout.tv_nsec = now.tv_usec * 1000;

		wait_count++;
		if (pthread_cond_timedwait(&msg_aggr->wait_cond,
					   &msg_collection.aggr_mutex,
					   &timeout) == ETIMEDOUT)
			_handle_msg_aggr_ret(msg_aggr->msg_index, 1);
		wait_count--;
		slurm_mutex_unlock(&msg_collection.aggr_mutex);
;
		if (!msg_collection.running && !wait_count)
			slurm_mutex_destroy(&msg_collection.aggr_mutex);

		_msg_aggr_free(msg_aggr);
	}
}
Exemple #24
0
static void
_cancel_jobs_by_state(uint32_t job_state, int filter_cnt, int *rc)
{
	int i, err;
	job_cancel_info_t *cancel_info;
	job_info_t *job_ptr = job_buffer_ptr->job_array;
	pthread_t dummy;

	/* Spawn a thread to cancel each job or job step marked for
	 * cancellation */
	if (opt.job_cnt) {
		_cancel_jobid_by_state(job_state, filter_cnt, rc);
		return;
	}

	for (i = 0; i < job_buffer_ptr->record_count; i++, job_ptr++) {
		if (IS_JOB_FINISHED(job_ptr))
			job_ptr->job_id = 0;
		if (job_ptr->job_id == 0)
			continue;

		if ((job_state < JOB_END) &&
		    (job_ptr->job_state != job_state))
			continue;

		if (opt.interactive &&
		    (_confirmation(job_ptr, SLURM_BATCH_SCRIPT) == 0)) {
			job_ptr->job_id = 0;
			continue;
		}

		cancel_info = (job_cancel_info_t *)
			xmalloc(sizeof(job_cancel_info_t));
		cancel_info->job_id_str = _build_jobid_str(job_ptr);
		cancel_info->rc      = rc;
		cancel_info->sig     = opt.signal;
		cancel_info->num_active_threads = &num_active_threads;
		cancel_info->num_active_threads_lock =
			&num_active_threads_lock;
		cancel_info->num_active_threads_cond =
			&num_active_threads_cond;

		slurm_mutex_lock(&num_active_threads_lock);
		num_active_threads++;
		while (num_active_threads > MAX_THREADS) {
			slurm_cond_wait(&num_active_threads_cond,
					&num_active_threads_lock);
		}
		slurm_mutex_unlock(&num_active_threads_lock);

		err = pthread_create(&dummy, &attr, _cancel_job_id,cancel_info);
		if (err)   /* Run in-line if thread create fails */
			_cancel_job_id(cancel_info);
		job_ptr->job_id = 0;

		if (opt.interactive) {
			/* Print any error message for first job before
			 * starting confirmation of next job */
			slurm_mutex_lock(&num_active_threads_lock);
			while (num_active_threads > 0) {
				slurm_cond_wait(&num_active_threads_cond,
						&num_active_threads_lock);
			}
			slurm_mutex_unlock(&num_active_threads_lock);
		}
	}
}