Example #1
static void *_launch_one_app(void *data)
{
	static pthread_mutex_t launch_mutex = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t  launch_cond  = PTHREAD_COND_INITIALIZER;
	static bool            launch_begin = false;
	static bool            launch_fini  = false;
	_launch_app_data_t *opts = (_launch_app_data_t *) data;
	opt_t *opt_local = opts->opt_local;
	srun_job_t *job  = opts->job;
	bool got_alloc   = opts->got_alloc;
	slurm_step_io_fds_t cio_fds = SLURM_STEP_IO_FDS_INITIALIZER;
	slurm_step_launch_callbacks_t step_callbacks;

	memset(&step_callbacks, 0, sizeof(step_callbacks));
	step_callbacks.step_signal = launch_g_fwd_signal;

	/*
	 * Run pre-launch once for entire pack job
	 */
	slurm_mutex_lock(&launch_mutex);
	if (!launch_begin) {
		launch_begin = true;
		slurm_mutex_unlock(&launch_mutex);

		pre_launch_srun_job(job, 0, 1, opt_local);

		slurm_mutex_lock(&launch_mutex);
		launch_fini = true;
		slurm_cond_broadcast(&launch_cond);
	} else {
		while (!launch_fini)
			slurm_cond_wait(&launch_cond, &launch_mutex);
	}
	slurm_mutex_unlock(&launch_mutex);

relaunch:
	launch_common_set_stdio_fds(job, &cio_fds, opt_local);

	if (!launch_g_step_launch(job, &cio_fds, &global_rc, &step_callbacks,
				  opt_local)) {
		if (launch_g_step_wait(job, got_alloc, opt_local) == -1)
			goto relaunch;
	}

	if (opts->step_mutex) {
		slurm_mutex_lock(opts->step_mutex);
		(*opts->step_cnt)--;
		slurm_cond_broadcast(opts->step_cond);
		slurm_mutex_unlock(opts->step_mutex);
	}
	xfree(data);
	return NULL;
}
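The flag-and-broadcast gate at the top of _launch_one_app ("run pre-launch once for the entire pack job, everyone else waits") is a general pthread pattern. Below is a minimal, self-contained sketch of just that pattern; setup_begun, setup_done, run_setup_once and worker are illustrative names rather than SLURM symbols, and plain pthread_* calls stand in for the slurm_* wrappers.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t gate_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  gate_cond  = PTHREAD_COND_INITIALIZER;
static bool setup_begun = false;	/* first thread has claimed the setup */
static bool setup_done  = false;	/* setup finished, waiters may proceed */

static void run_setup_once(void)
{
	pthread_mutex_lock(&gate_mutex);
	if (!setup_begun) {
		setup_begun = true;
		pthread_mutex_unlock(&gate_mutex);

		/* one-time initialization runs without the lock held */
		puts("setup running exactly once");

		pthread_mutex_lock(&gate_mutex);
		setup_done = true;
		pthread_cond_broadcast(&gate_cond);	/* wake every waiter */
	} else {
		while (!setup_done)
			pthread_cond_wait(&gate_cond, &gate_mutex);
	}
	pthread_mutex_unlock(&gate_mutex);
}

static void *worker(void *arg)
{
	(void) arg;
	run_setup_once();
	/* per-thread launch work would continue here */
	return NULL;
}

int main(void)
{
	pthread_t tid[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&tid[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);
	return 0;
}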
Example #2
/* _wr_wrunlock - Issue a write unlock on the specified data type */
static void _wr_wrunlock(lock_datatype_t datatype)
{
	slurm_mutex_lock(&locks_mutex);
	slurmctld_locks.entity[write_lock(datatype)]--;
	slurm_cond_broadcast(&locks_cond);
	slurm_mutex_unlock(&locks_mutex);
}
Example #3
static void _shutdown_agent(void)
{
	int i;

	if (agent_tid) {
		slurmdbd_shutdown = time(NULL);
		for (i=0; i<50; i++) {	/* up to 5 secs total */
			slurm_cond_broadcast(&agent_cond);
			usleep(100000);	/* 0.1 sec per try */
			if (pthread_kill(agent_tid, SIGUSR1))
				break;

		}
		/* On rare occasions agent thread may not end quickly,
		 * perhaps due to communication problems with slurmdbd.
		 * Cancel it and join before returning or we could remove
		 * and leave the agent without valid data */
		if (pthread_kill(agent_tid, 0) == 0) {
			error("slurmdbd: agent failed to shutdown gracefully");
			error("slurmdbd: unable to save pending requests");
			pthread_cancel(agent_tid);
		}
		pthread_join(agent_tid,  NULL);
		agent_tid = 0;
	}
}
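The shutdown sequence in _shutdown_agent (set a shutdown flag, broadcast so the agent wakes from its condition wait, poll for the thread to exit, cancel as a last resort, then join) can be shown on its own. The sketch below uses illustrative names (worker, shutdown_flag, shutdown_worker); what pthread_kill(tid, 0) reports for a thread that has already exited differs between libc versions, so the sketch, like the original, finishes with an unconditional pthread_join().

#include <pthread.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t work_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  work_cond  = PTHREAD_COND_INITIALIZER;
static bool shutdown_flag = false;

static void *worker(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&work_mutex);
	while (!shutdown_flag)		/* sleep until asked to stop */
		pthread_cond_wait(&work_cond, &work_mutex);
	pthread_mutex_unlock(&work_mutex);
	return NULL;
}

static void shutdown_worker(pthread_t tid)
{
	pthread_mutex_lock(&work_mutex);
	shutdown_flag = true;
	pthread_cond_broadcast(&work_cond);	/* nudge the worker awake */
	pthread_mutex_unlock(&work_mutex);

	for (int i = 0; i < 50; i++) {		/* up to ~5 secs total */
		if (pthread_kill(tid, 0) != 0)	/* thread appears gone */
			break;
		usleep(100000);			/* 0.1 sec per try */
	}
	if (pthread_kill(tid, 0) == 0) {
		fprintf(stderr, "worker did not stop in time, cancelling\n");
		pthread_cancel(tid);		/* last resort */
	}
	pthread_join(tid, NULL);		/* always reap the thread */
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, worker, NULL);
	shutdown_worker(tid);
	return 0;
}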
Example #4
static int
cr_callback(void *unused)
{
	int rc;
	char *step_image_dir = NULL;

	rc = CR_CHECKPOINT_READY;
	if (step_launched) {
		step_image_dir = get_step_image_dir(1);
		if (step_image_dir == NULL) {
			error ("failed to get step image directory");
			rc = CR_CHECKPOINT_PERM_FAILURE;
		} else if (slurm_checkpoint_tasks(jobid,
						  stepid,
						  time(NULL), /* timestamp */
						  step_image_dir,
						  60, /* wait */
						  nodelist) != SLURM_SUCCESS) {
			error ("failed to checkpoint step tasks");
			rc = CR_CHECKPOINT_PERM_FAILURE;
		}
		xfree(step_image_dir);
	}
	rc = cr_checkpoint(rc);	/* dump */

	if (rc < 0) {
		fatal("checkpoint failed: %s", cr_strerror(errno));
	} else if (rc == 0) {
		/* continue, nothing to do */
	} else {
		/* restarted */
		if (srun_pid) { /* srun forked */
			if (step_launched) {
				step_image_dir = get_step_image_dir(0);
				if (step_image_dir == NULL) {
					fatal("failed to get step image directory");
				}
				update_env("SLURM_RESTART_DIR", step_image_dir);
				xfree(step_image_dir);
			}

			if (fork_exec_srun()) {
				fatal("failed fork/exec srun");
			}
		}

		/* XXX: step_launched => listen_fd valid */
		step_launched = 0;

		debug2("step not launched.");

		slurm_cond_broadcast(&step_launch_cond);
	}

	return 0;
}
Example #5
static int _wait_for_thread (pthread_t thread_id)
{
	int i;

	for (i=0; i<20; i++) {
		slurm_cond_broadcast(&comp_list_cond);
		usleep(1000 * i);
		if (pthread_kill(thread_id, 0))
			return SLURM_SUCCESS;
	}

	error("Could not kill jobcomp script pthread");
	return SLURM_ERROR;
}
Example #6
int slurm_jobcomp_log_record (struct job_record *record)
{
	struct jobcomp_info * job;

	debug3("Entering slurm_jobcomp_log_record");

	if (!(job = _jobcomp_info_create (record)))
		return error ("jobcomp/script: Failed to create job info!");

	slurm_mutex_lock(&comp_list_mutex);
	list_append(comp_list, job);
	slurm_cond_broadcast(&comp_list_cond);
	slurm_mutex_unlock(&comp_list_mutex);

	return SLURM_SUCCESS;
}
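slurm_jobcomp_log_record is the producer half of a producer/consumer queue: append under the mutex, broadcast, unlock. Below is a minimal sketch of both halves with plain pthreads and a trivial linked list; node, produce, consume and consumer_thread are illustrative names, and the list is LIFO purely for brevity.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int value;
	struct node *next;
};

static pthread_mutex_t q_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  q_cond  = PTHREAD_COND_INITIALIZER;
static struct node *q_head = NULL;

/* Producer: append under the lock, then wake any waiting consumer */
static void produce(int value)
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return;
	n->value = value;

	pthread_mutex_lock(&q_mutex);
	n->next = q_head;
	q_head = n;
	pthread_cond_broadcast(&q_cond);
	pthread_mutex_unlock(&q_mutex);
}

/* Consumer: sleep on the condition variable until something is queued */
static int consume(void)
{
	struct node *n;
	int value;

	pthread_mutex_lock(&q_mutex);
	while (q_head == NULL)
		pthread_cond_wait(&q_cond, &q_mutex);
	n = q_head;
	q_head = n->next;
	pthread_mutex_unlock(&q_mutex);

	value = n->value;
	free(n);
	return value;
}

static void *consumer_thread(void *arg)
{
	(void) arg;
	printf("consumed %d\n", consume());
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, consumer_thread, NULL);
	produce(42);
	pthread_join(tid, NULL);
	return 0;
}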
Example #7
/* thread_loc IN - Index in persist_service_conn[] of the spawned thread's
 * connection to free */
extern void slurm_persist_conn_free_thread_loc(int thread_loc)
{
	/* we will handle this in the fini */
	if (shutdown_time)
		return;

	slurm_mutex_lock(&thread_count_lock);
	if (thread_count > 0)
		thread_count--;
	else
		error("thread_count underflow");

	_destroy_persist_service(persist_service_conn[thread_loc]);
	persist_service_conn[thread_loc] = NULL;

	slurm_cond_broadcast(&thread_count_cond);
	slurm_mutex_unlock(&thread_count_lock);
}
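The decrement-and-broadcast in slurm_persist_conn_free_thread_loc only makes sense together with a waiter that blocks until the count reaches zero (typically a fini routine). A small sketch of both sides follows; thread_checkin, thread_checkout and wait_for_all_threads are illustrative names, not SLURM functions.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t count_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  count_cond  = PTHREAD_COND_INITIALIZER;
static int thread_count = 0;

static void thread_checkin(void)
{
	pthread_mutex_lock(&count_mutex);
	thread_count++;
	pthread_mutex_unlock(&count_mutex);
}

static void thread_checkout(void)
{
	pthread_mutex_lock(&count_mutex);
	if (thread_count > 0)
		thread_count--;
	else
		fprintf(stderr, "thread_count underflow\n");
	pthread_cond_broadcast(&count_cond);	/* a waiter may now proceed */
	pthread_mutex_unlock(&count_mutex);
}

/* Block until every thread has checked out */
static void wait_for_all_threads(void)
{
	pthread_mutex_lock(&count_mutex);
	while (thread_count > 0)
		pthread_cond_wait(&count_cond, &count_mutex);
	pthread_mutex_unlock(&count_mutex);
}

int main(void)
{
	thread_checkin();
	thread_checkout();
	wait_for_all_threads();	/* returns immediately, count is zero */
	return 0;
}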
Example #8
/* _wr_rdlock - Issue a read lock on the specified data type
 *	Wait until there are no write locks AND
 *	no pending write locks (write_wait_lock == 0)
 *
 *	NOTE: Always favoring write locks can result in starvation for
 *	read locks. To prevent this, read locks were permitted to be satisfied
 *	after 10 consecutive write locks. This prevented starvation, but
 *	deadlock has been observed with some values for the count. */
static bool _wr_rdlock(lock_datatype_t datatype, bool wait_lock)
{
	bool success = true;

	slurm_mutex_lock(&locks_mutex);
	while (1) {
#if 1
		if ((slurmctld_locks.entity[write_lock(datatype)] == 0) &&
		    (slurmctld_locks.entity[write_wait_lock(datatype)] == 0)) {
#else
		/* SEE NOTE ABOVE */
		if ((slurmctld_locks.entity[write_lock(datatype)] == 0) &&
		    ((slurmctld_locks.entity[write_wait_lock(datatype)] == 0) ||
		     (slurmctld_locks.entity[write_cnt_lock(datatype)] > 10))) {
#endif
			slurmctld_locks.entity[read_lock(datatype)]++;
			slurmctld_locks.entity[write_cnt_lock(datatype)] = 0;
			break;
		} else if (!wait_lock) {
			success = false;
			break;
		} else {	/* wait for state change and retry */
			slurm_cond_wait(&locks_cond, &locks_mutex);
			if (kill_thread)
				pthread_exit(NULL);
		}
	}
	slurm_mutex_unlock(&locks_mutex);
	return success;
}

/* _wr_rdunlock - Issue a read unlock on the specified data type */
static void _wr_rdunlock(lock_datatype_t datatype)
{
	slurm_mutex_lock(&locks_mutex);
	slurmctld_locks.entity[read_lock(datatype)]--;
	slurm_cond_broadcast(&locks_cond);
	slurm_mutex_unlock(&locks_mutex);
}
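The two functions above are the read half of a reader/writer lock built from counters, one mutex and one condition variable, with writers favored: readers are admitted only when no writer holds or waits for the lock, which is exactly the trade-off the starvation note in the header comment discusses. A condensed sketch of the same idea follows; struct rw_pref_lock and the rw_* functions are illustrative, not the slurmctld lock table.

#include <pthread.h>

struct rw_pref_lock {
	pthread_mutex_t mutex;
	pthread_cond_t  cond;
	int readers;		/* active readers */
	int writer;		/* 1 if a writer holds the lock */
	int write_waiters;	/* writers queued behind current holders */
};

#define RW_PREF_LOCK_INIT \
	{ PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0, 0 }

static void rw_rdlock(struct rw_pref_lock *l)
{
	pthread_mutex_lock(&l->mutex);
	while (l->writer || l->write_waiters)	/* writers are favored */
		pthread_cond_wait(&l->cond, &l->mutex);
	l->readers++;
	pthread_mutex_unlock(&l->mutex);
}

static void rw_rdunlock(struct rw_pref_lock *l)
{
	pthread_mutex_lock(&l->mutex);
	l->readers--;
	pthread_cond_broadcast(&l->cond);	/* a waiting writer may run */
	pthread_mutex_unlock(&l->mutex);
}

static void rw_wrlock(struct rw_pref_lock *l)
{
	pthread_mutex_lock(&l->mutex);
	l->write_waiters++;
	while (l->writer || l->readers)
		pthread_cond_wait(&l->cond, &l->mutex);
	l->write_waiters--;
	l->writer = 1;
	pthread_mutex_unlock(&l->mutex);
}

static void rw_wrunlock(struct rw_pref_lock *l)
{
	pthread_mutex_lock(&l->mutex);
	l->writer = 0;
	pthread_cond_broadcast(&l->cond);	/* wake readers and writers */
	pthread_mutex_unlock(&l->mutex);
}

int main(void)
{
	static struct rw_pref_lock lock = RW_PREF_LOCK_INIT;

	rw_rdlock(&lock);
	rw_rdunlock(&lock);
	rw_wrlock(&lock);
	rw_wrunlock(&lock);
	return 0;
}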
Example #9
/*
 * _msg_aggregation_sender()
 *
 *  Start and terminate message collection windows.
 *  Send collected msgs to next collector node or final destination
 *  at window expiration.
 */
static void * _msg_aggregation_sender(void *arg)
{
	struct timeval now;
	struct timespec timeout;
	slurm_msg_t msg;
	composite_msg_t cmp;

	msg_collection.running = 1;

	slurm_mutex_lock(&msg_collection.mutex);

	while (msg_collection.running) {
		/* Wait for a new msg to be collected */
		slurm_cond_wait(&msg_collection.cond, &msg_collection.mutex);


		if (!msg_collection.running &&
		    !list_count(msg_collection.msg_list))
			break;

		/* A msg has been collected; start new window */
		gettimeofday(&now, NULL);
		timeout.tv_sec = now.tv_sec + (msg_collection.window / 1000);
		timeout.tv_nsec = (now.tv_usec * 1000) +
			(1000000 * (msg_collection.window % 1000));
		timeout.tv_sec += timeout.tv_nsec / 1000000000;
		timeout.tv_nsec %= 1000000000;

		slurm_cond_timedwait(&msg_collection.cond,
				     &msg_collection.mutex, &timeout);

		if (!msg_collection.running &&
		    !list_count(msg_collection.msg_list))
			break;

		msg_collection.max_msgs = true;

		/* Msg collection window has expired and message collection
		 * is suspended; now build and send composite msg */
		memset(&msg, 0, sizeof(slurm_msg_t));
		memset(&cmp, 0, sizeof(composite_msg_t));

		memcpy(&cmp.sender, &msg_collection.node_addr,
		       sizeof(slurm_addr_t));
		cmp.msg_list = msg_collection.msg_list;

		msg_collection.msg_list =
			list_create(slurm_free_comp_msg_list);
		msg_collection.max_msgs = false;

		slurm_msg_t_init(&msg);
		msg.msg_type = MESSAGE_COMPOSITE;
		msg.protocol_version = SLURM_PROTOCOL_VERSION;
		msg.data = &cmp;
		if (_send_to_next_collector(&msg) != SLURM_SUCCESS) {
			error("_msg_aggregation_engine: Unable to send "
			      "composite msg: %m");
		}
		FREE_NULL_LIST(cmp.msg_list);

		/* Resume message collection */
		slurm_cond_broadcast(&msg_collection.cond);
	}

	slurm_mutex_unlock(&msg_collection.mutex);
	return NULL;
}
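The trickiest detail in _msg_aggregation_sender is converting the relative collection window (in milliseconds) into the absolute struct timespec that slurm_cond_timedwait(), like pthread_cond_timedwait(), expects, including the tv_nsec normalization. A standalone sketch of just that conversion follows; window_deadline is an illustrative helper, not a SLURM function, and it assumes the condition variable uses the default CLOCK_REALTIME clock.

#include <pthread.h>
#include <stdio.h>
#include <sys/time.h>
#include <time.h>

/* Turn "window_ms milliseconds from now" into an absolute deadline,
 * normalizing tv_nsec the same way the aggregation thread above does */
static struct timespec window_deadline(long window_ms)
{
	struct timeval now;
	struct timespec deadline;

	gettimeofday(&now, NULL);
	deadline.tv_sec  = now.tv_sec + (window_ms / 1000);
	deadline.tv_nsec = (now.tv_usec * 1000) +
		(1000000L * (window_ms % 1000));
	deadline.tv_sec += deadline.tv_nsec / 1000000000L;
	deadline.tv_nsec %= 1000000000L;
	return deadline;
}

int main(void)
{
	pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
	pthread_cond_t  cond  = PTHREAD_COND_INITIALIZER;
	struct timespec deadline = window_deadline(200);	/* 200 ms */
	int rc;

	pthread_mutex_lock(&mutex);
	/* nothing ever signals the condition, so this simply times out */
	rc = pthread_cond_timedwait(&cond, &mutex, &deadline);
	pthread_mutex_unlock(&mutex);

	printf("pthread_cond_timedwait returned %d (ETIMEDOUT expected)\n", rc);
	return 0;
}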
Example #10
/* kill_locked_threads - Kill all threads waiting on semaphores */
extern void kill_locked_threads(void)
{
	kill_thread = 1;
	slurm_cond_broadcast(&locks_cond);
}
Example #11
/* Send an RPC to the SlurmDBD. Do not wait for the reply. The RPC
 * will be queued and processed later if the SlurmDBD is not responding.
 * NOTE: slurm_open_slurmdbd_conn() must have been called with callbacks set
 *
 * Returns SLURM_SUCCESS or an error code */
extern int send_slurmdbd_msg(uint16_t rpc_version, slurmdbd_msg_t *req)
{
	Buf buffer;
	int cnt, rc = SLURM_SUCCESS;
	static time_t syslog_time = 0;
	static int max_agent_queue = 0;

	/*
	 * Cap the queue at the larger of MAX_AGENT_QUEUE and
	 * (max job count * 2) + (node count * 4).
	 */
	if (!max_agent_queue)
		max_agent_queue =
			MAX(MAX_AGENT_QUEUE,
			    ((slurmctld_conf.max_job_cnt * 2) +
			     (node_record_count * 4)));

	buffer = slurm_persist_msg_pack(
		slurmdbd_conn, (persist_msg_t *)req);
	if (!buffer)	/* pack error */
		return SLURM_ERROR;

	slurm_mutex_lock(&agent_lock);
	if ((agent_tid == 0) || (agent_list == NULL)) {
		_create_agent();
		if ((agent_tid == 0) || (agent_list == NULL)) {
			slurm_mutex_unlock(&agent_lock);
			free_buf(buffer);
			return SLURM_ERROR;
		}
	}
	cnt = list_count(agent_list);
	if ((cnt >= (max_agent_queue / 2)) &&
	    (difftime(time(NULL), syslog_time) > 120)) {
		/* Record critical error every 120 seconds */
		syslog_time = time(NULL);
		error("slurmdbd: agent queue filling (%d), RESTART SLURMDBD NOW",
		      cnt);
		syslog(LOG_CRIT, "*** RESTART SLURMDBD NOW ***");
		if (slurmdbd_conn->trigger_callbacks.dbd_fail)
			(slurmdbd_conn->trigger_callbacks.dbd_fail)();
	}
	if (cnt == (max_agent_queue - 1))
		cnt -= _purge_step_req();
	if (cnt == (max_agent_queue - 1))
		cnt -= _purge_job_start_req();
	if (cnt < max_agent_queue) {
		if (list_enqueue(agent_list, buffer) == NULL)
			fatal("list_enqueue: memory allocation failure");
	} else {
		error("slurmdbd: agent queue is full (%u), discarding %s:%u request",
		      cnt,
		      slurmdbd_msg_type_2_str(req->msg_type, 1),
		      req->msg_type);
		if (slurmdbd_conn->trigger_callbacks.acct_full)
			(slurmdbd_conn->trigger_callbacks.acct_full)();
		free_buf(buffer);
		rc = SLURM_ERROR;
	}

	slurm_cond_broadcast(&agent_cond);
	slurm_mutex_unlock(&agent_lock);
	return rc;
}
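send_slurmdbd_msg never blocks the caller: once the queue reaches its cap the request is discarded with an error, and the agent thread is woken with a broadcast either way. A stripped-down sketch of that non-blocking, bounded enqueue follows; QUEUE_MAX, the plain int payload and enqueue_nonblocking are illustrative simplifications of the buffer list used above.

#include <pthread.h>
#include <stdio.h>

#define QUEUE_MAX 4	/* illustrative cap; the original computes one */

static pthread_mutex_t agent_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  agent_cond  = PTHREAD_COND_INITIALIZER;
static int queue[QUEUE_MAX];
static int queue_len = 0;

/* Enqueue without blocking: drop the request if the queue is full,
 * and wake the agent thread with a broadcast in either case */
static int enqueue_nonblocking(int req)
{
	int rc = 0;

	pthread_mutex_lock(&agent_mutex);
	if (queue_len < QUEUE_MAX) {
		queue[queue_len++] = req;
	} else {
		fprintf(stderr, "queue full (%d), discarding request %d\n",
			queue_len, req);
		rc = -1;
	}
	pthread_cond_broadcast(&agent_cond);
	pthread_mutex_unlock(&agent_mutex);
	return rc;
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		enqueue_nonblocking(i);	/* the last two are discarded */
	return 0;
}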
Example #12
/* Checkpoint processing pthread
 * Never returns, but is cancelled on plugin termination */
static void *_ckpt_agent_thr(void *arg)
{
	struct ckpt_req *req = (struct ckpt_req *)arg;
	int rc;
	/* Locks: write job */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };
	struct job_record *job_ptr;
	struct step_record *step_ptr;
	struct check_job_info *check_ptr;

	/* only perform ckpt operation of ONE JOB */
	slurm_mutex_lock(&ckpt_agent_mutex);
	while (ckpt_agent_jobid && ckpt_agent_jobid != req->job_id) {
		slurm_cond_wait(&ckpt_agent_cond, &ckpt_agent_mutex);
	}
	ckpt_agent_jobid = req->job_id;
	ckpt_agent_count ++;
	slurm_mutex_unlock(&ckpt_agent_mutex);

	debug3("checkpoint/blcr: sending checkpoint tasks request %u to %u.%u",
	       req->op, req->job_id, req->step_id);

	rc = checkpoint_tasks(req->job_id, req->step_id, req->begin_time,
			      req->image_dir, req->wait, req->nodelist);
	if (rc != SLURM_SUCCESS) {
		error("checkpoint/blcr: error on checkpoint request %u to "
		      "%u.%u: %s", req->op, req->job_id, req->step_id,
		      slurm_strerror(rc));
	}
	if (req->op == CHECK_REQUEUE)
		_requeue_when_finished(req->job_id);

	lock_slurmctld(job_write_lock);
	job_ptr = find_job_record(req->job_id);
	if (!job_ptr) {
		error("_ckpt_agent_thr: job finished");
		goto out;
	}
	if (req->step_id == SLURM_BATCH_SCRIPT) {	/* batch job */
		check_ptr = (struct check_job_info *)job_ptr->check_job;
	} else {
		step_ptr = find_step_record(job_ptr, req->step_id);
		if (! step_ptr) {
			error("_ckpt_agent_thr: step finished");
			goto out;
		}
		check_ptr = (struct check_job_info *)step_ptr->check_job;
	}
	check_ptr->time_stamp = 0;
	check_ptr->error_code = rc;
	if (check_ptr->error_code != SLURM_SUCCESS)
		check_ptr->error_msg = xstrdup(slurm_strerror(rc));

 out:

	if (req->sig_done) {
		_send_sig(req->job_id, req->step_id, req->sig_done,
			  req->nodelist);
	}
	unlock_slurmctld(job_write_lock);

	_on_ckpt_complete(req->gid, req->uid, req->job_id, req->step_id,
			  req->image_dir, rc);

	slurm_mutex_lock(&ckpt_agent_mutex);
	ckpt_agent_count --;
	if (ckpt_agent_count == 0) {
		ckpt_agent_jobid = 0;
		slurm_cond_broadcast(&ckpt_agent_cond);
	}
	slurm_mutex_unlock(&ckpt_agent_mutex);
	_ckpt_req_free(req);
	return NULL;
}
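The gating at the top and bottom of _ckpt_agent_thr serializes checkpoint requests per job: a thread waits until the agent is idle or already working on its job ID, claims ownership, and the last thread for that job releases ownership and broadcasts. A minimal sketch of that ownership pattern follows; current_jobid, active_count and do_job_work are illustrative names, not SLURM symbols.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t owner_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  owner_cond  = PTHREAD_COND_INITIALIZER;
static unsigned int current_jobid = 0;	/* 0 means no owner */
static int active_count = 0;

static void do_job_work(unsigned int job_id)
{
	/* wait until the agent is idle or already working on this job */
	pthread_mutex_lock(&owner_mutex);
	while (current_jobid && current_jobid != job_id)
		pthread_cond_wait(&owner_cond, &owner_mutex);
	current_jobid = job_id;
	active_count++;
	pthread_mutex_unlock(&owner_mutex);

	printf("working on job %u\n", job_id);

	/* release ownership once the last request for this job finishes */
	pthread_mutex_lock(&owner_mutex);
	active_count--;
	if (active_count == 0) {
		current_jobid = 0;
		pthread_cond_broadcast(&owner_cond);
	}
	pthread_mutex_unlock(&owner_mutex);
}

static void *worker(void *arg)
{
	do_job_work((unsigned int)(unsigned long) arg);
	return NULL;
}

int main(void)
{
	pthread_t tid[3];

	for (unsigned long i = 0; i < 3; i++)
		pthread_create(&tid[i], NULL, worker, (void *)(i + 1));
	for (int i = 0; i < 3; i++)
		pthread_join(tid[i], NULL);
	return 0;
}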