示例#1
0
static void *_monitor(void *arg)
{
	stepd_step_rec_t *job = (stepd_step_rec_t *)arg;
	struct timespec ts = {0, 0};
	int rc;

	debug2("step_terminate_monitor will run for %d secs", timeout);

	ts.tv_sec = time(NULL) + 1 + timeout;

	slurm_mutex_lock(&lock);
	if (stop_flag)
		goto done;

	rc = pthread_cond_timedwait(&cond, &lock, &ts);
	if (rc == ETIMEDOUT) {
		char entity[24], time_str[24];
		time_t now = time(NULL);
		int rc;

		_call_external_program(job);

		if (job->stepid == SLURM_BATCH_SCRIPT) {
			snprintf(entity, sizeof(entity),
				 "JOB %u", job->jobid);
		} else if (job->stepid == SLURM_EXTERN_CONT) {
			snprintf(entity, sizeof(entity),
				 "EXTERN STEP FOR %u", job->jobid);
		} else {
			snprintf(entity, sizeof(entity), "STEP %u.%u",
				 job->jobid, job->stepid);
		}
		slurm_make_time_str(&now, time_str, sizeof(time_str));

		if (job->state < SLURMSTEPD_STEP_RUNNING) {
			error("*** %s STEPD TERMINATED ON %s AT %s DUE TO JOB NOT RUNNING ***",
			      entity, job->node_name, time_str);
			rc = ESLURMD_JOB_NOTRUNNING;
		} else {
			error("*** %s STEPD TERMINATED ON %s AT %s DUE TO JOB NOT ENDING WITH SIGNALS ***",
			      entity, job->node_name, time_str);
			rc = ESLURMD_KILL_TASK_FAILED;
		}

	        exit(stepd_cleanup(NULL, job, NULL, NULL, rc, 0));
	} else if (rc != 0) {
		error("Error waiting on condition in _monitor: %m");
	}
done:
	slurm_mutex_unlock(&lock);

	debug2("step_terminate_monitor is stopping");
	return NULL;
}
示例#2
0
int
main (int argc, char **argv)
{
	slurm_addr_t *cli;
	slurm_addr_t *self;
	slurm_msg_t *msg;
	stepd_step_rec_t *job;
	int ngids;
	gid_t *gids;
	int rc = 0;
	char *launch_params;

	if (_process_cmdline (argc, argv) < 0)
		fatal ("Error in slurmstepd command line");

	xsignal_block(slurmstepd_blocked_signals);
	conf = xmalloc(sizeof(*conf));
	conf->argv = &argv;
	conf->argc = &argc;
	init_setproctitle(argc, argv);
	if (slurm_select_init(1) != SLURM_SUCCESS )
		fatal( "failed to initialize node selection plugin" );
	if (slurm_auth_init(NULL) != SLURM_SUCCESS)
		fatal( "failed to initialize authentication plugin" );

	/* Receive job parameters from the slurmd */
	_init_from_slurmd(STDIN_FILENO, argv, &cli, &self, &msg,
			  &ngids, &gids);

	/* Create the stepd_step_rec_t, mostly from info in a
	 * launch_tasks_request_msg_t or a batch_job_launch_msg_t */
	if (!(job = _step_setup(cli, self, msg))) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}
	job->ngids = ngids;
	job->gids = gids;

	/* fork handlers cause mutexes on some global data structures
	 * to be re-initialized after the fork. */
	list_install_fork_handlers();
	slurm_conf_install_fork_handlers();

	/* sets job->msg_handle and job->msgid */
	if (msg_thr_create(job) == SLURM_ERROR) {
		_send_fail_to_slurmd(STDOUT_FILENO);
		rc = SLURM_FAILURE;
		goto ending;
	}

	_send_ok_to_slurmd(STDOUT_FILENO);
	_got_ack_from_slurmd(STDIN_FILENO);

	/* Fancy way of closing stdin that keeps STDIN_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDIN_FILENO);

	/* Fancy way of closing stdout that keeps STDOUT_FILENO from being
	 * allocated to any random file.  The slurmd already opened /dev/null
	 * on STDERR_FILENO for us. */
	dup2(STDERR_FILENO, STDOUT_FILENO);

	/* slurmstepd is the only daemon that should survive upgrade. If it
	 * had been swapped out before upgrade happened it could easily lead
	 * to SIGBUS at any time after upgrade. Avoid that by locking it
	 * in-memory. */
	launch_params = slurm_get_launch_params();
	if (launch_params && strstr(launch_params, "slurmstepd_memlock")) {
#ifdef _POSIX_MEMLOCK
		int flags = MCL_CURRENT;
		if (strstr(launch_params, "slurmstepd_memlock_all"))
			flags |= MCL_FUTURE;
		if (mlockall(flags) < 0)
			info("failed to mlock() slurmstepd pages: %m");
		else
			debug("slurmstepd locked in memory");
#else
		info("mlockall() system call does not appear to be available");
#endif
	}
	xfree(launch_params);

	/* This does most of the stdio setup, then launches all the tasks,
	 * and blocks until the step is complete */
	rc = job_manager(job);

	return stepd_cleanup(msg, job, cli, self, rc, 0);
ending:
	return stepd_cleanup(msg, job, cli, self, rc, 1);
}