Code example #1
File: srun.c  Project: supermanue/slurm
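/*
 * Set up the environment for each job component.  With a job list
 * (heterogeneous/pack job), walk opt_list and srun_job_list in lockstep
 * and configure every opt/job pair; otherwise configure the single
 * global job using the global options.
 */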
static void _setup_job_env(srun_job_t *job, List srun_job_list, bool got_alloc)
{
	ListIterator opt_iter, job_iter;
	opt_t *opt_local;

	if (srun_job_list) {
		srun_job_t *first_job = list_peek(srun_job_list);
		if (!opt_list) {
			if (first_job)
				fini_srun(first_job, got_alloc, &global_rc, 0);
			fatal("%s: have srun_job_list, but no opt_list",
			      __func__);
		}
		job_iter  = list_iterator_create(srun_job_list);
		opt_iter  = list_iterator_create(opt_list);
		while ((opt_local = (opt_t *) list_next(opt_iter))) {
			job = (srun_job_t *) list_next(job_iter);
			if (!job) {
				if (first_job) {
					fini_srun(first_job, got_alloc,
						  &global_rc, 0);
				}
				fatal("%s: job allocation count does not match request count (%d != %d)",
				      __func__, list_count(srun_job_list),
				      list_count(opt_list));
			}
			_setup_one_job_env(opt_local, job, got_alloc);
		}
		list_iterator_destroy(job_iter);
		list_iterator_destroy(opt_iter);
	} else if (job) {
		_setup_one_job_env(&opt, job, got_alloc);
	} else {
		fatal("%s: No job information", __func__);
	}
}
Code example #2
File: srun.c  Project: edsw/slurm
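/*
 * Main entry point for the srun command: initialize plugins and logging,
 * create the job (allocating resources if needed), export job and option
 * information into the task environment, optionally set up a
 * pseudo-terminal, then launch the job step and wait for it to finish,
 * relaunching when the launch plugin requests it.
 */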
int srun(int ac, char **av)
{
	int debug_level;
	env_t *env = xmalloc(sizeof(env_t));
	log_options_t logopt = LOG_OPTS_STDERR_ONLY;
	bool got_alloc = false;
	slurm_step_io_fds_t cio_fds = SLURM_STEP_IO_FDS_INITIALIZER;
	slurm_step_launch_callbacks_t step_callbacks;

	env->stepid = -1;
	env->procid = -1;
	env->localid = -1;
	env->nodeid = -1;
	env->cli = NULL;
	env->env = NULL;
	env->ckpt_dir = NULL;

	slurm_conf_init(NULL);
	debug_level = _slurm_debug_env_val();
	logopt.stderr_level += debug_level;
	log_init(xbasename(av[0]), logopt, 0, NULL);
	_set_exit_code();

	if (slurm_select_init(1) != SLURM_SUCCESS )
		fatal( "failed to initialize node selection plugin" );

	if (switch_init() != SLURM_SUCCESS )
		fatal("failed to initialize switch plugin");

	init_srun(ac, av, &logopt, debug_level, 1);
	create_srun_job(&job, &got_alloc, 0, 1);

	/*
	 *  Enhance environment for job
	 */
	if (opt.bcast_flag)
		_file_bcast();
	if (opt.cpus_set)
		env->cpus_per_task = opt.cpus_per_task;
	if (opt.ntasks_per_node != NO_VAL)
		env->ntasks_per_node = opt.ntasks_per_node;
	if (opt.ntasks_per_socket != NO_VAL)
		env->ntasks_per_socket = opt.ntasks_per_socket;
	if (opt.ntasks_per_core != NO_VAL)
		env->ntasks_per_core = opt.ntasks_per_core;
	env->distribution = opt.distribution;
	if (opt.plane_size != NO_VAL)
		env->plane_size = opt.plane_size;
	env->cpu_bind_type = opt.cpu_bind_type;
	env->cpu_bind = opt.cpu_bind;

	env->cpu_freq_min = opt.cpu_freq_min;
	env->cpu_freq_max = opt.cpu_freq_max;
	env->cpu_freq_gov = opt.cpu_freq_gov;
	env->mem_bind_type = opt.mem_bind_type;
	env->mem_bind = opt.mem_bind;
	env->overcommit = opt.overcommit;
	env->slurmd_debug = opt.slurmd_debug;
	env->labelio = opt.labelio;
	env->comm_port = slurmctld_comm_addr.port;
	env->batch_flag = 0;
	if (opt.job_name)
		env->job_name = opt.job_name;
	if (job) {
		uint16_t *tasks = NULL;
		slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_TASKS,
				   &tasks);

		env->select_jobinfo = job->select_jobinfo;
		env->nodelist = job->nodelist;
		env->partition = job->partition;
		/* If we didn't get the allocation don't overwrite the
		 * previous info.
		 */
		if (got_alloc)
			env->nhosts = job->nhosts;
		env->ntasks = job->ntasks;
		env->task_count = _uint16_array_to_str(job->nhosts, tasks);
		env->jobid = job->jobid;
		env->stepid = job->stepid;
		env->account = job->account;
		env->qos = job->qos;
		env->resv_name = job->resv_name;
	}
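	/*
	 * With --pty, verify a terminal is available, put the local tty
	 * into raw mode (restored at exit), and start the pty thread.
	 */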
	if (opt.pty && (set_winsize(job) < 0)) {
		error("Not using a pseudo-terminal, disregarding --pty option");
		opt.pty = false;
	}
	if (opt.pty) {
		struct termios term;
		int fd = STDIN_FILENO;

		/* Save terminal settings for restore */
		tcgetattr(fd, &termdefaults);
		tcgetattr(fd, &term);
		/* Set raw mode on local tty */
		cfmakeraw(&term);
		/* Re-enable output processing such that debug() and
		 * error() work properly. */
		term.c_oflag |= OPOST;
		tcsetattr(fd, TCSANOW, &term);
		atexit(&_pty_restore);

		block_sigwinch();
		pty_thread_create(job);
		env->pty_port = job->pty_port;
		env->ws_col   = job->ws_col;
		env->ws_row   = job->ws_row;
	}
	setup_env(env, opt.preserve_env);
	xfree(env->task_count);
	xfree(env);
	_set_node_alias();

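	/* Register the signal forwarding callback, then launch the step,
	 * retrying from "relaunch" if launch_g_step_wait() asks for it. */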
	memset(&step_callbacks, 0, sizeof(step_callbacks));
	step_callbacks.step_signal   = launch_g_fwd_signal;

	/* re_launch: */
relaunch:
	pre_launch_srun_job(job, 0, 1);

	launch_common_set_stdio_fds(job, &cio_fds);

	if (!launch_g_step_launch(job, &cio_fds, &global_rc, &step_callbacks)) {
		if (launch_g_step_wait(job, got_alloc) == -1)
			goto relaunch;
	}

	fini_srun(job, got_alloc, &global_rc, 0);

	return (int)global_rc;
}
Code example #3
File: srun.c  Project: supermanue/slurm
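/*
 * Launch the application(s).  For a heterogeneous/pack job, first
 * aggregate task counts, global task IDs and node lists across all
 * components, then start one launcher thread per component and wait for
 * all of them on a condition variable.  A single job is launched
 * directly.
 */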
static void _launch_app(srun_job_t *job, List srun_job_list, bool got_alloc)
{
	ListIterator opt_iter, job_iter;
	opt_t *opt_local = NULL;
	_launch_app_data_t *opts;
	int total_ntasks = 0, total_nnodes = 0, step_cnt = 0, node_offset = 0;
	pthread_mutex_t step_mutex = PTHREAD_MUTEX_INITIALIZER;
	pthread_cond_t step_cond   = PTHREAD_COND_INITIALIZER;
	srun_job_t *first_job = NULL;
	char *launch_type, *pack_node_list = NULL;
	bool need_mpir = false;
	uint16_t *tmp_task_cnt = NULL, *pack_task_cnts = NULL;
	uint32_t **tmp_tids = NULL, **pack_tids = NULL;

	launch_type = slurm_get_launch_type();
	if (launch_type && strstr(launch_type, "slurm"))
		need_mpir = true;
	xfree(launch_type);

	if (srun_job_list) {
		int pack_step_cnt = list_count(srun_job_list);
		first_job = (srun_job_t *) list_peek(srun_job_list);
		if (!opt_list) {
			if (first_job)
				fini_srun(first_job, got_alloc, &global_rc, 0);
			fatal("%s: have srun_job_list, but no opt_list",
			      __func__);
		}

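		/* First pass: collect each component's task counts, global
		 * task IDs and node list into the pack_* aggregates. */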
		job_iter = list_iterator_create(srun_job_list);
		while ((job = (srun_job_t *) list_next(job_iter))) {
			char *node_list = NULL;
			int i, node_inx;
			total_ntasks += job->ntasks;
			total_nnodes += job->nhosts;

			xrealloc(pack_task_cnts, sizeof(uint16_t)*total_nnodes);
			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_TASKS,
						  &tmp_task_cnt);
			if (!tmp_task_cnt) {
				fatal("%s: job %u has NULL task array",
				      __func__, job->jobid);
				break;	/* To eliminate CLANG error */
			}
			memcpy(pack_task_cnts + node_offset, tmp_task_cnt,
			       sizeof(uint16_t) * job->nhosts);

			xrealloc(pack_tids, sizeof(uint32_t *) * total_nnodes);
			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_TIDS,
						  &tmp_tids);
			if (!tmp_tids) {
				fatal("%s: job %u has NULL task ID array",
				      __func__, job->jobid);
				break;	/* To eliminate CLANG error */
			}
			for (node_inx = 0; node_inx < job->nhosts; node_inx++) {
				uint32_t *node_tids;
				node_tids = xmalloc(sizeof(uint32_t) *
						    tmp_task_cnt[node_inx]);
				for (i = 0; i < tmp_task_cnt[node_inx]; i++) {
					node_tids[i] = tmp_tids[node_inx][i] +
						       job->pack_task_offset;
				}
				pack_tids[node_offset + node_inx] =
					node_tids;
			}

			(void) slurm_step_ctx_get(job->step_ctx,
						  SLURM_STEP_CTX_NODE_LIST,
						  &node_list);
			if (!node_list) {
				fatal("%s: job %u has NULL hostname",
				      __func__, job->jobid);
			}
			if (pack_node_list)
				xstrfmtcat(pack_node_list, ",%s", node_list);
			else
				pack_node_list = xstrdup(node_list);
			xfree(node_list);
			node_offset += job->nhosts;
		}
		list_iterator_reset(job_iter);
		_reorder_pack_recs(&pack_node_list, &pack_task_cnts,
				   &pack_tids, total_nnodes);

		if (need_mpir)
			mpir_init(total_ntasks);

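		/* Second pass: hand each component, together with the
		 * aggregated pack job information, to its own detached
		 * launcher thread. */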
		opt_iter = list_iterator_create(opt_list);
		while ((opt_local = (opt_t *) list_next(opt_iter))) {
			job = (srun_job_t *) list_next(job_iter);
			if (!job) {
				slurm_mutex_lock(&step_mutex);
				while (step_cnt > 0)
					slurm_cond_wait(&step_cond,&step_mutex);
				slurm_mutex_unlock(&step_mutex);
				if (first_job) {
					fini_srun(first_job, got_alloc,
						  &global_rc, 0);
				}
				fatal("%s: job allocation count does not match request count (%d != %d)",
				      __func__, list_count(srun_job_list),
				      list_count(opt_list));
				break;	/* To eliminate CLANG error */
			}

			slurm_mutex_lock(&step_mutex);
			step_cnt++;
			slurm_mutex_unlock(&step_mutex);
			job->pack_node_list = xstrdup(pack_node_list);
			if ((pack_step_cnt > 1) && pack_task_cnts) {
				xassert(node_offset == job->pack_nnodes);
				job->pack_task_cnts = xmalloc(sizeof(uint16_t) *
							      job->pack_nnodes);
				memcpy(job->pack_task_cnts, pack_task_cnts,
				       sizeof(uint16_t) * job->pack_nnodes);
				job->pack_tids = xmalloc(sizeof(uint32_t *) *
							 job->pack_nnodes);
				memcpy(job->pack_tids, pack_tids,
				       sizeof(uint32_t *) * job->pack_nnodes);
			}
			opts = xmalloc(sizeof(_launch_app_data_t));
			opts->got_alloc   = got_alloc;
			opts->job         = job;
			opts->opt_local   = opt_local;
			opts->step_cond   = &step_cond;
			opts->step_cnt    = &step_cnt;
			opts->step_mutex  = &step_mutex;
			opt_local->pack_step_cnt = pack_step_cnt;

			slurm_thread_create_detached(NULL, _launch_one_app,
						     opts);
		}
		xfree(pack_node_list);
		xfree(pack_task_cnts);
		list_iterator_destroy(job_iter);
		list_iterator_destroy(opt_iter);
		slurm_mutex_lock(&step_mutex);
		while (step_cnt > 0)
			slurm_cond_wait(&step_cond, &step_mutex);
		slurm_mutex_unlock(&step_mutex);

		if (first_job)
			fini_srun(first_job, got_alloc, &global_rc, 0);
	} else {
		if (need_mpir)
			mpir_init(job->ntasks);
		opts = xmalloc(sizeof(_launch_app_data_t));
		opts->got_alloc   = got_alloc;
		opts->job         = job;
		opts->opt_local   = &opt;
		opt.pack_step_cnt = 1;
		_launch_one_app(opts);
		fini_srun(job, got_alloc, &global_rc, 0);
	}
}