Exemple #1
0
static void
_setup_exec_srun(spawn_req_t *req)
{
	char **env, env_key[32];
	int i, rc;
	spawn_resp_t *resp;

	debug3("mpi/pmi2: in _setup_exec_srun");

	/* setup environments */
	env = env_array_copy((const char **)job_info.job_env);
	/* TODO: unset some env-vars */

	env_array_overwrite_fmt(&env, "SLURM_JOB_ID", "%u", job_info.jobid);
	env_array_overwrite_fmt(&env, PMI2_SPAWNER_JOBID_ENV, "%s",
				job_info.pmi_jobid);
	env_array_overwrite_fmt(&env, PMI2_PMI_JOBID_ENV, "%s-%u",
				job_info.pmi_jobid, req->seq);
	env_array_overwrite_fmt(&env, PMI2_SPAWN_SEQ_ENV, "%u", req->seq);
	env_array_overwrite_fmt(&env, PMI2_SPAWNER_PORT_ENV, "%hu",
				tree_info.pmi_port);
	/* preput kvs */
	env_array_overwrite_fmt(&env, PMI2_PREPUT_CNT_ENV, "%d",
				req->preput_cnt);
	for (i = 0; i < req->preput_cnt; i ++) {
		snprintf(env_key, 32, PMI2_PPKEY_ENV"%d", i);
		env_array_overwrite_fmt(&env, env_key, "%s", req->pp_keys[i]);
		snprintf(env_key, 32, PMI2_PPVAL_ENV"%d", i);
		env_array_overwrite_fmt(&env, env_key, "%s", req->pp_vals[i]);
	}

	if (req->subcmd_cnt == 1) {
		/* no return if success */
		rc = _exec_srun_single(req, env);
	} else {
		/* no return if success */
		rc = _exec_srun_multiple(req, env);
	}

	resp = spawn_resp_new();
	resp->seq = req->seq;
	xstrfmtcat(resp->jobid, "%s-%u", job_info.pmi_jobid, req->seq);
	resp->error_cnt = 0;
	resp->rc = rc;

	/* fake a srun address */
	tree_info.srun_addr = xmalloc(sizeof(slurm_addr_t));
	slurm_set_addr(tree_info.srun_addr, tree_info.pmi_port,
		       "127.0.0.1");
	spawn_resp_send_to_srun(resp);
	spawn_resp_free(resp);
	exit(errno);
}
Exemple #2
0
/*
 *  Current process is running as the user when this is called.
 */
void
exec_task(slurmd_job_t *job, int i)
{
	uint32_t *gtids;		/* pointer to arrary of ranks */
	int fd, j;
	slurmd_task_info_t *task = job->task[i];
	char **tmp_env;

	if (i == 0)
		_make_tmpdir(job);

	gtids = xmalloc(job->node_tasks * sizeof(uint32_t));
	for (j = 0; j < job->node_tasks; j++)
		gtids[j] = job->task[j]->gtid;
	job->envtp->sgtids = _uint32_array_to_str(job->node_tasks, gtids);
	xfree(gtids);

	job->envtp->jobid = job->jobid;
	job->envtp->stepid = job->stepid;
	job->envtp->nodeid = job->nodeid;
	job->envtp->cpus_on_node = job->cpus;
	job->envtp->procid = task->gtid;
	job->envtp->localid = task->id;
	job->envtp->task_pid = getpid();
	job->envtp->distribution = job->task_dist;
	job->envtp->cpu_bind = xstrdup(job->cpu_bind);
	job->envtp->cpu_bind_type = job->cpu_bind_type;
	job->envtp->cpu_freq = job->cpu_freq;
	job->envtp->mem_bind = xstrdup(job->mem_bind);
	job->envtp->mem_bind_type = job->mem_bind_type;
	job->envtp->distribution = -1;
	job->envtp->ckpt_dir = xstrdup(job->ckpt_dir);
	job->envtp->batch_flag = job->batch;

	/* Modify copy of job's environment. Do not alter in place or
	 * concurrent searches of the environment can generate invalid memory
	 * references. */
	job->envtp->env = env_array_copy((const char **) job->env);
	setup_env(job->envtp, false);
	setenvf(&job->envtp->env, "SLURMD_NODENAME", "%s", conf->node_name);
	tmp_env = job->env;
	job->env = job->envtp->env;
	env_array_free(tmp_env);
	job->envtp->env = NULL;

	xfree(job->envtp->task_count);

	if (task->argv[0] && *task->argv[0] != '/') {
		/*
		 * Normally the client (srun) expands the command name
		 * to a fully qualified path, but in --multi-prog mode it
		 * is left up to the server to search the PATH for the
		 * executable.
		 */
		task->argv[0] = _build_path(task->argv[0], job->env);
	}

	if (!job->batch) {
		if (interconnect_attach(job->switch_job, &job->env,
				job->nodeid, (uint32_t) i, job->nnodes,
				job->ntasks, task->gtid) < 0) {
			error("Unable to attach to interconnect: %m");
			log_fini();
			exit(1);
		}

		if (_setup_mpi(job, i) != SLURM_SUCCESS) {
			error("Unable to configure MPI plugin: %m");
			log_fini();
			exit(1);
		}
	}

	/* task-specific pre-launch activities */

	if (spank_user_task (job, i) < 0) {
		error ("Failed to invoke task plugin stack");
		exit (1);
	}

	/* task plugin hook */
	if (pre_launch(job)) {
		error ("Failed task affinity setup");
		exit (1);
	}

	if (conf->task_prolog) {
		char *my_prolog;
		slurm_mutex_lock(&conf->config_mutex);
		my_prolog = xstrdup(conf->task_prolog);
		slurm_mutex_unlock(&conf->config_mutex);
		_run_script_and_set_env("slurm task_prolog",
					my_prolog, job);
		xfree(my_prolog);
	}
	if (job->task_prolog) {
		_run_script_and_set_env("user task_prolog",
					job->task_prolog, job);
	}

	if (!job->batch)
		pdebug_stop_current(job);
	if (job->env == NULL) {
		debug("job->env is NULL");
		job->env = (char **)xmalloc(sizeof(char *));
		job->env[0] = (char *)NULL;
	}

	if (job->restart_dir) {
		info("restart from %s", job->restart_dir);
		/* no return on success */
		checkpoint_restart_task(job, job->restart_dir, task->gtid);
		error("Restart task failed: %m");
		exit(errno);
	}

	if (task->argv[0] == NULL) {
		error("No executable program specified for this task");
		exit(2);
	}

	/* Do this last so you don't worry too much about the users
	   limits including the slurmstepd in with it.
	*/
	if (set_user_limits(job) < 0) {
		debug("Unable to set user limits");
		log_fini();
		exit(5);
	}

	execve(task->argv[0], task->argv, job->env);

	/*
	 * print error message and clean up if execve() returns:
	 */
	if ((errno == ENOENT) &&
	    ((fd = open(task->argv[0], O_RDONLY)) >= 0)) {
		char buf[256], *eol;
		int sz;
		sz = read(fd, buf, sizeof(buf));
		if ((sz >= 3) && (strncmp(buf, "#!", 2) == 0)) {
			eol = strchr(buf, '\n');
			if (eol)
				eol[0] = '\0';
			else
				buf[sizeof(buf)-1] = '\0';
			error("execve(): bad interpreter(%s): %m", buf+2);
			exit(errno);
		}
	}
	error("execve(): %s: %m", task->argv[0]);
	exit(errno);
}
Exemple #3
0
Fichier : task.c Projet : mej/slurm
/*
 *  Current process is running as the user when this is called.
 */
extern void exec_task(stepd_step_rec_t *job, int local_proc_id)
{
	uint32_t *gtids;		/* pointer to array of ranks */
	int fd, j;
	stepd_step_task_info_t *task = job->task[local_proc_id];
	char **tmp_env;
	int saved_errno;
	uint32_t node_offset = 0, task_offset = 0;

	if (job->node_offset != NO_VAL)
		node_offset = job->node_offset;
	if (job->pack_task_offset != NO_VAL)
		task_offset = job->pack_task_offset;

	gtids = xmalloc(job->node_tasks * sizeof(uint32_t));
	for (j = 0; j < job->node_tasks; j++)
		gtids[j] = job->task[j]->gtid + task_offset;
	job->envtp->sgtids = _uint32_array_to_str(job->node_tasks, gtids);
	xfree(gtids);

	if (job->pack_jobid != NO_VAL)
		job->envtp->jobid = job->pack_jobid;
	else
		job->envtp->jobid = job->jobid;
	job->envtp->stepid = job->stepid;
	job->envtp->nodeid = job->nodeid + node_offset;
	job->envtp->cpus_on_node = job->cpus;
	job->envtp->procid = task->gtid + task_offset;
	job->envtp->localid = task->id;
	job->envtp->task_pid = getpid();
	job->envtp->distribution = job->task_dist;
	job->envtp->cpu_bind = xstrdup(job->cpu_bind);
	job->envtp->cpu_bind_type = job->cpu_bind_type;
	job->envtp->cpu_freq_min = job->cpu_freq_min;
	job->envtp->cpu_freq_max = job->cpu_freq_max;
	job->envtp->cpu_freq_gov = job->cpu_freq_gov;
	job->envtp->mem_bind = xstrdup(job->mem_bind);
	job->envtp->mem_bind_type = job->mem_bind_type;
	job->envtp->distribution = -1;
	job->envtp->ckpt_dir = xstrdup(job->ckpt_dir);
	job->envtp->batch_flag = job->batch;
	job->envtp->uid = job->uid;
	job->envtp->user_name = xstrdup(job->user_name);

	/*
	 * Modify copy of job's environment. Do not alter in place or
	 * concurrent searches of the environment can generate invalid memory
	 * references.
	 */
	job->envtp->env = env_array_copy((const char **) job->env);
	setup_env(job->envtp, false);
	setenvf(&job->envtp->env, "SLURM_JOB_GID", "%d", job->gid);
	setenvf(&job->envtp->env, "SLURMD_NODENAME", "%s", conf->node_name);
	if (job->tres_bind) {
		setenvf(&job->envtp->env, "SLURMD_TRES_BIND", "%s",
			job->tres_bind);
	}
	if (job->tres_freq) {
		setenvf(&job->envtp->env, "SLURMD_TRES_FREQ", "%s",
			job->tres_freq);
	}
	tmp_env = job->env;
	job->env = job->envtp->env;
	env_array_free(tmp_env);
	job->envtp->env = NULL;

	xfree(job->envtp->task_count);

	if (task->argv[0] && *task->argv[0] != '/') {
		/*
		 * Normally the client (srun) expands the command name
		 * to a fully qualified path, but in --multi-prog mode it
		 * is left up to the server to search the PATH for the
		 * executable.
		 */
		task->argv[0] = _build_path(task->argv[0], job->env, NULL);
	}

	if (!job->batch && (job->stepid != SLURM_EXTERN_CONT)) {
		if (switch_g_job_attach(job->switch_job, &job->env,
					job->nodeid, (uint32_t) local_proc_id,
					job->nnodes, job->ntasks,
					task->gtid) < 0) {
			error("Unable to attach to interconnect: %m");
			log_fini();
			exit(1);
		}

		if (_setup_mpi(job, local_proc_id) != SLURM_SUCCESS) {
			error("Unable to configure MPI plugin: %m");
			log_fini();
			exit(1);
		}
	}

	/* task-specific pre-launch activities */

	/* task plugin hook */
	if (task_g_pre_launch(job)) {
		error("Failed to invoke task plugins: task_p_pre_launch error");
		exit(1);
	}
	if (!job->batch &&
	    (job->accel_bind_type || job->tres_bind || job->tres_freq)) {
		/*
		 * Modify copy of job's environment. Do not alter in place or
		 * concurrent searches of the environment can generate invalid
		 * memory references.
		 *
		 * Also sets GRES frequency as needed.
		 */
		job->envtp->env = env_array_copy((const char **) job->env);
		gres_plugin_step_set_env(&job->envtp->env, job->step_gres_list,
					 job->accel_bind_type, job->tres_bind,
					 job->tres_freq, local_proc_id);
		tmp_env = job->env;
		job->env = job->envtp->env;
		env_array_free(tmp_env);
	}

	if (spank_user_task(job, local_proc_id) < 0) {
		error("Failed to invoke spank plugin stack");
		exit(1);
	}

	if (conf->task_prolog) {
		char *my_prolog;
		slurm_mutex_lock(&conf->config_mutex);
		my_prolog = xstrdup(conf->task_prolog);
		slurm_mutex_unlock(&conf->config_mutex);
		_run_script_and_set_env("slurm task_prolog",
					my_prolog, job);
		xfree(my_prolog);
	}
	if (job->task_prolog) {
		_run_script_and_set_env("user task_prolog",
					job->task_prolog, job);
	}

	/*
	 * Set TMPDIR after running prolog scripts, since TMPDIR
	 * might be set or changed in one of the prolog scripts.
	 */
	if (local_proc_id == 0)
		_make_tmpdir(job);

	if (!job->batch)
		pdebug_stop_current(job);
	if (job->env == NULL) {
		debug("job->env is NULL");
		job->env = (char **)xmalloc(sizeof(char *));
		job->env[0] = (char *)NULL;
	}

	if (job->restart_dir) {
		info("restart from %s", job->restart_dir);
		/* no return on success */
		checkpoint_restart_task(job, job->restart_dir, task->gtid);
		error("Restart task failed: %m");
		exit(errno);
	}

	if (task->argv[0] == NULL) {
		error("No executable program specified for this task");
		exit(2);
	}

	/* Do this last so you don't worry too much about the users
	   limits including the slurmstepd in with it.
	*/
	if (set_user_limits(job) < 0) {
		debug("Unable to set user limits");
		log_fini();
		exit(5);
	}

	execve(task->argv[0], task->argv, job->env);
	saved_errno = errno;

	/*
	 * print error message and clean up if execve() returns:
	 */
	if ((errno == ENOENT) &&
	    ((fd = open(task->argv[0], O_RDONLY)) >= 0)) {
		char buf[256], *eol;
		int sz;
		sz = read(fd, buf, sizeof(buf));
		if ((sz >= 3) && (xstrncmp(buf, "#!", 2) == 0)) {
			buf[sizeof(buf)-1] = '\0';
			eol = strchr(buf, '\n');
			if (eol)
				eol[0] = '\0';
			slurm_seterrno(saved_errno);
			error("execve(): bad interpreter(%s): %m", buf+2);
			exit(errno);
		}
	}
	slurm_seterrno(saved_errno);
	error("execve(): %s: %m", task->argv[0]);
	exit(errno);
}
Exemple #4
0
static int
_setup_stepd_job_info(const stepd_step_rec_t *job, char ***env)
{
	char *p;
	int i;

	memset(&job_info, 0, sizeof(job_info));

	job_info.jobid  = job->jobid;
	job_info.stepid = job->stepid;
	job_info.nnodes = job->nnodes;
	job_info.nodeid = job->nodeid;
	job_info.ntasks = job->ntasks;
	job_info.ltasks = job->node_tasks;
	job_info.gtids = xmalloc(job->node_tasks * sizeof(uint32_t));
	for (i = 0; i < job->node_tasks; i ++) {
		job_info.gtids[i] = job->task[i]->gtid;
	}

	p = getenvp(*env, PMI2_PMI_DEBUGGED_ENV);
	if (p) {
		job_info.pmi_debugged = atoi(p);
	} else {
		job_info.pmi_debugged = 0;
	}
	p = getenvp(*env, PMI2_SPAWN_SEQ_ENV);
	if (p) { 		/* spawned */
		job_info.spawn_seq = atoi(p);
		unsetenvp(*env, PMI2_SPAWN_SEQ_ENV);
		p = getenvp(*env, PMI2_SPAWNER_JOBID_ENV);
		job_info.spawner_jobid = xstrdup(p);
		unsetenvp(*env, PMI2_SPAWNER_JOBID_ENV);
	} else {
		job_info.spawn_seq = 0;
		job_info.spawner_jobid = NULL;
	}
	p = getenvp(*env, PMI2_PMI_JOBID_ENV);
	if (p) {
		job_info.pmi_jobid = xstrdup(p);
		unsetenvp(*env, PMI2_PMI_JOBID_ENV);
	} else {
		xstrfmtcat(job_info.pmi_jobid, "%u.%u", job->jobid,
			   job->stepid);
	}
	p = getenvp(*env, PMI2_STEP_NODES_ENV);
	if (!p) {
		error("mpi/pmi2: unable to find nodes in job environment");
		return SLURM_ERROR;
	} else {
		job_info.step_nodelist = xstrdup(p);
		unsetenvp(*env, PMI2_STEP_NODES_ENV);
	}
	/*
	 * how to get the mapping info from stepd directly?
	 * there is the task distribution info in the launch_tasks_request_msg_t,
	 * but it is not stored in the stepd_step_rec_t.
	 */
	p = getenvp(*env, PMI2_PROC_MAPPING_ENV);
	if (!p) {
		error("PMI2_PROC_MAPPING_ENV not found");
		return SLURM_ERROR;
	} else {
		job_info.proc_mapping = xstrdup(p);
		unsetenvp(*env, PMI2_PROC_MAPPING_ENV);
	}

	job_info.job_env = env_array_copy((const char **)*env);

	job_info.MPIR_proctable = NULL;
	job_info.srun_opt = NULL;

	return SLURM_SUCCESS;
}
Exemple #5
0
static int
_setup_srun_job_info(const mpi_plugin_client_info_t *job)
{
	char *p;
	void *handle = NULL, *sym = NULL;

	memset(&job_info, 0, sizeof(job_info));

	job_info.jobid  = job->jobid;
	job_info.stepid = job->stepid;
	job_info.nnodes = job->step_layout->node_cnt;
	job_info.nodeid = -1;	/* id in tree. not used. */
	job_info.ntasks = job->step_layout->task_cnt;
	job_info.ltasks = 0;	/* not used */
	job_info.gtids = NULL;	/* not used */


	p = getenv(PMI2_PMI_DEBUGGED_ENV);
	if (p) {
		job_info.pmi_debugged = atoi(p);
	} else {
		job_info.pmi_debugged = 0;
	}
	p = getenv(PMI2_SPAWN_SEQ_ENV);
	if (p) { 		/* spawned */
		job_info.spawn_seq = atoi(p);
		p = getenv(PMI2_SPAWNER_JOBID_ENV);
		job_info.spawner_jobid = xstrdup(p);
		/* env unset in stepd */
	} else {
		job_info.spawn_seq = 0;
		job_info.spawner_jobid = NULL;
	}

	job_info.step_nodelist = xstrdup(job->step_layout->node_list);
	job_info.proc_mapping = _get_proc_mapping(job);
	if (job_info.proc_mapping == NULL) {
		return SLURM_ERROR;
	}
	p = getenv(PMI2_PMI_JOBID_ENV);
	if (p) {		/* spawned */
		job_info.pmi_jobid = xstrdup(p);
	} else {
		xstrfmtcat(job_info.pmi_jobid, "%u.%u", job->jobid,
			   job->stepid);
	}
	job_info.job_env = env_array_copy((const char **)environ);

	/* hjcao: this is really dirty.
	   But writing a new launcher is not desirable. */
	handle = dlopen(NULL, RTLD_LAZY);
	if (handle == NULL) {
		error("mpi/pmi2: failed to dlopen()");
		return SLURM_ERROR;
	}
	sym = dlsym(handle, "MPIR_proctable");
	if (sym == NULL) {
		/* if called directly in API, there may be no symbol available */
		verbose ("mpi/pmi2: failed to find symbol 'MPIR_proctable'");
		job_info.MPIR_proctable = NULL;
	} else {
		job_info.MPIR_proctable = *(MPIR_PROCDESC **)sym;
	}
	sym = dlsym(handle, "opt");
	if (sym == NULL) {
		verbose("mpi/pmi2: failed to find symbol 'opt'");
		job_info.srun_opt = NULL;
	} else {
		job_info.srun_opt = (opt_t *)sym;
	}
	dlclose(handle);

	return SLURM_SUCCESS;
}
Exemple #6
0
static void _setup_one_job_env(opt_t *opt_local, srun_job_t *job,
			       bool got_alloc)
{
	env_t *env = xmalloc(sizeof(env_t));
	uint16_t *tasks = NULL;

	xassert(job);

	env->localid = -1;
	env->nodeid  = -1;
	env->procid  = -1;
	env->stepid  = -1;

	if (opt_local->bcast_flag)
		_file_bcast(opt_local, job);
	if (opt_local->cpus_set)
		env->cpus_per_task = opt_local->cpus_per_task;
	if (opt_local->ntasks_per_node != NO_VAL)
		env->ntasks_per_node = opt_local->ntasks_per_node;
	if (opt_local->ntasks_per_socket != NO_VAL)
		env->ntasks_per_socket = opt_local->ntasks_per_socket;
	if (opt_local->ntasks_per_core != NO_VAL)
		env->ntasks_per_core = opt_local->ntasks_per_core;
	env->distribution = opt_local->distribution;
	if (opt_local->plane_size != NO_VAL)
		env->plane_size = opt_local->plane_size;
	env->cpu_bind_type = opt_local->cpu_bind_type;
	env->cpu_bind = opt_local->cpu_bind;

	env->cpu_freq_min = opt_local->cpu_freq_min;
	env->cpu_freq_max = opt_local->cpu_freq_max;
	env->cpu_freq_gov = opt_local->cpu_freq_gov;
	env->mem_bind_type = opt_local->mem_bind_type;
	env->mem_bind = opt_local->mem_bind;
	env->overcommit = opt_local->overcommit;
	env->slurmd_debug = opt_local->slurmd_debug;
	env->labelio = opt_local->labelio;
	env->comm_port = slurmctld_comm_addr.port;
	if (opt_local->job_name)
		env->job_name = opt_local->job_name;

	slurm_step_ctx_get(job->step_ctx, SLURM_STEP_CTX_TASKS, &tasks);

	env->select_jobinfo = job->select_jobinfo;
	if (job->pack_node_list)
		env->nodelist = job->pack_node_list;
	else
		env->nodelist = job->nodelist;
	env->partition = job->partition;
	/*
	 * If we didn't get the allocation don't overwrite the previous info.
	 */
	if (got_alloc)
		env->nhosts = job->nhosts;
	env->ntasks = job->ntasks;
	if (job->pack_ntasks != NO_VAL)
		env->ntasks = job->pack_ntasks;
	env->task_count = _uint16_array_to_str(job->nhosts, tasks);
	if (job->pack_jobid != NO_VAL)
		env->jobid = job->pack_jobid;
	else
		env->jobid = job->jobid;
	env->stepid = job->stepid;
	env->account = job->account;
	env->qos = job->qos;
	env->resv_name = job->resv_name;

	if (opt_local->pty && (set_winsize(job) < 0)) {
		error("Not using a pseudo-terminal, disregarding --pty option");
		opt_local->pty = false;
	}
	if (opt_local->pty) {
		struct termios term;
		int fd = STDIN_FILENO;

		/* Save terminal settings for restore */
		tcgetattr(fd, &termdefaults);
		tcgetattr(fd, &term);
		/* Set raw mode on local tty */
		cfmakeraw(&term);
		/* Re-enable output processing such that debug() and
		 * and error() work properly. */
		term.c_oflag |= OPOST;
		tcsetattr(fd, TCSANOW, &term);
		atexit(&_pty_restore);

		block_sigwinch();
		pty_thread_create(job);
		env->pty_port = job->pty_port;
		env->ws_col   = job->ws_col;
		env->ws_row   = job->ws_row;
	}

	env->env = env_array_copy((const char **) environ);
	setup_env(env, opt_local->preserve_env);
	job->env = env->env;
	xfree(env->task_count);
	xfree(env);
}