Example #1
static void *_step_fini(void *args)
{
	struct step_record *step_ptr = (struct step_record *)args;
	select_jobinfo_t *jobinfo = NULL;
	nhc_info_t nhc_info;

	/* Locks: Write job, write node */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK
	};
	slurmctld_lock_t job_read_lock = {
		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };


	if (!step_ptr) {
		error("_step_fini: no step ptr given, "
		      "this should never happen");
		return NULL;
	}

	memset(&nhc_info, 0, sizeof(nhc_info_t));
	nhc_info.step = 1;
	lock_slurmctld(job_read_lock);
	nhc_info.jobid = step_ptr->job_ptr->job_id;
	nhc_info.apid = SLURM_ID_HASH(step_ptr->job_ptr->job_id,
				      step_ptr->step_id);
	nhc_info.exit_code = step_ptr->exit_code;
	nhc_info.user_id = step_ptr->job_ptr->user_id;

	if (!step_ptr->step_layout || !step_ptr->step_layout->node_list) {
		if (step_ptr->job_ptr)
			nhc_info.nodelist = xstrdup(step_ptr->job_ptr->nodes);
	} else
		nhc_info.nodelist = xstrdup(step_ptr->step_layout->node_list);
	unlock_slurmctld(job_read_lock);

	/* run NHC */
	_run_nhc(&nhc_info);
	/***********/

	xfree(nhc_info.nodelist);

	lock_slurmctld(job_write_lock);
	if (!step_ptr->job_ptr || !step_ptr->step_node_bitmap) {
		error("For some reason we don't have a step_node_bitmap or "
		      "a job_ptr for %"PRIu64".  This should never happen.",
		      nhc_info.apid);
	} else {
		other_step_finish(step_ptr);

		jobinfo = step_ptr->select_jobinfo->data;
		jobinfo->cleaning = 0;

		/* free resources on the job */
		post_job_step(step_ptr);
	}
	unlock_slurmctld(job_write_lock);

	return NULL;
}
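
All of these examples revolve around SLURM_ID_HASH, which maps a (job id, step id) pair to the 64-bit Cray APID used by NHC, ALPS and the LLI status files. The snippet below is only a minimal, hypothetical stand-in illustrating the packing idea; the real macro lives in the Slurm headers and its exact bit layout may differ.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for SLURM_ID_HASH: pack the step id into the
 * upper 32 bits and the job id into the lower 32 bits so the pair maps
 * to a single 64-bit APID.  The real Slurm macro may use a different
 * layout. */
#define EXAMPLE_ID_HASH(jobid, stepid) \
	(((uint64_t)(stepid) << 32) | (uint64_t)(jobid))

int main(void)
{
	uint32_t jobid = 1234, stepid = 7;

	printf("job %" PRIu32 " step %" PRIu32 " -> apid %" PRIu64 "\n",
	       jobid, stepid, (uint64_t)EXAMPLE_ID_HASH(jobid, stepid));
	return 0;
}
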
Example #2
/*
 * task_p_pre_launch() is called prior to exec of application task.
 *	It is followed by TaskProlog program (from slurm.conf) and
 *	--task-prolog (from srun command line).
 */
extern int task_p_pre_launch (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	int rc;
	uint64_t apid;
	DEF_TIMERS;

	START_TIMER;
	apid = SLURM_ID_HASH(job->jobid, job->stepid);
	debug2("task_p_pre_launch: %u.%u, apid %"PRIu64", task %d",
	       job->jobid, job->stepid, apid, job->envtp->procid);

	/*
	 * Send the rank to the application's PMI layer via an environment
	 * variable.
	 */
	rc = env_array_overwrite_fmt(&job->env, ALPS_APP_PE_ENV,
				     "%d", job->envtp->procid);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", ALPS_APP_PE_ENV);
		return SLURM_ERROR;
	}

	/*
	 * Set the PMI_NO_FORK environment variable.
	 */
	rc = env_array_overwrite(&job->env, PMI_NO_FORK_ENV, "1");
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", PMI_NO_FORK_ENV);
		return SLURM_ERROR;
	}

	/*
	 *  Notify the task which offset to use
	 */
	rc = env_array_overwrite_fmt(&job->env, LLI_STATUS_OFFS_ENV,
				     "%d", job->envtp->localid + 1);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s",
			 LLI_STATUS_OFFS_ENV);
		return SLURM_ERROR;
	}

	/*
	 * Set the ALPS_APP_ID environment variable for use by
	 * Cray tools.
	 */
	rc = env_array_overwrite_fmt(&job->env, ALPS_APP_ID_ENV, "%"PRIu64,
				     apid);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s",
			 ALPS_APP_ID_ENV);
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
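
The checks above rely on env_array_overwrite_fmt() returning non-zero on success and 0 on failure. The helper below is a hypothetical, self-contained sketch of the same set-or-bail pattern using plain setenv() rather than Slurm's env_array API.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical helper mirroring the pattern above: set one environment
 * variable and report failure so the caller can abort the launch. */
static int set_env_or_fail(const char *name, const char *value)
{
	if (setenv(name, value, 1) != 0) {
		fprintf(stderr, "Failed to set env variable %s\n", name);
		return -1;
	}
	return 0;
}
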
Example #3
static void *_step_fini(void *args)
{
	struct step_record *step_ptr = (struct step_record *)args;
	select_jobinfo_t *jobinfo = NULL;
	uint64_t apid = 0;
	char *node_list = NULL;

	/* Locks: Write job, write node */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK
	};
	slurmctld_lock_t job_read_lock = {
		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };


	if (!step_ptr) {
		error("_step_fini: no step ptr given, "
		      "this should never happen");
		return NULL;
	}

	lock_slurmctld(job_read_lock);
	apid = SLURM_ID_HASH(step_ptr->job_ptr->job_id, step_ptr->step_id);

	if (!step_ptr->step_layout || !step_ptr->step_layout->node_list) {
		if (step_ptr->job_ptr)
			node_list = xstrdup(step_ptr->job_ptr->nodes);
	} else
		node_list = xstrdup(step_ptr->step_layout->node_list);
	unlock_slurmctld(job_read_lock);

	/* run NHC */
	_run_nhc(apid, node_list, 0);
	/***********/

	xfree(node_list);

	lock_slurmctld(job_write_lock);
	if (!step_ptr->job_ptr || !step_ptr->step_node_bitmap) {
		error("For some reason we don't have a step_node_bitmap or "
		      "a job_ptr for %"PRIu64".  This should never happen.",
		      apid);
	} else {
		other_step_finish(step_ptr);

		jobinfo = step_ptr->select_jobinfo->data;
		jobinfo->cleaning = 0;

		/* free resources on the job */
		post_job_step(step_ptr);
	}
	unlock_slurmctld(job_write_lock);

	return NULL;
}
Example #4
/*
 * Initialize an alpsc_ev_app_t
 */
static void _initialize_event(alpsc_ev_app_t *event,
			      struct job_record *job_ptr,
			      struct step_record *step_ptr,
			      alpsc_ev_app_state_e state)
{
	hostlist_t hl;
	hostlist_iterator_t hlit;
	char *node;
	int rv;

	event->apid = SLURM_ID_HASH(job_ptr->job_id, step_ptr->step_id);
	event->uid = job_ptr->user_id;
	event->app_name = xstrdup(step_ptr->name);
	event->batch_id = xmalloc(20);	// More than enough to hold max uint32
	snprintf(event->batch_id, 20, "%"PRIu32, job_ptr->job_id);
	event->state = state;
	event->nodes = NULL;
	event->num_nodes = 0;

	// Fill in nodes and num_nodes
	if (step_ptr->step_layout) {
		hl = hostlist_create(step_ptr->step_layout->node_list);
		if (hl == NULL) {
			return;
		}
		hlit = hostlist_iterator_create(hl);
		if (hlit == NULL) {
			hostlist_destroy(hl);
			return;
		}

		event->nodes = xmalloc(step_ptr->step_layout->node_cnt
				       * sizeof(int32_t));

		while ((node = hostlist_next(hlit)) != NULL) {
			rv = sscanf(node, "nid%"SCNd32,
				    &event->nodes[event->num_nodes]);
			if (rv == 1) {
				event->num_nodes++;
			} else {
				debug("%s: couldn't parse node %s, skipping",
				      __func__, node);
			}
			free(node);
		}

		hostlist_iterator_destroy(hlit);
		hostlist_destroy(hl);
	} else {
		// TODO: do we have to worry about batch scripts?
	}
	return;
}
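
_initialize_event() turns host names of the form "nidNNNNN" into numeric node IDs with sscanf(). A standalone sketch of that parse, assuming the same Cray naming convention:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	const char *node = "nid00042";	/* hypothetical Cray node name */
	int32_t nid;

	/* Same pattern as above: "nid" is fixed text, the rest is the id. */
	if (sscanf(node, "nid%"SCNd32, &nid) == 1)
		printf("parsed node id %" PRId32 "\n", nid);
	else
		printf("could not parse %s\n", node);
	return 0;
}
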
Example #5
/*
 * task_p_pre_launch_priv() is called prior to exec of application task.
 * in privileged mode, just after slurm_spank_task_init_privileged
 */
extern int task_p_pre_launch_priv (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rv, fd;

	debug("task_p_pre_launch_priv: %u.%u",
	      job->jobid, job->stepid);

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Make the file
	errno = 0;
	fd = open(llifile, O_CREAT|O_EXCL|O_WRONLY, 0644);
	if (fd == -1) {
		// Another task_p_pre_launch_priv already created it, ignore
		if (errno == EEXIST) {
			return SLURM_SUCCESS;
		}
		error("%s: creat(%s) failed: %m", __func__, llifile);
		return SLURM_ERROR;
	}

	// Resize it to job->node_tasks + 1
	rv = ftruncate(fd, job->node_tasks + 1);
	if (rv == -1) {
		error("%s: ftruncate(%s) failed: %m", __func__, llifile);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Change owner/group so app can write to it
	rv = fchown(fd, job->uid, job->gid);
	if (rv == -1) {
		error("%s: chown(%s) failed: %m", __func__, llifile);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}
	info("Created file %s", llifile);

	TEMP_FAILURE_RETRY(close(fd));
#endif
	return SLURM_SUCCESS;
}
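
Together with the _check_status_file() examples further down, this implies a very simple layout for the LLI status file: the ftruncate() above sizes it to node_tasks + 1 bytes, byte 0 is written once the PMI layer starts, and byte localid + 1 records the clean exit of local task localid. The sketch below just computes those offsets; the exact meaning of the byte values is an assumption drawn from these snippets.

#include <stdio.h>

/*
 * Implied LLI status file layout (size = node_tasks + 1 bytes):
 *   byte 0              - set once the application's PMI layer has started
 *   byte (localid + 1)  - set when local task `localid` exits cleanly
 */
int main(void)
{
	unsigned int node_tasks = 4;	/* hypothetical per-node task count */
	unsigned int localid = 2;	/* hypothetical local task id */

	printf("file size: %u bytes, task %u status at offset %u\n",
	       node_tasks + 1, localid, localid + 1);
	return 0;
}
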
Example #6
/* NOTE: This function is called after slurmstepd spawns all user tasks.
 * Since the slurmstepd was placed in the job container when the container
 * was created and all of its spawned tasks are placed into the container
 * when forked, all we need to do is remove the slurmstepd from the container
 * (once) at this time. */
int proctrack_p_add(stepd_step_rec_t *job, pid_t pid)
{
#ifdef HAVE_NATIVE_CRAY
	char fname[64];
	int fd;
#endif
	DEF_TIMERS;
	START_TIMER;

	// Attach to the job container
	if (job_attachpid(pid, job->cont_id) == (jid_t) -1) {
		error("Failed to attach pid %d to job container: %m", pid);
		return SLURM_ERROR;
	}

	_end_container_thread();

#ifdef HAVE_NATIVE_CRAY
	// Set apid for this pid
	if (job_setapid(pid, SLURM_ID_HASH(job->jobid, job->stepid)) == -1) {
		error("Failed to set pid %d apid: %m", pid);
		return SLURM_ERROR;
	}

	// Explicitly mark pid as an application (/proc/<pid>/task_is_app)
	snprintf(fname, sizeof(fname), "/proc/%d/task_is_app", pid);
	fd = open(fname, O_WRONLY);
	if (fd == -1) {
		error("Failed to open %s: %m", fname);
		return SLURM_ERROR;
	}
	if (write(fd, "1", 1) < 1) {
		error("Failed to write to %s: %m", fname);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}
	TEMP_FAILURE_RETRY(close(fd));
#endif
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);

	return SLURM_SUCCESS;
}
/*
 * If it wasn't created already, make the LLI_STATUS_FILE with the given
 * owner and group, permissions 0644, and the given size
 */
static int _make_status_file(stepd_step_rec_t *job)
{
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rv, fd;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Make the file
	errno = 0;
	fd = open(llifile, O_CREAT|O_EXCL|O_WRONLY, 0644);
	if (fd == -1) {
		// Another task_p_pre_launch_priv already created it, ignore
		if (errno == EEXIST) {
			return SLURM_SUCCESS;
		}
		CRAY_ERR("creat(%s) failed: %m", llifile);
		return SLURM_ERROR;
	}

	// Resize it
	rv = ftruncate(fd, job->node_tasks + 1);
	if (rv == -1) {
		CRAY_ERR("ftruncate(%s) failed: %m", llifile);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Change owner/group so app can write to it
	rv = fchown(fd, job->uid, job->gid);
	if (rv == -1) {
		CRAY_ERR("chown(%s) failed: %m", llifile);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}
	info("Created file %s", llifile);

	TEMP_FAILURE_RETRY(close(fd));
	return SLURM_SUCCESS;
}
/*
 * Check the status file for the exit of the given local task id
 * and terminate the job step if an improper exit is found
 */
static int _check_status_file(stepd_step_rec_t *job,
			      stepd_step_task_info_t *task)
{
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	char status;
	int rv, fd;

	debug("task_p_post_term: %u.%u, task %d",
	      job->jobid, job->stepid, job->envtp->procid);

	// We only need to special case termination with exit(0)
	// srun already handles abnormal exit conditions fine
	if (!WIFEXITED(task->estatus) || (WEXITSTATUS(task->estatus) != 0))
		return SLURM_SUCCESS;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Open the lli file.
	fd = open(llifile, O_RDONLY);
	if (fd == -1) {
		// There's a timing issue for large jobs; this file could
		// already be cleaned up by the time we get here.
		// However, this is during a normal cleanup so no big deal.
		debug("open(%s) failed: %m", llifile);
		return SLURM_SUCCESS;
	}

	// Read the first byte (indicates starting)
	rv = read(fd, &status, sizeof(status));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// If the first byte is 0, we either aren't an MPI app or
	// it didn't make it past pmi_init, in any case, return success
	if (status == 0) {
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_SUCCESS;
	}

	// Seek to the correct offset
	rv = lseek(fd, job->envtp->localid + 1, SEEK_SET);
	if (rv == -1) {
		CRAY_ERR("lseek failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Read the exiting byte
	rv = read(fd, &status, sizeof(status));
	TEMP_FAILURE_RETRY(close(fd));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		return SLURM_SUCCESS;
	}

	// Check the result
	if (status == 0) {
		if (task->killed_by_cmd) {
			// We've been killed by request. User already knows
			return SLURM_SUCCESS;
		}

		verbose("step %u.%u task %u exited without calling "
			"PMI_Finalize()",
			job->jobid, job->stepid, task->gtid);
	}
	return SLURM_SUCCESS;
}
/*
 * task_p_post_step() is called after termination of the step
 * (all the tasks)
 */
extern int task_p_post_step (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rc, cnt;
	char *err_msg = NULL, path[PATH_MAX];
	int32_t *numa_nodes;
	cpu_set_t *cpuMasks;

	if (track_status) {
		// Get the lli file name
		snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
			 SLURM_ID_HASH(job->jobid, job->stepid));

		// Unlink the file
		errno = 0;
		rc = unlink(llifile);
		if (rc == -1 && errno != ENOENT) {
			CRAY_ERR("unlink(%s) failed: %m", llifile);
		} else if (rc == 0) {
			info("Unlinked %s", llifile);
		}
	}

	/*
	 * Compact Memory
	 *
	 * Determine which NUMA nodes and CPUs an application is using.  This
	 * information is used to compact the memory.
	 *
	 * You'll find the information in the following location.
	 * For a normal job step:
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_<stepID>/
	 *
	 * For a batch job step (only on the head node and only for batch jobs):
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_batch/
	 *
	 * NUMA node: mems
	 * CPU Masks: cpus
	 */
	if (job->batch) {
		// Batch Job Step
		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%"
			      PRIu32 "/step_batch", job->uid, job->jobid);
		if (rc < 0) {
			CRAY_ERR("snprintf failed. Return code: %d", rc);
			return SLURM_ERROR;
		}
	} else {
		// Normal Job Step

		/* Only run epilogue on non-batch steps */
		_step_epilogue();

		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%"
			      PRIu32 "/step_%" PRIu32,
			      job->uid, job->jobid, job->stepid);
		if (rc < 0) {
			CRAY_ERR("snprintf failed. Return code: %d", rc);
			return SLURM_ERROR;
		}
	}

	rc = _get_numa_nodes(path, &cnt, &numa_nodes);
	if (rc < 0) {
		CRAY_ERR("get_numa_nodes failed. Return code: %d", rc);
		return SLURM_ERROR;
	}

	rc = _get_cpu_masks(cnt, numa_nodes, &cpuMasks);
	if (rc < 0) {
		CRAY_ERR("get_cpu_masks failed. Return code: %d", rc);
		xfree(numa_nodes);
		return SLURM_ERROR;
	}

	/*
	 * Compact Memory
	 * The last argument, which is a path to the cpuset directory, has to be
	 * NULL because the cpuset directory has already been cleaned up.
	 */
	rc = alpsc_compact_mem(&err_msg, cnt, numa_nodes, cpuMasks, NULL);
	_ALPSC_DEBUG("alpsc_compact_mem");

	xfree(numa_nodes);
	xfree(cpuMasks);

	if (rc != 1) {
		return SLURM_ERROR;
	}
#endif
	return SLURM_SUCCESS;
}
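
The cpuset comment in this example only names the directory; _get_numa_nodes() and _get_cpu_masks() (not shown) presumably read the "mems" and "cpus" files underneath it. The sketch below is a hypothetical reader for the raw "mems" contents; expanding range strings such as "0-1" into an int32_t array, as the real helpers must do, is omitted.

#include <stdio.h>
#include <string.h>

/* Hypothetical sketch: fetch the raw "mems" line from a cpuset directory
 * such as /dev/cpuset/slurm/uid_<uid>/job_<jobid>/step_<stepid>. */
static int read_cpuset_mems(const char *cpuset_dir, char *buf, size_t buflen)
{
	char path[4096];
	FILE *fp;

	snprintf(path, sizeof(path), "%s/mems", cpuset_dir);
	fp = fopen(path, "r");
	if (!fp)
		return -1;
	if (!fgets(buf, buflen, fp)) {
		fclose(fp);
		return -1;
	}
	buf[strcspn(buf, "\n")] = '\0';	/* strip trailing newline */
	fclose(fp);
	return 0;
}
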
Example #10
/*
 * task_p_post_step() is called after termination of the step
 * (all the tasks)
 */
extern int task_p_post_step (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rc, cnt;
	char *err_msg = NULL, path[PATH_MAX];
	int32_t *numa_nodes;
	cpu_set_t *cpuMasks;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Unlink the file
	errno = 0;
	rc = unlink(llifile);
	if (rc == -1 && errno != ENOENT) {
		error("%s: unlink(%s) failed: %m", __func__, llifile);
	} else if (rc == 0) {
		info("Unlinked %s", llifile);
	}

	/*
	 * Compact Memory
	 *
	 * Determine which NUMA nodes and CPUs an application is using.  This
	 * information is used to compact the memory.
	 *
	 * You'll find the information in the following location.
	 * For a normal job step:
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_<stepID>/
	 *
	 * For a batch job step (only on the head node and only for batch jobs):
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_batch/
	 *
	 * NUMA node: mems
	 * CPU Masks: cpus
	 */


	if ((job->stepid == NO_VAL) || (job->stepid == SLURM_BATCH_SCRIPT)) {
		// Batch Job Step
		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%"
			      PRIu32 "/step_batch", job->uid, job->jobid);
		if (rc < 0) {
			error("(%s: %d: %s) snprintf failed. Return code: %d",
			      THIS_FILE, __LINE__, __FUNCTION__, rc);
			return SLURM_ERROR;
		}
	} else {
		// Normal Job Step
		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%"
			      PRIu32 "/step_%" PRIu32,
			      job->uid, job->jobid, job->stepid);
		if (rc < 0) {
			error("(%s: %d: %s) snprintf failed. Return code: %d",
			      THIS_FILE, __LINE__, __FUNCTION__, rc);
			return SLURM_ERROR;
		}
	}

	rc = _get_numa_nodes(path, &cnt, &numa_nodes);
	if (rc < 0) {
		error("(%s: %d: %s) get_numa_nodes failed. Return code: %d",
		      THIS_FILE, __LINE__, __FUNCTION__, rc);
		return SLURM_ERROR;
	}

	rc = _get_cpu_masks(cnt, numa_nodes, &cpuMasks);
	if (rc < 0) {
		error("(%s: %d: %s) get_cpu_masks failed. Return code: %d",
		      THIS_FILE, __LINE__, __FUNCTION__, rc);
		xfree(numa_nodes);
		return SLURM_ERROR;
	}

	/*
	 * Compact Memory
	 * The last argument, which is a path to the cpuset directory, has to be
	 * NULL because the cpuset directory has already been cleaned up.
	 */
	rc = alpsc_compact_mem(&err_msg, cnt, numa_nodes, cpuMasks, NULL);

	xfree(numa_nodes);
	xfree(cpuMasks);

	if (rc != 1) {
		if (err_msg) {
			error("(%s: %d: %s) alpsc_compact_mem failed: %s",
			      THIS_FILE, __LINE__, __FUNCTION__, err_msg);
			free(err_msg);
		} else {
			error("(%s: %d: %s) alpsc_compact_mem failed:"
			      " No error message present.",
			      THIS_FILE, __LINE__, __FUNCTION__);
		}
		return SLURM_ERROR;
	}
	if (err_msg) {
		info("(%s: %d: %s) alpsc_compact_mem: %s", THIS_FILE, __LINE__,
		     __FUNCTION__, err_msg);
		free(err_msg);
	}
#endif
	return SLURM_SUCCESS;
}
Example #11
/*
 * Check the status file for the exit of the given local task id
 * and terminate the job step if an improper exit is found
 */
static int _check_status_file(stepd_step_rec_t *job)
{
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	char status;
	int rv, fd;
	stepd_step_task_info_t *task;
	char *reason;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Open the lli file.
	fd = open(llifile, O_RDONLY);
	if (fd == -1) {
		CRAY_ERR("open(%s) failed: %m", llifile);
		return SLURM_ERROR;
	}

	// Read the first byte (indicates starting)
	rv = read(fd, &status, sizeof(status));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// If the first byte is 0, we either aren't an MPI app or
	// it didn't make it past pmi_init, in any case, return success
	if (status == 0) {
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_SUCCESS;
	}

	// Seek to the correct offset
	rv = lseek(fd, job->envtp->localid + 1, SEEK_SET);
	if (rv == -1) {
		CRAY_ERR("lseek failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Read the exiting byte
	rv = read(fd, &status, sizeof(status));
	TEMP_FAILURE_RETRY(close(fd));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		return SLURM_SUCCESS;
	}

	// Check the result
	if (status == 0 && !terminated) {
		task = job->task[job->envtp->localid];
		if (task->killed_by_cmd) {
			// We've been killed by request. User already knows
			return SLURM_SUCCESS;
		} else if (task->aborted) {
			reason = "aborted";
		} else if (WIFSIGNALED(task->estatus)) {
			reason = "signaled";
		} else {
			reason = "exited";
		}

		// Cancel the job step, since we didn't find the exiting msg
		error("Terminating job step %"PRIu32".%"PRIu32
			"; task %d exit code %d %s without notification",
			job->jobid, job->stepid, task->gtid,
			WEXITSTATUS(task->estatus), reason);
		terminated = 1;
		slurm_terminate_job_step(job->jobid, job->stepid);
	}
	return SLURM_SUCCESS;
}
Example #12
/*
 * For starting apps, push to the app list. For ending apps, removes from the
 * app list. For suspend/resume apps, edits the app list. Always adds to the
 * event list.
 */
static void _update_app(struct job_record *job_ptr,
			struct step_record *step_ptr,
			alpsc_ev_app_state_e state)
{
	uint64_t apid;
	int32_t i;
	alpsc_ev_app_t app;
	int found;

	// If aeld thread isn't running, do nothing
	if (aeld_running == 0) {
		return;
	}

	// Fill in the new event
	_initialize_event(&app, job_ptr, step_ptr, state);

	pthread_mutex_lock(&aeld_mutex);

	// Add it to the event list, only if aeld is up
	if (aeld_running == 2) {
		_add_to_app_list(&event_list, &event_list_size,
				 &event_list_capacity, &app);
	}

	// Now deal with the app list
	// Maintain app list even if aeld is down, so we have it ready when
	// it comes up.
	switch(state) {
	case ALPSC_EV_START:
		// This is new, add to the app list
		_add_to_app_list(&app_list, &app_list_size,
				 &app_list_capacity, &app);
		break;
	case ALPSC_EV_END:
		// Search for the app matching this apid
		found = 0;
		apid = SLURM_ID_HASH(job_ptr->job_id, step_ptr->step_id);
		for (i = 0; i < app_list_size; i++) {
			if (app_list[i].apid == apid) {
				found = 1;

				// Free allocated info
				_free_event(&app_list[i]);

				// Copy last list entry to this spot
				if (i < app_list_size - 1) {
					memcpy(&app_list[i],
					       &app_list[app_list_size - 1],
					       sizeof(alpsc_ev_app_t));
				}

				app_list_size--;
				break;
			}
		}

		// Not found
		if (!found) {
			debug("Application %"PRIu64" not found in app list",
			      apid);
		}
		break;
	case ALPSC_EV_SUSPEND:
	case ALPSC_EV_RESUME:
		// Search for the app matching this apid
		apid = SLURM_ID_HASH(job_ptr->job_id, step_ptr->step_id);
		for (i = 0; i < app_list_size; i++) {
			if (app_list[i].apid == apid) {
				// Found it, update the state
				app_list[i].state =
					(state == ALPSC_EV_SUSPEND) ?
					ALPSC_EV_SUSPEND : ALPSC_EV_START;
				break;
			}
		}

		// Not found
		if (i >= app_list_size) {
			debug("Application %"PRIu64" not found in app list",
			      apid);
		}
		break;
	default:
		break;
	}

	pthread_mutex_unlock(&aeld_mutex);

	_free_event(&app);
	return;
}
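
The ALPSC_EV_END branch removes an app by copying the last list entry over the matched slot, an O(1) "swap-remove" that does not preserve order. A generic sketch of the same idea on a plain array:

#include <stddef.h>

/* Swap-remove: overwrite element i with the last element and shrink the
 * array by one.  Order is not preserved, which is fine for the app list
 * above since lookups scan for a matching apid anyway. */
static void swap_remove(int *arr, size_t *size, size_t i)
{
	if (i >= *size)
		return;
	arr[i] = arr[*size - 1];
	(*size)--;
}
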
Example #13
/* NOTE: This function is called after slurmstepd spawns all user tasks.
 * Since the slurmstepd was placed in the job container when the container
 * was created and all of its spawned tasks are placed into the container
 * when forked, all we need to do is remove the slurmstepd from the container
 * (once) at this time. */
int proctrack_p_add(stepd_step_rec_t *job, pid_t pid)
{
#ifdef HAVE_NATIVE_CRAY
	char fname[64];
	int fd;
#endif
	int count = 0;

	DEF_TIMERS;
	START_TIMER;

try_again:
	// Attach to the job container
	if (job_attachpid(pid, job->cont_id) == (jid_t) -1) {
		if (errno == EINVAL && (count < 1)) {
			jid_t jid;
			if (proctrack_p_has_pid(job->cont_id, pid)) {
				debug("%s: Trying to add pid (%d) again to the same container, ignoring.",
				      __func__, pid);
				return SLURM_SUCCESS;
			}

			if ((jid = job_detachpid(pid)) != (jid_t) -1) {
				error("%s: Pid %d was attached to container %"PRIu64" incorrectly.  Moving to correct (%"PRIu64").",
				      __func__, pid, jid, job->cont_id);
				count++;
				goto try_again;
			} else {
				error("%s: Couldn't detach pid %d from container: %m",
				      __func__, pid);
				return SLURM_ERROR;
			}
		} else {
			error("Failed to attach pid %d to job container: %m",
			      pid);
			return SLURM_ERROR;
		}
	}
	_end_container_thread();

#ifdef HAVE_NATIVE_CRAY
	// Set apid for this pid
	if (job_setapid(pid, SLURM_ID_HASH(job->jobid, job->stepid)) == -1) {
		error("Failed to set pid %d apid: %m", pid);
		return SLURM_ERROR;
	}

	// Explicitly mark pid as an application (/proc/<pid>/task_is_app)
	snprintf(fname, sizeof(fname), "/proc/%d/task_is_app", pid);
	fd = open(fname, O_WRONLY);
	if (fd == -1) {
		error("Failed to open %s: %m", fname);
		return SLURM_ERROR;
	}
	if (write(fd, "1", 1) < 1) {
		error("Failed to write to %s: %m", fname);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}
	TEMP_FAILURE_RETRY(close(fd));
#endif
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);

	return SLURM_SUCCESS;
}
Example #14
/*
 * Parse an MPMD file and determine count and layout of each task for use
 * with Cray systems. Builds the mpmd_set structure in the job record.
 *
 * IN/OUT job - job step details, builds mpmd_set structure
 * IN gtid - Array of global task IDs, indexed by node_id and task
 */
extern void multi_prog_parse(stepd_step_rec_t *job, uint32_t **gtid)
{
	int i, j, line_num = 0, rank_id, total_ranks = 0;
	char *line = NULL, *local_data = NULL;
	char *end_ptr = NULL, *save_ptr = NULL, *tmp_str = NULL;
	char *rank_spec = NULL, *cmd_spec = NULL, *args_spec = NULL;
	char *p = NULL;
	char **tmp_args, **tmp_cmd, *one_rank;
	uint32_t *ranks_node_id = NULL;	/* Node ID for each rank */
	uint32_t *node_id2nid = NULL;	/* Map Slurm node ID to Cray NID name */
	bool last_line_break = false, line_break = false;
	char *last_rank_spec = NULL;
	int args_len, line_len;
	hostlist_t hl;

	tmp_args = xmalloc(sizeof(char *) * job->ntasks);
	tmp_cmd = xmalloc(sizeof(char *) * job->ntasks);
	node_id2nid = xmalloc(sizeof(uint32_t) * job->nnodes);
	ranks_node_id = xmalloc(sizeof(uint32_t) * job->ntasks);
	local_data = xstrdup(job->argv[1]);
	while (1) {
		if (line_num)
			line = strtok_r(NULL, "\n", &save_ptr);
		else
			line = strtok_r(local_data, "\n", &save_ptr);
		if (!line)
			break;
		line_num++;
		line_len = strlen(line);
		if ((line_len > 0) && (line[line_len - 1] == '\\'))
			line_break = true;
		else
			line_break = false;
		if (last_line_break && last_rank_spec) {
			xstrfmtcat(tmp_str, "[%s]", last_rank_spec);
			hl = hostlist_create(tmp_str);
			xfree(tmp_str);
			if (!hl)
				goto fail;

			while ((one_rank = hostlist_pop(hl))) {
				rank_id = strtol(one_rank, &end_ptr, 10);
				if ((end_ptr[0] != '\0') || (rank_id < 0) ||
				    (rank_id >= job->ntasks)) {
					free(one_rank);
					hostlist_destroy(hl);
					goto fail;
				}
				free(one_rank);
				if (!tmp_args[rank_id]) {
					hostlist_destroy(hl);
					goto fail;
				}
				args_len = strlen(tmp_args[rank_id]);
				if (!args_len ||
				    tmp_args[rank_id][args_len - 1] != '\\') {
					hostlist_destroy(hl);
					goto fail;
				}
				tmp_args[rank_id][args_len - 1] = '\0';
				xstrcat(tmp_args[rank_id], line);
			}
			hostlist_destroy(hl);
			last_line_break = line_break;
			continue;
		}
		last_line_break = line_break;

		p = line;
		while ((*p != '\0') && isspace(*p)) /* remove leading spaces */
			p++;
		if (*p == '#')	/* only whole-line comments handled */
			continue;
		if (*p == '\0') /* blank line ignored */
			continue;

		rank_spec = p;	/* Rank specification for this line */
		while ((*p != '\0') && !isspace(*p))
			p++;
		if (*p == '\0')
			goto fail;
		*p++ = '\0';

		while ((*p != '\0') && isspace(*p)) /* remove leading spaces */
			p++;
		if (*p == '\0') /* blank line ignored */
			continue;

		cmd_spec = p;	/* command only */
		while ((*p != '\0') && !isspace(*p))
			p++;
		if (isspace(*p))
			*p++ = '\0';

		while ((*p != '\0') && isspace(*p)) /* remove leading spaces */
			p++;
		if (*p == '\0')
			args_spec = NULL;	/* no arguments */
		else
			args_spec = p;		/* arguments string */

		xstrfmtcat(tmp_str, "[%s]", rank_spec);
		hl = hostlist_create(tmp_str);
		xfree(tmp_str);
		if (!hl)
			goto fail;
		while ((one_rank = hostlist_pop(hl))) {
			rank_id = strtol(one_rank, &end_ptr, 10);
			if ((end_ptr[0] != '\0') || (rank_id < 0) ||
			    (rank_id >= job->ntasks)) {
				free(one_rank);
				hostlist_destroy(hl);
				goto fail;
			}
			free(one_rank);
			if (tmp_args[rank_id])	/* duplicate record for rank */
				xfree(tmp_args[rank_id]);
			if (tmp_cmd[rank_id])	/* duplicate record for rank */
				xfree(tmp_cmd[rank_id]);
			else
				total_ranks++;
			tmp_args[rank_id] = xstrdup(args_spec);
			tmp_cmd[rank_id] = xstrdup(cmd_spec);
		}
		hostlist_destroy(hl);
		if (line_break)
			last_rank_spec = rank_spec;
	}
	if (total_ranks != job->ntasks)
		goto fail;

	if (job->msg->complete_nodelist &&
	    ((hl = hostlist_create(job->msg->complete_nodelist)))) {
		i = 0;
		while ((one_rank = hostlist_shift(hl))) {
			if (i >= job->nnodes) {
				error("MPMD more nodes in nodelist than count "
				      "(cnt:%u nodelist:%s)", job->nnodes,
				      job->msg->complete_nodelist);
				free(one_rank);
				break;
			}
			for (j = 0; one_rank[j] && !isdigit(one_rank[j]); j++)
				;
			node_id2nid[i++] = strtol(one_rank + j, &end_ptr, 10);
			free(one_rank);
		}
		hostlist_destroy(hl);
	}

	for (i = 0; i < job->nnodes; i++) {
		if (!job->task_cnts) {
			error("MPMD job->task_cnts is NULL");
			break;
		}
		if (!job->task_cnts[i]) {
			error("MPMD job->task_cnts[%d] is NULL", i);
			break;
		}
		if (!gtid) {
			error("MPMD gtid is NULL");
			break;
		}
		if (!gtid[i]) {
			error("MPMD gtid[%d] is NULL", i);
			break;
		}
		for (j = 0; j < job->task_cnts[i]; j++) {
			if (gtid[i][j] >= job->ntasks) {
				error("MPMD gtid[%d][%d] is invalid (%u >= %u)",
				      i, j, gtid[i][j], job->ntasks);
				break;
			}
			ranks_node_id[gtid[i][j]] = i;
		}
	}

	job->mpmd_set = xmalloc(sizeof(mpmd_set_t));
	job->mpmd_set->apid      = SLURM_ID_HASH(job->jobid, job->stepid);
	job->mpmd_set->args      = xmalloc(sizeof(char *) * job->ntasks);
	job->mpmd_set->command   = xmalloc(sizeof(char *) * job->ntasks);
	job->mpmd_set->first_pe  = xmalloc(sizeof(int) * job->ntasks);
	job->mpmd_set->start_pe  = xmalloc(sizeof(int) * job->ntasks);
	job->mpmd_set->total_pe  = xmalloc(sizeof(int) * job->ntasks);
	job->mpmd_set->placement = xmalloc(sizeof(int) * job->ntasks);
	for (i = 0, j = 0; i < job->ntasks; i++) {
		job->mpmd_set->placement[i] = node_id2nid[ranks_node_id[i]];
		if (i == 0) {
			job->mpmd_set->num_cmds++;
			if (ranks_node_id[i] == job->nodeid)
				job->mpmd_set->first_pe[j] = i;
			else
				job->mpmd_set->first_pe[j] = -1;
			job->mpmd_set->args[j] = xstrdup(tmp_args[i]);
			job->mpmd_set->command[j] = xstrdup(tmp_cmd[i]);
			job->mpmd_set->start_pe[j] = i;
			job->mpmd_set->total_pe[j]++;
		} else if (!xstrcmp(tmp_cmd[i-1],  tmp_cmd[i]) &&
			   !xstrcmp(tmp_args[i-1], tmp_args[i]) &&
			   !xstrchr(tmp_args[i-1], '%')) {
			if ((ranks_node_id[i] == job->nodeid) &&
			    (job->mpmd_set->first_pe[j] == -1))
				job->mpmd_set->first_pe[j] = i;
			job->mpmd_set->total_pe[j]++;
		} else {
			j++;
			if (ranks_node_id[i] == job->nodeid)
				job->mpmd_set->first_pe[j] = i;
			else
				job->mpmd_set->first_pe[j] = -1;
			job->mpmd_set->num_cmds++;
			job->mpmd_set->args[j] = xstrdup(tmp_args[i]);
			job->mpmd_set->command[j] = xstrdup(tmp_cmd[i]);
			job->mpmd_set->start_pe[j] = i;
			job->mpmd_set->total_pe[j]++;
		}
	}
#if _DEBUG
	info("MPMD Apid:%"PRIu64"", job->mpmd_set->apid);
	info("MPMD NumPEs:%u", job->ntasks);		/* Total rank count */
	info("MPMD NumPEsHere:%u", job->node_tasks);	/* Node's rank count */
	info("MPMD NumCmds:%d", job->mpmd_set->num_cmds);
	for (i = 0; i < job->mpmd_set->num_cmds; i++) {
		info("MPMD Cmd:%s Args:%s FirstPE:%d StartPE:%d TotalPEs:%d ",
		     job->mpmd_set->command[i],  job->mpmd_set->args[i],
		     job->mpmd_set->first_pe[i], job->mpmd_set->start_pe[i],
		     job->mpmd_set->total_pe[i]);
	}
	for (i = 0; i < job->ntasks; i++) {
		info("MPMD Placement[%d]:nid%5.5d",
		     i, job->mpmd_set->placement[i]);
	}
#endif

fini:	for (i = 0; i < job->ntasks; i++) {
		xfree(tmp_args[i]);
		xfree(tmp_cmd[i]);
	}
	xfree(tmp_args);
	xfree(tmp_cmd);
	xfree(local_data);
	xfree(node_id2nid);
	xfree(ranks_node_id);
	return;

fail:	error("Invalid MPMD configuration line %d", line_num);
	goto fini;
}
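
multi_prog_parse() consumes a multi-prog (MPMD) description from job->argv[1]: one "rank-spec command arguments" entry per line, '#' for whole-line comments, and a trailing '\' to continue the argument list on the next line. The block below is a hypothetical illustration of that format, shown only to make the parsing loop easier to follow; the file actually handed to the plugin is generated by srun.

/*
 * Hypothetical MPMD description in the format parsed above:
 *
 *   # rank-spec   command       arguments
 *   0             ./master      --init
 *   1-4,7         ./worker      --chunk=%t
 *   5-6           ./io_server
 *
 * Rank specifications are wrapped as "[<spec>]" and expanded through
 * hostlist_create(), so both ranges and comma-separated lists work.
 */
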
Example #15
/*
 * task_p_post_term() is called after termination of the application task.
 *	It is preceded by --task-epilog (from srun command line)
 *	followed by TaskEpilog program (from slurm.conf).
 */
extern int task_p_post_term (stepd_step_rec_t *job,
			     stepd_step_task_info_t *task)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	char status;
	int rv, fd;

	debug("task_p_post_term: %u.%u, task %d",
	      job->jobid, job->stepid, job->envtp->procid);

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Open the lli file.
	fd = open(llifile, O_RDONLY);
	if (fd == -1) {
		error("%s: open(%s) failed: %m", __func__, llifile);
		return SLURM_ERROR;
	}

	// Read the first byte (indicates starting)
	rv = read(fd, &status, sizeof(status));
	if (rv == -1) {
		error("%s: read failed: %m", __func__);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// If the first byte is 0, we either aren't an MPI app or
	// it didn't make it past pmi_init, in any case, return success
	if (status == 0) {
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_SUCCESS;
	}

	// Seek to the correct offset (job->envtp->localid + 1)
	rv = lseek(fd, job->envtp->localid + 1, SEEK_SET);
	if (rv == -1) {
		error("%s: lseek failed: %m", __func__);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Read the exiting byte
	rv = read(fd, &status, sizeof(status));
	TEMP_FAILURE_RETRY(close(fd));
	if (rv == -1) {
		error("%s: read failed: %m", __func__);
		return SLURM_SUCCESS;
	}

	// Check the result
	if (status == 0) {
		// Cancel the job step, since we didn't find the exiting msg
		fprintf(stderr, "Terminating job step, task %d improper exit\n",
			job->envtp->procid);
		slurm_terminate_job_step(job->jobid, job->stepid);
	}

#endif
	return SLURM_SUCCESS;
}