コード例 #1
0
ファイル: scancel.c プロジェクト: adammoody/slurm
/*
 * _cancel_step_id - signal or terminate a single job step, retrying while
 *	the controller reports that the job is in a transitional state.
 * IN ci - pointer to a job_cancel_info_t describing the step. This function
 *	frees it (and its job_id_str) before returning.
 * RET always NULL; the resulting error code is folded into *(ci->rc).
 *
 * NOTE(review): the "void * in, void * out" signature and the
 * num_active_threads bookkeeping suggest this is run as one thread per
 * step -- confirm against the caller.
 */
static void *
_cancel_step_id (void *ci)
{
	int error_code = SLURM_SUCCESS, i;
	int rpc_errno = 0;	/* errno captured right after the Slurm call */
	job_cancel_info_t *cancel_info = (job_cancel_info_t *)ci;
	uint32_t job_id  = cancel_info->job_id;
	uint32_t step_id = cancel_info->step_id;
	bool sig_set = true;
	DEF_TIMERS;

	/* No explicit signal: default to SIGKILL, but remember that it was
	 * implicit so that slurm_kill_job_step() is used below. */
	if (cancel_info->sig == (uint16_t) NO_VAL) {
		cancel_info->sig = SIGKILL;
		sig_set = false;
	}

	/* Build a printable job id if one was not supplied:
	 * "<array_job>_*", "<array_job>_<task>" or plain "<job>". */
	if (!cancel_info->job_id_str) {
		if (cancel_info->array_job_id &&
		    (cancel_info->array_task_id == INFINITE)) {
			xstrfmtcat(cancel_info->job_id_str, "%u_*",
				   cancel_info->array_job_id);
		} else if (cancel_info->array_job_id) {
			xstrfmtcat(cancel_info->job_id_str, "%u_%u",
				   cancel_info->array_job_id,
				   cancel_info->array_task_id);
		} else {
			xstrfmtcat(cancel_info->job_id_str, "%u",
				   cancel_info->job_id);
		}
	}

	for (i = 0; i < MAX_CANCEL_RETRY; i++) {
		if (cancel_info->sig == SIGKILL) {
			verbose("Terminating step %s.%u",
				cancel_info->job_id_str, step_id);
		} else {
			verbose("Signal %u to step %s.%u",
				cancel_info->sig,
				cancel_info->job_id_str, step_id);
		}

		_add_delay();
		START_TIMER;
		if ((!sig_set) || opt.ctld)
			error_code = slurm_kill_job_step(job_id, step_id,
							 cancel_info->sig);
		else if (cancel_info->sig == SIGKILL)
			error_code = slurm_terminate_job_step(job_id, step_id);
		else
			error_code = slurm_signal_job_step(job_id, step_id,
							   cancel_info->sig);
		/* Save errno now: the timer macro, the mutex calls, verbose()
		 * and sleep() below are all allowed to overwrite it. */
		rpc_errno = errno;
		END_TIMER;

		/* Track the slowest response seen, under max_delay_lock. */
		slurm_mutex_lock(&max_delay_lock);
		max_resp_time = MAX(max_resp_time, DELTA_TIMER);
		slurm_mutex_unlock(&max_delay_lock);

		/* Stop on success or on any error other than the two
		 * "try again later" codes. */
		if ((error_code == 0) ||
		    ((rpc_errno != ESLURM_TRANSITION_STATE_NO_UPDATE) &&
		     (rpc_errno != ESLURM_JOB_PENDING)))
			break;
		verbose("Job is in transistional state, retrying");
		sleep(5 + i);	/* wait a little longer on each retry */
	}
	if (error_code) {
		/* Report the errno of the last attempt, not whatever errno
		 * holds after the intervening library calls. */
		error_code = rpc_errno;
		if ((opt.verbose > 0) || (error_code != ESLURM_ALREADY_DONE))
			error("Kill job error on job step id %s: %s",
			      cancel_info->job_id_str,
			      slurm_strerror(error_code));

		if ((error_code == ESLURM_ALREADY_DONE) &&
		    (cancel_info->sig == SIGKILL)) {
			error_code = 0;	/* Ignore error if job done */
		}
	}

	/* Purposely free the struct passed in here, so the caller doesn't have
	 * to keep track of it, but don't destroy the mutex and condition
	 * variables contained. */
	slurm_mutex_lock(cancel_info->num_active_threads_lock);
	*(cancel_info->rc) = MAX(*(cancel_info->rc), error_code);
	(*(cancel_info->num_active_threads))--;
	slurm_cond_signal(cancel_info->num_active_threads_cond);
	slurm_mutex_unlock(cancel_info->num_active_threads_lock);

	xfree(cancel_info->job_id_str);
	xfree(cancel_info);
	return NULL;
}
コード例 #2
0
ファイル: task_cray.c プロジェクト: BYUHPC/slurm
/*
 * Check the status file for the exit of the given local task id
 * and terminate the job step if an improper exit is found
 *
 * IN job - step record; jobid, stepid, envtp->localid and task[] are read
 * RET SLURM_ERROR if the status file cannot be opened, its first byte
 *	cannot be read, or the seek fails; SLURM_SUCCESS otherwise
 *	(including when the second read fails, which is only logged).
 *
 * The file descriptor is closed on every return path.
 */
static int _check_status_file(stepd_step_rec_t *job)
{
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	// Start at 0 so a zero-byte read (empty file) is treated like
	// "not started" rather than reading an uninitialized value.
	char status = 0;
	int rv, fd;
	stepd_step_task_info_t *task;
	char *reason;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Open the lli file.
	fd = open(llifile, O_RDONLY);
	if (fd == -1) {
		CRAY_ERR("open(%s) failed: %m", llifile);
		return SLURM_ERROR;
	}

	// Read the first byte (indicates starting)
	rv = read(fd, &status, sizeof(status));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		// Do not leak the descriptor on this error path
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// If the first byte is 0, we either aren't an MPI app or
	// it didn't make it past pmi_init, in any case, return success
	if (status == 0) {
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_SUCCESS;
	}

	// Seek to the correct offset: byte 0 is the start flag, so the
	// byte for local task N lives at offset N + 1
	rv = lseek(fd, job->envtp->localid + 1, SEEK_SET);
	if (rv == -1) {
		CRAY_ERR("lseek failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Read the exiting byte; the descriptor is no longer needed after this
	rv = read(fd, &status, sizeof(status));
	TEMP_FAILURE_RETRY(close(fd));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		return SLURM_SUCCESS;
	}

	// Check the result. "terminated" is declared outside this function;
	// it is set below so the terminate request is only issued once.
	if (status == 0 && !terminated) {
		task = job->task[job->envtp->localid];
		if (task->killed_by_cmd) {
			// We've been killed by request. User already knows
			return SLURM_SUCCESS;
		} else if (task->aborted) {
			reason = "aborted";
		} else if (WIFSIGNALED(task->estatus)) {
			reason = "signaled";
		} else {
			reason = "exited";
		}

		// Cancel the job step, since we didn't find the exiting msg
		error("Terminating job step %"PRIu32".%"PRIu32
			"; task %d exit code %d %s without notification",
			job->jobid, job->stepid, task->gtid,
			WEXITSTATUS(task->estatus), reason);
		terminated = 1;
		slurm_terminate_job_step(job->jobid, job->stepid);
	}
	return SLURM_SUCCESS;
}
コード例 #3
-1
ファイル: task_cray.c プロジェクト: alejluther/slurm
/*
 * task_p_post_term() is called after termination of application task.
 *	It is preceded by --task-epilog (from srun command line)
 *	followed by TaskEpilog program (from slurm.conf).
 *
 * IN job - step record; jobid, stepid, envtp->procid and envtp->localid
 *	are read
 * IN task - not referenced by this function
 * RET SLURM_ERROR if the status file cannot be opened, its first byte
 *	cannot be read, or the seek fails; SLURM_SUCCESS otherwise.
 *	When HAVE_NATIVE_CRAY is not defined the body is compiled out and
 *	SLURM_SUCCESS is always returned.
 *
 * The file descriptor is closed on every return path.
 */
extern int task_p_post_term (stepd_step_rec_t *job,
			     stepd_step_task_info_t *task)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	// Start at 0 so a zero-byte read (empty file) is treated like
	// "not started" rather than reading an uninitialized value.
	char status = 0;
	int rv, fd;

	debug("task_p_post_term: %u.%u, task %d",
	      job->jobid, job->stepid, job->envtp->procid);

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Open the lli file.
	fd = open(llifile, O_RDONLY);
	if (fd == -1) {
		error("%s: open(%s) failed: %m", __func__, llifile);
		return SLURM_ERROR;
	}

	// Read the first byte (indicates starting)
	rv = read(fd, &status, sizeof(status));
	if (rv == -1) {
		error("%s: read failed: %m", __func__);
		// Do not leak the descriptor on this error path
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// If the first byte is 0, we either aren't an MPI app or
	// it didn't make it past pmi_init, in any case, return success
	if (status == 0) {
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_SUCCESS;
	}

	// Seek to the correct offset (job->envtp->localid + 1)
	rv = lseek(fd, job->envtp->localid + 1, SEEK_SET);
	if (rv == -1) {
		error("%s: lseek failed: %m", __func__);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Read the exiting byte; the descriptor is no longer needed after this
	rv = read(fd, &status, sizeof(status));
	TEMP_FAILURE_RETRY(close(fd));
	if (rv == -1) {
		error("%s: read failed: %m", __func__);
		return SLURM_SUCCESS;
	}

	// Check the result
	if (status == 0) {
		// Cancel the job step, since we didn't find the exiting msg
		fprintf(stderr, "Terminating job step, task %d improper exit\n",
			job->envtp->procid);
		slurm_terminate_job_step(job->jobid, job->stepid);
	}

#endif
	return SLURM_SUCCESS;
}