Beispiel #1
0
static void _task_finish(task_exit_msg_t *msg)
{
	char *tasks;
	char *hosts;
	uint32_t rc = 0;
	int normal_exit = 0;

	const char *task_str = _taskstr(msg->num_tasks);

	verbose("Received task exit notification for %d %s (status=0x%04x).",
		msg->num_tasks, task_str, msg->return_code);

	tasks = _task_array_to_string(msg->num_tasks, msg->task_id_list);
	hosts = _task_ids_to_host_list(msg->num_tasks, msg->task_id_list);

	if (WIFEXITED(msg->return_code)) {
		if ((rc = WEXITSTATUS(msg->return_code)) == 0) {
			verbose("%s: %s %s: Completed", hosts, task_str, tasks);
			normal_exit = 1;
		}
		else if (_is_openmpi_port_error(rc)) {
			_handle_openmpi_port_error(tasks, hosts,
						   local_srun_job->step_ctx);
		} else {
			error("%s: %s %s: Exited with exit code %d",
			      hosts, task_str, tasks, rc);
		}
		if (!WIFEXITED(*local_global_rc)
		    || (rc > WEXITSTATUS(*local_global_rc)))
			*local_global_rc = msg->return_code;
	}
	else if (WIFSIGNALED(msg->return_code)) {
		const char *signal_str = strsignal(WTERMSIG(msg->return_code));
		char * core_str = "";
#ifdef WCOREDUMP
		if (WCOREDUMP(msg->return_code))
			core_str = " (core dumped)";
#endif
		if (local_srun_job->state >= SRUN_JOB_CANCELLED) {
			verbose("%s: %s %s: %s%s",
				hosts, task_str, tasks, signal_str, core_str);
		} else {
			rc = msg->return_code;
			error("%s: %s %s: %s%s",
			      hosts, task_str, tasks, signal_str, core_str);
		}
		if (*local_global_rc == 0)
			*local_global_rc = msg->return_code;
	}

	xfree(tasks);
	xfree(hosts);

	_update_task_exit_state(msg->num_tasks, msg->task_id_list,
				!normal_exit);

	if (task_state_first_abnormal_exit(task_state)
	    && _kill_on_bad_exit())
  		launch_p_step_terminate();

	if (task_state_first_exit(task_state) && (opt.max_wait > 0))
		_setup_max_wait_timer();
}
static void _task_finish(task_exit_msg_t *msg)
{
	char *tasks = NULL, *hosts = NULL;
	bool build_task_string = false;
	uint32_t rc = 0;
	int normal_exit = 0;
	static int reduce_task_exit_msg = -1;
	static int msg_printed = 0, last_task_exit_rc;

	const char *task_str = _taskstr(msg->num_tasks);

	if (reduce_task_exit_msg == -1) {
		char *ptr = getenv("SLURM_SRUN_REDUCE_TASK_EXIT_MSG");
		if (ptr && atoi(ptr) != 0)
			reduce_task_exit_msg = 1;
		else
			reduce_task_exit_msg = 0;
	}

	verbose("Received task exit notification for %d %s (status=0x%04x).",
		msg->num_tasks, task_str, msg->return_code);

	/* Only build the "tasks" and "hosts" strings as needed. Buidling them
	 * can take multiple milliseconds */
	if (WIFEXITED(msg->return_code)) {
		if ((rc = WEXITSTATUS(msg->return_code)) == 0) {
			if (get_log_level() >= LOG_LEVEL_VERBOSE)
				build_task_string = true;
		} else {
			build_task_string = true;
		}

	} else if (WIFSIGNALED(msg->return_code)) {
		if (local_srun_job->state >= SRUN_JOB_CANCELLED) {
			if (get_log_level() >= LOG_LEVEL_VERBOSE)
				build_task_string = true;
		} else {
			build_task_string = true;
		}
	}
	if (build_task_string) {
		tasks = _task_array_to_string(msg->num_tasks,
					      msg->task_id_list);
		hosts = _task_ids_to_host_list(msg->num_tasks,
					       msg->task_id_list);
	}

	slurm_mutex_lock(&launch_lock);
	if (WIFEXITED(msg->return_code)) {
		if ((rc = WEXITSTATUS(msg->return_code)) == 0) {
			verbose("%s: %s %s: Completed", hosts, task_str, tasks);
			normal_exit = 1;
		} else if (_is_openmpi_port_error(rc)) {
			_handle_openmpi_port_error(tasks, hosts,
						   local_srun_job->step_ctx);
		} else if ((reduce_task_exit_msg == 0) ||
			   (msg_printed == 0) ||
			   (msg->return_code != last_task_exit_rc)) {
			error("%s: %s %s: Exited with exit code %d",
			      hosts, task_str, tasks, rc);
			msg_printed = 1;
		}
		if (!WIFEXITED(*local_global_rc)
		    || (rc > WEXITSTATUS(*local_global_rc)))
			*local_global_rc = msg->return_code;
	} else if (WIFSIGNALED(msg->return_code)) {
		const char *signal_str = strsignal(WTERMSIG(msg->return_code));
		char * core_str = "";
#ifdef WCOREDUMP
		if (WCOREDUMP(msg->return_code))
			core_str = " (core dumped)";
#endif
		if (local_srun_job->state >= SRUN_JOB_CANCELLED) {
			verbose("%s: %s %s: %s%s",
				hosts, task_str, tasks, signal_str, core_str);
		} else if ((reduce_task_exit_msg == 0) ||
			   (msg_printed == 0) ||
			   (msg->return_code != last_task_exit_rc)) {
			error("%s: %s %s: %s%s",
			      hosts, task_str, tasks, signal_str, core_str);
			msg_printed = 1;
		}
		if (*local_global_rc == NO_VAL)
			*local_global_rc = msg->return_code;
	}
	xfree(tasks);
	xfree(hosts);

	_update_task_exit_state(msg->num_tasks, msg->task_id_list,
				!normal_exit);

	if (task_state_first_abnormal_exit(task_state)
	    && _kill_on_bad_exit())
		launch_p_step_terminate();

	if (task_state_first_exit(task_state) && (opt.max_wait > 0))
		_setup_max_wait_timer();

	last_task_exit_rc = msg->return_code;
	slurm_mutex_unlock(&launch_lock);
}