/*
 * _task_finish - process a task exit notification covering one or more tasks.
 *
 * Logs how the tasks ended, folds their wait status into *local_global_rc,
 * records the exit via _update_task_exit_state(), and then:
 *   - calls launch_p_step_terminate() if this was the first abnormal exit
 *     and _kill_on_bad_exit() is true;
 *   - calls _setup_max_wait_timer() if this was the first exit of any kind
 *     and opt.max_wait is positive.
 *
 * IN msg - exit message; num_tasks, task_id_list and return_code are read.
 *          return_code is decoded with the wait status macros.
 */
static void _task_finish(task_exit_msg_t *msg)
{
	char *tasks;
	char *hosts;
	uint32_t rc = 0;
	int normal_exit = 0;
	const char *task_str = _taskstr(msg->num_tasks);

	verbose("Received task exit notification for %d %s (status=0x%04x).",
		msg->num_tasks, task_str, msg->return_code);

	/* Both strings are released with xfree() once logging is done. */
	tasks = _task_array_to_string(msg->num_tasks, msg->task_id_list);
	hosts = _task_ids_to_host_list(msg->num_tasks, msg->task_id_list);

	if (WIFEXITED(msg->return_code)) {
		rc = WEXITSTATUS(msg->return_code);
		if (rc == 0) {
			verbose("%s: %s %s: Completed", hosts, task_str, tasks);
			normal_exit = 1;
		} else if (_is_openmpi_port_error(rc)) {
			_handle_openmpi_port_error(tasks, hosts,
						   local_srun_job->step_ctx);
		} else {
			/* rc is unsigned, so print it with %u. */
			error("%s: %s %s: Exited with exit code %u",
			      hosts, task_str, tasks, rc);
		}

		/*
		 * Replace the stored status when it is not a normal exit
		 * status, or when this exit code is larger than the stored
		 * one.
		 */
		if (!WIFEXITED(*local_global_rc) ||
		    (rc > WEXITSTATUS(*local_global_rc)))
			*local_global_rc = msg->return_code;
	} else if (WIFSIGNALED(msg->return_code)) {
		const char *signal_str = strsignal(WTERMSIG(msg->return_code));
		const char *core_str = "";

#ifdef WCOREDUMP
		if (WCOREDUMP(msg->return_code))
			core_str = " (core dumped)";
#endif
		/*
		 * Once the job state is SRUN_JOB_CANCELLED or later the
		 * signal is only reported at verbose level; otherwise it is
		 * reported as an error.
		 */
		if (local_srun_job->state >= SRUN_JOB_CANCELLED) {
			verbose("%s: %s %s: %s%s",
				hosts, task_str, tasks, signal_str, core_str);
		} else {
			error("%s: %s %s: %s%s",
			      hosts, task_str, tasks, signal_str, core_str);
		}

		/* A signal status is stored only if the stored value is 0. */
		if (*local_global_rc == 0)
			*local_global_rc = msg->return_code;
	}

	xfree(tasks);
	xfree(hosts);

	_update_task_exit_state(msg->num_tasks, msg->task_id_list,
				!normal_exit);

	if (task_state_first_abnormal_exit(task_state) && _kill_on_bad_exit())
		launch_p_step_terminate();

	if (task_state_first_exit(task_state) && (opt.max_wait > 0))
		_setup_max_wait_timer();
}
/*
 * _task_finish - process a task exit notification covering one or more tasks.
 *
 * Logs how the tasks ended, folds their wait status into *local_global_rc,
 * records the exit via _update_task_exit_state(), and then:
 *   - calls launch_p_step_terminate() if this was the first abnormal exit
 *     and _kill_on_bad_exit() is true;
 *   - calls _setup_max_wait_timer() if this was the first exit of any kind
 *     and opt.max_wait is positive.
 *
 * If the environment variable SLURM_SRUN_REDUCE_TASK_EXIT_MSG is set to a
 * non-zero integer, an error message is skipped when one has already been
 * printed and this return code equals the one from the previous call.
 *
 * Everything from the exit-status reporting onward runs with launch_lock
 * held.
 *
 * IN msg - exit message; num_tasks, task_id_list and return_code are read.
 *          return_code is decoded with the wait status macros.
 */
static void _task_finish(task_exit_msg_t *msg)
{
	char *tasks = NULL, *hosts = NULL;
	bool build_task_string = false;
	bool report_error;
	uint32_t rc = 0;
	int normal_exit = 0;
	/* -1 until the environment has been consulted, then 0 or 1. */
	static int reduce_task_exit_msg = -1;
	static int msg_printed = 0, last_task_exit_rc;
	const char *task_str = _taskstr(msg->num_tasks);

	if (reduce_task_exit_msg == -1) {
		const char *ptr = getenv("SLURM_SRUN_REDUCE_TASK_EXIT_MSG");

		reduce_task_exit_msg = (ptr && atoi(ptr) != 0) ? 1 : 0;
	}

	verbose("Received task exit notification for %d %s (status=0x%04x).",
		msg->num_tasks, task_str, msg->return_code);

	/*
	 * Only build the "tasks" and "hosts" strings as needed. Building them
	 * can take multiple milliseconds. Outcomes that are logged with
	 * verbose() need the strings only when the log level is at least
	 * LOG_LEVEL_VERBOSE; every other outcome always gets them.
	 */
	if (WIFEXITED(msg->return_code)) {
		rc = WEXITSTATUS(msg->return_code);
		if (rc == 0)
			build_task_string =
				(get_log_level() >= LOG_LEVEL_VERBOSE);
		else
			build_task_string = true;
	} else if (WIFSIGNALED(msg->return_code)) {
		if (local_srun_job->state >= SRUN_JOB_CANCELLED)
			build_task_string =
				(get_log_level() >= LOG_LEVEL_VERBOSE);
		else
			build_task_string = true;
	}

	if (build_task_string) {
		/* Both strings are released with xfree() further down. */
		tasks = _task_array_to_string(msg->num_tasks,
					      msg->task_id_list);
		hosts = _task_ids_to_host_list(msg->num_tasks,
					       msg->task_id_list);
	}

	slurm_mutex_lock(&launch_lock);

	/*
	 * An error message is reported unless message reduction is enabled,
	 * one has already been printed, and this return code matches the one
	 * seen on the previous call.
	 */
	report_error = (reduce_task_exit_msg == 0) || (msg_printed == 0) ||
		       (msg->return_code != last_task_exit_rc);

	if (WIFEXITED(msg->return_code)) {
		rc = WEXITSTATUS(msg->return_code);
		if (rc == 0) {
			verbose("%s: %s %s: Completed", hosts, task_str, tasks);
			normal_exit = 1;
		} else if (_is_openmpi_port_error(rc)) {
			_handle_openmpi_port_error(tasks, hosts,
						   local_srun_job->step_ctx);
		} else if (report_error) {
			/* rc is unsigned, so print it with %u. */
			error("%s: %s %s: Exited with exit code %u",
			      hosts, task_str, tasks, rc);
			msg_printed = 1;
		}

		/*
		 * Replace the stored status when it is not a normal exit
		 * status, or when this exit code is larger than the stored
		 * one.
		 */
		if (!WIFEXITED(*local_global_rc) ||
		    (rc > WEXITSTATUS(*local_global_rc)))
			*local_global_rc = msg->return_code;
	} else if (WIFSIGNALED(msg->return_code)) {
		const char *signal_str = strsignal(WTERMSIG(msg->return_code));
		const char *core_str = "";

#ifdef WCOREDUMP
		if (WCOREDUMP(msg->return_code))
			core_str = " (core dumped)";
#endif
		/*
		 * Once the job state is SRUN_JOB_CANCELLED or later the
		 * signal is only reported at verbose level.
		 */
		if (local_srun_job->state >= SRUN_JOB_CANCELLED) {
			verbose("%s: %s %s: %s%s",
				hosts, task_str, tasks, signal_str, core_str);
		} else if (report_error) {
			error("%s: %s %s: %s%s",
			      hosts, task_str, tasks, signal_str, core_str);
			msg_printed = 1;
		}

		/* A signal status is stored only while the value is NO_VAL. */
		if (*local_global_rc == NO_VAL)
			*local_global_rc = msg->return_code;
	}

	xfree(tasks);
	xfree(hosts);

	_update_task_exit_state(msg->num_tasks, msg->task_id_list,
				!normal_exit);

	if (task_state_first_abnormal_exit(task_state) && _kill_on_bad_exit())
		launch_p_step_terminate();

	if (task_state_first_exit(task_state) && (opt.max_wait > 0))
		_setup_max_wait_timer();

	/* Remembered so the next call can detect a repeated return code. */
	last_task_exit_rc = msg->return_code;

	slurm_mutex_unlock(&launch_lock);
}