static void *_step_fini(void *args)
{
	struct step_record *step_ptr = (struct step_record *)args;
	select_jobinfo_t *jobinfo = NULL;
	nhc_info_t nhc_info;

	/* Locks: Write job, write node */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
	slurmctld_lock_t job_read_lock = {
		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };

	if (!step_ptr) {
		error("_step_fini: no step ptr given, "
		      "this should never happen");
		return NULL;
	}

	memset(&nhc_info, 0, sizeof(nhc_info_t));
	nhc_info.step = 1;
	lock_slurmctld(job_read_lock);
	nhc_info.jobid = step_ptr->job_ptr->job_id;
	nhc_info.apid = SLURM_ID_HASH(step_ptr->job_ptr->job_id,
				      step_ptr->step_id);
	nhc_info.exit_code = step_ptr->exit_code;
	nhc_info.user_id = step_ptr->job_ptr->user_id;

	if (!step_ptr->step_layout || !step_ptr->step_layout->node_list) {
		if (step_ptr->job_ptr)
			nhc_info.nodelist = xstrdup(step_ptr->job_ptr->nodes);
	} else
		nhc_info.nodelist = xstrdup(step_ptr->step_layout->node_list);
	unlock_slurmctld(job_read_lock);

	/* run NHC */
	_run_nhc(&nhc_info);
	/***********/

	xfree(nhc_info.nodelist);

	lock_slurmctld(job_write_lock);
	if (!step_ptr->job_ptr || !step_ptr->step_node_bitmap) {
		error("For some reason we don't have a step_node_bitmap or "
		      "a job_ptr for %"PRIu64". This should never happen.",
		      nhc_info.apid);
	} else {
		other_step_finish(step_ptr);

		jobinfo = step_ptr->select_jobinfo->data;
		jobinfo->cleaning = 0;

		/* free resources on the job */
		post_job_step(step_ptr);
	}
	unlock_slurmctld(job_write_lock);

	return NULL;
}
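/*
 * Illustrative sketch (not part of the plugin): SLURM_ID_HASH combines a
 * job id and step id into the single 64-bit "apid" used throughout this
 * code.  The exact bit layout is defined in slurm.h and has varied across
 * SLURM releases, so the packing below is an assumption for illustration
 * only; the helper names are hypothetical.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed layout: step id in the upper 32 bits, job id in the lower 32. */
static uint64_t example_id_hash(uint32_t job_id, uint32_t step_id)
{
	return ((uint64_t) step_id << 32) | job_id;
}

static void example_id_unhash(uint64_t apid, uint32_t *job_id,
			      uint32_t *step_id)
{
	*job_id  = (uint32_t) (apid & 0xffffffff);
	*step_id = (uint32_t) (apid >> 32);
}

int main(void)
{
	uint32_t job = 0, step = 0;
	uint64_t apid = example_id_hash(1234, 0);

	example_id_unhash(apid, &job, &step);
	printf("apid %"PRIu64" -> job %u step %u\n", apid, job, step);
	return 0;
}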
/*
 * task_p_pre_launch() is called prior to exec of application task.
 * It is followed by TaskProlog program (from slurm.conf) and
 * --task-prolog (from srun command line).
 */
extern int task_p_pre_launch (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	int rc;
	uint64_t apid;
	DEF_TIMERS;

	START_TIMER;

	apid = SLURM_ID_HASH(job->jobid, job->stepid);
	debug2("task_p_pre_launch: %u.%u, apid %"PRIu64", task %d",
	       job->jobid, job->stepid, apid, job->envtp->procid);

	/*
	 * Send the rank to the application's PMI layer via an environment
	 * variable.
	 */
	rc = env_array_overwrite_fmt(&job->env, ALPS_APP_PE_ENV,
				     "%d", job->envtp->procid);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", ALPS_APP_PE_ENV);
		return SLURM_ERROR;
	}

	/*
	 * Set the PMI_NO_FORK environment variable.
	 */
	rc = env_array_overwrite(&job->env, PMI_NO_FORK_ENV, "1");
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", PMI_NO_FORK_ENV);
		return SLURM_ERROR;
	}

	/*
	 * Notify the task which offset to use
	 */
	rc = env_array_overwrite_fmt(&job->env, LLI_STATUS_OFFS_ENV,
				     "%d", job->envtp->localid + 1);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s",
			 LLI_STATUS_OFFS_ENV);
		return SLURM_ERROR;
	}

	/*
	 * Set the ALPS_APP_ID environment variable for use by
	 * Cray tools.
	 */
	rc = env_array_overwrite_fmt(&job->env, ALPS_APP_ID_ENV,
				     "%"PRIu64, apid);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", ALPS_APP_ID_ENV);
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
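/*
 * Illustrative sketch (not part of the plugin): a launched task can read
 * back the per-rank values exported above with getenv().  The literal names
 * below ("ALPS_APP_PE", "ALPS_APP_ID") are assumptions about what the
 * ALPS_APP_PE_ENV and ALPS_APP_ID_ENV macros expand to; the real strings
 * are defined in the plugin headers.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *pe   = getenv("ALPS_APP_PE");	/* assumed name */
	const char *apid = getenv("ALPS_APP_ID");	/* assumed name */

	printf("rank %s of apid %s\n",
	       pe ? pe : "(unset)", apid ? apid : "(unset)");
	return 0;
}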
static void *_step_fini(void *args)
{
	struct step_record *step_ptr = (struct step_record *)args;
	select_jobinfo_t *jobinfo = NULL;
	uint64_t apid = 0;
	char *node_list = NULL;

	/* Locks: Write job, write node */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };
	slurmctld_lock_t job_read_lock = {
		NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK };

	if (!step_ptr) {
		error("_step_fini: no step ptr given, "
		      "this should never happen");
		return NULL;
	}

	lock_slurmctld(job_read_lock);
	apid = SLURM_ID_HASH(step_ptr->job_ptr->job_id, step_ptr->step_id);

	if (!step_ptr->step_layout || !step_ptr->step_layout->node_list) {
		if (step_ptr->job_ptr)
			node_list = xstrdup(step_ptr->job_ptr->nodes);
	} else
		node_list = xstrdup(step_ptr->step_layout->node_list);
	unlock_slurmctld(job_read_lock);

	/* run NHC */
	_run_nhc(apid, node_list, 0);
	/***********/

	xfree(node_list);

	lock_slurmctld(job_write_lock);
	if (!step_ptr->job_ptr || !step_ptr->step_node_bitmap) {
		error("For some reason we don't have a step_node_bitmap or "
		      "a job_ptr for %"PRIu64". This should never happen.",
		      apid);
	} else {
		other_step_finish(step_ptr);

		jobinfo = step_ptr->select_jobinfo->data;
		jobinfo->cleaning = 0;

		/* free resources on the job */
		post_job_step(step_ptr);
	}
	unlock_slurmctld(job_write_lock);

	return NULL;
}
/*
 * Initialize an alpsc_ev_app_t
 */
static void _initialize_event(alpsc_ev_app_t *event,
			      struct job_record *job_ptr,
			      struct step_record *step_ptr,
			      alpsc_ev_app_state_e state)
{
	hostlist_t hl;
	hostlist_iterator_t hlit;
	char *node;
	int rv;

	event->apid = SLURM_ID_HASH(job_ptr->job_id, step_ptr->step_id);
	event->uid = job_ptr->user_id;
	event->app_name = xstrdup(step_ptr->name);
	event->batch_id = xmalloc(20);	// More than enough to hold max uint32
	snprintf(event->batch_id, 20, "%"PRIu32, job_ptr->job_id);
	event->state = state;
	event->nodes = NULL;
	event->num_nodes = 0;

	// Fill in nodes and num_nodes
	if (step_ptr->step_layout) {
		hl = hostlist_create(step_ptr->step_layout->node_list);
		if (hl == NULL) {
			return;
		}
		hlit = hostlist_iterator_create(hl);
		if (hlit == NULL) {
			hostlist_destroy(hl);
			return;
		}

		event->nodes = xmalloc(step_ptr->step_layout->node_cnt
				       * sizeof(int32_t));

		while ((node = hostlist_next(hlit)) != NULL) {
			rv = sscanf(node, "nid%"SCNd32,
				    &event->nodes[event->num_nodes]);
			if (rv) {
				event->num_nodes++;
			} else {
				debug("%s: couldn't parse node %s, skipping",
				      __func__, node);
			}
			free(node);
		}

		hostlist_iterator_destroy(hlit);
		hostlist_destroy(hl);
	} else {
		// TODO: do we have to worry about batch scripts?
	}
	return;
}
/*
 * task_p_pre_launch_priv() is called prior to exec of application task.
 * in privileged mode, just after slurm_spank_task_init_privileged
 */
extern int task_p_pre_launch_priv (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rv, fd;

	debug("task_p_pre_launch_priv: %u.%u", job->jobid, job->stepid);

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Make the file
	errno = 0;
	fd = open(llifile, O_CREAT|O_EXCL|O_WRONLY, 0644);
	if (fd == -1) {
		// Another task_p_pre_launch_priv already created it, ignore
		if (errno == EEXIST) {
			return SLURM_SUCCESS;
		}
		error("%s: creat(%s) failed: %m", __func__, llifile);
		return SLURM_ERROR;
	}

	// Resize it to job->node_tasks + 1
	rv = ftruncate(fd, job->node_tasks + 1);
	if (rv == -1) {
		error("%s: ftruncate(%s) failed: %m", __func__, llifile);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Change owner/group so app can write to it
	rv = fchown(fd, job->uid, job->gid);
	if (rv == -1) {
		error("%s: chown(%s) failed: %m", __func__, llifile);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}
	info("Created file %s", llifile);

	TEMP_FAILURE_RETRY(close(fd));
#endif
	return SLURM_SUCCESS;
}
/* NOTE: This function is called after slurmstepd spawns all user tasks.
 * Since the slurmstepd was placed in the job container when the container
 * was created and all of its spawned tasks are placed into the container
 * when forked, all we need to do is remove the slurmstepd from the container
 * (once) at this time. */
int proctrack_p_add(stepd_step_rec_t *job, pid_t pid)
{
#ifdef HAVE_NATIVE_CRAY
	char fname[64];
	int fd;
#endif
	DEF_TIMERS;
	START_TIMER;

	// Attach to the job container
	if (job_attachpid(pid, job->cont_id) == (jid_t) -1) {
		error("Failed to attach pid %d to job container: %m", pid);
		return SLURM_ERROR;
	}
	_end_container_thread();

#ifdef HAVE_NATIVE_CRAY
	// Set apid for this pid
	if (job_setapid(pid, SLURM_ID_HASH(job->jobid, job->stepid)) == -1) {
		error("Failed to set pid %d apid: %m", pid);
		return SLURM_ERROR;
	}

	// Explicitly mark pid as an application (/proc/<pid>/task_is_app)
	snprintf(fname, sizeof(fname), "/proc/%d/task_is_app", pid);
	fd = open(fname, O_WRONLY);
	if (fd == -1) {
		error("Failed to open %s: %m", fname);
		return SLURM_ERROR;
	}
	if (write(fd, "1", 1) < 1) {
		error("Failed to write to %s: %m", fname);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}
	TEMP_FAILURE_RETRY(close(fd));
#endif
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);

	return SLURM_SUCCESS;
}
/*
 * If it wasn't created already, make the LLI_STATUS_FILE with given owner
 * and group, permissions 644, with given size
 */
static int _make_status_file(stepd_step_rec_t *job)
{
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rv, fd;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Make the file
	errno = 0;
	fd = open(llifile, O_CREAT|O_EXCL|O_WRONLY, 0644);
	if (fd == -1) {
		// Another task_p_pre_launch_priv already created it, ignore
		if (errno == EEXIST) {
			return SLURM_SUCCESS;
		}
		CRAY_ERR("creat(%s) failed: %m", llifile);
		return SLURM_ERROR;
	}

	// Resize it
	rv = ftruncate(fd, job->node_tasks + 1);
	if (rv == -1) {
		CRAY_ERR("ftruncate(%s) failed: %m", llifile);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Change owner/group so app can write to it
	rv = fchown(fd, job->uid, job->gid);
	if (rv == -1) {
		CRAY_ERR("chown(%s) failed: %m", llifile);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}
	info("Created file %s", llifile);

	TEMP_FAILURE_RETRY(close(fd));
	return SLURM_SUCCESS;
}
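/*
 * Illustrative sketch (not part of the plugin): the status file created
 * above is (node_tasks + 1) bytes.  As it is read back by
 * _check_status_file(), byte 0 is nonzero once the application's PMI layer
 * has started, and byte (localid + 1) is nonzero once that local task has
 * signalled a clean exit.  The standalone reader below mirrors that layout;
 * the path is a placeholder, since the real name comes from the
 * LLI_STATUS_FILE format string.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Return 1 if local task `localid` marked a clean exit, 0 if not, -1 on error */
static int example_check_lli_byte(const char *path, int localid)
{
	char started = 0, exited = 0;
	int fd = open(path, O_RDONLY);

	if (fd == -1)
		return -1;
	if (pread(fd, &started, 1, 0) != 1 ||
	    pread(fd, &exited, 1, localid + 1) != 1) {
		close(fd);
		return -1;
	}
	close(fd);
	if (!started)		/* not an MPI app, or PMI never started */
		return 0;
	return exited ? 1 : 0;
}

int main(void)
{
	/* "/tmp/lli_status_example" is a hypothetical path for illustration */
	printf("task 0 clean exit: %d\n",
	       example_check_lli_byte("/tmp/lli_status_example", 0));
	return 0;
}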
/*
 * Check the status file for the exit of the given local task id
 * and terminate the job step if an improper exit is found
 */
static int _check_status_file(stepd_step_rec_t *job,
			      stepd_step_task_info_t *task)
{
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	char status;
	int rv, fd;

	debug("task_p_post_term: %u.%u, task %d",
	      job->jobid, job->stepid, job->envtp->procid);

	// We only need to special case termination with exit(0)
	// srun already handles abnormal exit conditions fine
	if (!WIFEXITED(task->estatus) || (WEXITSTATUS(task->estatus) != 0))
		return SLURM_SUCCESS;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Open the lli file.
	fd = open(llifile, O_RDONLY);
	if (fd == -1) {
		// There's a timing issue for large jobs; this file could
		// already be cleaned up by the time we get here.
		// However, this is during a normal cleanup so no big deal.
		debug("open(%s) failed: %m", llifile);
		return SLURM_SUCCESS;
	}

	// Read the first byte (indicates starting)
	rv = read(fd, &status, sizeof(status));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// If the first byte is 0, we either aren't an MPI app or
	// it didn't make it past pmi_init, in any case, return success
	if (status == 0) {
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_SUCCESS;
	}

	// Seek to the correct offset
	rv = lseek(fd, job->envtp->localid + 1, SEEK_SET);
	if (rv == -1) {
		CRAY_ERR("lseek failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Read the exiting byte
	rv = read(fd, &status, sizeof(status));
	TEMP_FAILURE_RETRY(close(fd));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		return SLURM_SUCCESS;
	}

	// Check the result
	if (status == 0) {
		if (task->killed_by_cmd) {
			// We've been killed by request. User already knows
			return SLURM_SUCCESS;
		}

		verbose("step %u.%u task %u exited without calling "
			"PMI_Finalize()",
			job->jobid, job->stepid, task->gtid);
	}
	return SLURM_SUCCESS;
}
/*
 * task_p_post_step() is called after termination of the step
 * (all the tasks)
 */
extern int task_p_post_step (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rc, cnt;
	char *err_msg = NULL, path[PATH_MAX];
	int32_t *numa_nodes;
	cpu_set_t *cpuMasks;

	if (track_status) {
		// Get the lli file name
		snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
			 SLURM_ID_HASH(job->jobid, job->stepid));

		// Unlink the file
		errno = 0;
		rc = unlink(llifile);
		if (rc == -1 && errno != ENOENT) {
			CRAY_ERR("unlink(%s) failed: %m", llifile);
		} else if (rc == 0) {
			info("Unlinked %s", llifile);
		}
	}

	/*
	 * Compact Memory
	 *
	 * Determine which NUMA nodes and CPUS an application is using.
	 * It will be used to compact the memory.
	 *
	 * You'll find the information in the following location.
	 * For a normal job step:
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_<stepID>/
	 *
	 * For a batch job step (only on the head node and only for batch
	 * jobs):
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_batch/
	 *
	 * NUMA node: mems
	 * CPU Masks: cpus
	 */
	if (job->batch) {
		// Batch Job Step
		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%" PRIu32
			      "/step_batch", job->uid, job->jobid);
		if (rc < 0) {
			CRAY_ERR("snprintf failed. Return code: %d", rc);
			return SLURM_ERROR;
		}
	} else {
		// Normal Job Step

		/* Only run epilogue on non-batch steps */
		_step_epilogue();

		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%" PRIu32
			      "/step_%" PRIu32,
			      job->uid, job->jobid, job->stepid);
		if (rc < 0) {
			CRAY_ERR("snprintf failed. Return code: %d", rc);
			return SLURM_ERROR;
		}
	}

	rc = _get_numa_nodes(path, &cnt, &numa_nodes);
	if (rc < 0) {
		CRAY_ERR("get_numa_nodes failed. Return code: %d", rc);
		return SLURM_ERROR;
	}

	rc = _get_cpu_masks(cnt, numa_nodes, &cpuMasks);
	if (rc < 0) {
		CRAY_ERR("get_cpu_masks failed. Return code: %d", rc);
		return SLURM_ERROR;
	}

	/*
	 * Compact Memory
	 * The last argument which is a path to the cpuset directory has to be
	 * NULL because the CPUSET directory has already been cleaned up.
	 */
	rc = alpsc_compact_mem(&err_msg, cnt, numa_nodes, cpuMasks, NULL);
	_ALPSC_DEBUG("alpsc_compact_mem");

	xfree(numa_nodes);
	xfree(cpuMasks);

	if (rc != 1) {
		return SLURM_ERROR;
	}
#endif
	return SLURM_SUCCESS;
}
/*
 * task_p_post_step() is called after termination of the step
 * (all the tasks)
 */
extern int task_p_post_step (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rc, cnt;
	char *err_msg = NULL, path[PATH_MAX];
	int32_t *numa_nodes;
	cpu_set_t *cpuMasks;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Unlink the file
	errno = 0;
	rc = unlink(llifile);
	if (rc == -1 && errno != ENOENT) {
		error("%s: unlink(%s) failed: %m", __func__, llifile);
	} else if (rc == 0) {
		info("Unlinked %s", llifile);
	}

	/*
	 * Compact Memory
	 *
	 * Determine which NUMA nodes and CPUS an application is using.
	 * It will be used to compact the memory.
	 *
	 * You'll find the information in the following location.
	 * For a normal job step:
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_<stepID>/
	 *
	 * For a batch job step (only on the head node and only for batch
	 * jobs):
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_batch/
	 *
	 * NUMA node: mems
	 * CPU Masks: cpus
	 */
	if ((job->stepid == NO_VAL) || (job->stepid == SLURM_BATCH_SCRIPT)) {
		// Batch Job Step
		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%" PRIu32
			      "/step_batch", job->uid, job->jobid);
		if (rc < 0) {
			error("(%s: %d: %s) snprintf failed. Return code: %d",
			      THIS_FILE, __LINE__, __FUNCTION__, rc);
			return SLURM_ERROR;
		}
	} else {
		// Normal Job Step
		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%" PRIu32
			      "/step_%" PRIu32,
			      job->uid, job->jobid, job->stepid);
		if (rc < 0) {
			error("(%s: %d: %s) snprintf failed. Return code: %d",
			      THIS_FILE, __LINE__, __FUNCTION__, rc);
			return SLURM_ERROR;
		}
	}

	rc = _get_numa_nodes(path, &cnt, &numa_nodes);
	if (rc < 0) {
		error("(%s: %d: %s) get_numa_nodes failed. Return code: %d",
		      THIS_FILE, __LINE__, __FUNCTION__, rc);
		return SLURM_ERROR;
	}

	rc = _get_cpu_masks(cnt, numa_nodes, &cpuMasks);
	if (rc < 0) {
		error("(%s: %d: %s) get_cpu_masks failed. Return code: %d",
		      THIS_FILE, __LINE__, __FUNCTION__, rc);
		return SLURM_ERROR;
	}

	/*
	 * Compact Memory
	 * The last argument which is a path to the cpuset directory has to be
	 * NULL because the CPUSET directory has already been cleaned up.
	 */
	rc = alpsc_compact_mem(&err_msg, cnt, numa_nodes, cpuMasks, NULL);

	xfree(numa_nodes);
	xfree(cpuMasks);

	if (rc != 1) {
		if (err_msg) {
			error("(%s: %d: %s) alpsc_compact_mem failed: %s",
			      THIS_FILE, __LINE__, __FUNCTION__, err_msg);
			free(err_msg);
		} else {
			error("(%s: %d: %s) alpsc_compact_mem failed:"
			      " No error message present.",
			      THIS_FILE, __LINE__, __FUNCTION__);
		}
		return SLURM_ERROR;
	}
	if (err_msg) {
		info("(%s: %d: %s) alpsc_compact_mem: %s",
		     THIS_FILE, __LINE__, __FUNCTION__, err_msg);
		free(err_msg);
	}
#endif
	return SLURM_SUCCESS;
}
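/*
 * Illustrative sketch (not part of the plugin): _get_numa_nodes() and
 * _get_cpu_masks() are not shown in this excerpt.  Based on the comment in
 * task_p_post_step(), the cpuset "mems" file contains a list such as
 * "0-1,3".  The standalone parser below turns such a list into an array of
 * NUMA node ids; it is an assumption about roughly what the real helper
 * does, not a copy of it.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse a list like "0-1,3" into *nodes (malloc'd); return the count */
static int example_parse_mems(const char *list, int32_t **nodes)
{
	int cnt = 0, cap = 8;
	char *copy = strdup(list), *save = NULL, *tok;

	*nodes = malloc(cap * sizeof(int32_t));
	for (tok = strtok_r(copy, ",\n", &save); tok;
	     tok = strtok_r(NULL, ",\n", &save)) {
		char *dash = strchr(tok, '-');
		long lo = strtol(tok, NULL, 10);
		long hi = dash ? strtol(dash + 1, NULL, 10) : lo;

		for (; lo <= hi; lo++) {
			if (cnt == cap)
				*nodes = realloc(*nodes,
						 (cap *= 2) * sizeof(int32_t));
			(*nodes)[cnt++] = (int32_t) lo;
		}
	}
	free(copy);
	return cnt;
}

int main(void)
{
	int32_t *nodes = NULL;
	int i, cnt = example_parse_mems("0-1,3", &nodes);

	for (i = 0; i < cnt; i++)
		printf("NUMA node %d\n", nodes[i]);
	free(nodes);
	return 0;
}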
/*
 * Check the status file for the exit of the given local task id
 * and terminate the job step if an improper exit is found
 */
static int _check_status_file(stepd_step_rec_t *job)
{
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	char status;
	int rv, fd;
	stepd_step_task_info_t *task;
	char *reason;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Open the lli file.
	fd = open(llifile, O_RDONLY);
	if (fd == -1) {
		CRAY_ERR("open(%s) failed: %m", llifile);
		return SLURM_ERROR;
	}

	// Read the first byte (indicates starting)
	rv = read(fd, &status, sizeof(status));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// If the first byte is 0, we either aren't an MPI app or
	// it didn't make it past pmi_init, in any case, return success
	if (status == 0) {
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_SUCCESS;
	}

	// Seek to the correct offset
	rv = lseek(fd, job->envtp->localid + 1, SEEK_SET);
	if (rv == -1) {
		CRAY_ERR("lseek failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Read the exiting byte
	rv = read(fd, &status, sizeof(status));
	TEMP_FAILURE_RETRY(close(fd));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		return SLURM_SUCCESS;
	}

	// Check the result
	if (status == 0 && !terminated) {
		task = job->task[job->envtp->localid];
		if (task->killed_by_cmd) {
			// We've been killed by request. User already knows
			return SLURM_SUCCESS;
		} else if (task->aborted) {
			reason = "aborted";
		} else if (WIFSIGNALED(task->estatus)) {
			reason = "signaled";
		} else {
			reason = "exited";
		}

		// Cancel the job step, since we didn't find the exiting msg
		error("Terminating job step %"PRIu32".%"PRIu32
		      "; task %d exit code %d %s without notification",
		      job->jobid, job->stepid, task->gtid,
		      WEXITSTATUS(task->estatus), reason);
		terminated = 1;
		slurm_terminate_job_step(job->jobid, job->stepid);
	}

	return SLURM_SUCCESS;
}
/*
 * For starting apps, push to the app list. For ending apps, removes from the
 * app list. For suspend/resume apps, edits the app list. Always adds to the
 * event list.
 */
static void _update_app(struct job_record *job_ptr,
			struct step_record *step_ptr,
			alpsc_ev_app_state_e state)
{
	uint64_t apid;
	int32_t i;
	alpsc_ev_app_t app;
	int found;

	// If aeld thread isn't running, do nothing
	if (aeld_running == 0) {
		return;
	}

	// Fill in the new event
	_initialize_event(&app, job_ptr, step_ptr, state);

	pthread_mutex_lock(&aeld_mutex);

	// Add it to the event list, only if aeld is up
	if (aeld_running == 2) {
		_add_to_app_list(&event_list, &event_list_size,
				 &event_list_capacity, &app);
	}

	// Now deal with the app list
	// Maintain app list even if aeld is down, so we have it ready when
	// it comes up.
	switch(state) {
	case ALPSC_EV_START:
		// This is new, add to the app list
		_add_to_app_list(&app_list, &app_list_size,
				 &app_list_capacity, &app);
		break;
	case ALPSC_EV_END:
		// Search for the app matching this apid
		found = 0;
		apid = SLURM_ID_HASH(job_ptr->job_id, step_ptr->step_id);
		for (i = 0; i < app_list_size; i++) {
			if (app_list[i].apid == apid) {
				found = 1;

				// Free allocated info
				_free_event(&app_list[i]);

				// Copy last list entry to this spot
				if (i < app_list_size - 1) {
					memcpy(&app_list[i],
					       &app_list[app_list_size - 1],
					       sizeof(alpsc_ev_app_t));
				}

				app_list_size--;
				break;
			}
		}

		// Not found
		if (!found) {
			debug("Application %"PRIu64" not found in app list",
			      apid);
		}
		break;
	case ALPSC_EV_SUSPEND:
	case ALPSC_EV_RESUME:
		// Search for the app matching this apid
		apid = SLURM_ID_HASH(job_ptr->job_id, step_ptr->step_id);
		for (i = 0; i < app_list_size; i++) {
			if (app_list[i].apid == apid) {
				// Found it, update the state
				app_list[i].state =
					(state == ALPSC_EV_SUSPEND) ?
					ALPSC_EV_SUSPEND : ALPSC_EV_START;
				break;
			}
		}

		// Not found
		if (i >= app_list_size) {
			debug("Application %"PRIu64" not found in app list",
			      apid);
		}
		break;
	default:
		break;
	}

	pthread_mutex_unlock(&aeld_mutex);

	_free_event(&app);
	return;
}
/* NOTE: This function is called after slurmstepd spawns all user tasks.
 * Since the slurmstepd was placed in the job container when the container
 * was created and all of its spawned tasks are placed into the container
 * when forked, all we need to do is remove the slurmstepd from the container
 * (once) at this time. */
int proctrack_p_add(stepd_step_rec_t *job, pid_t pid)
{
#ifdef HAVE_NATIVE_CRAY
	char fname[64];
	int fd;
#endif
	int count = 0;

	DEF_TIMERS;
	START_TIMER;

try_again:
	// Attach to the job container
	if (job_attachpid(pid, job->cont_id) == (jid_t) -1) {
		if (errno == EINVAL && (count < 1)) {
			jid_t jid;
			if (proctrack_p_has_pid(job->cont_id, pid)) {
				debug("%s: Trying to add pid (%d) again to the same container, ignoring.",
				      __func__, pid);
				return SLURM_SUCCESS;
			}

			if ((jid = job_detachpid(pid)) != (jid_t) -1) {
				error("%s: Pid %d was attached to container %"PRIu64" incorrectly. Moving to correct (%"PRIu64").",
				      __func__, pid, jid, job->cont_id);
				count++;
				goto try_again;
			} else {
				error("%s: Couldn't detach pid %d from container: %m",
				      __func__, pid);
				return SLURM_ERROR;
			}
		} else {
			error("Failed to attach pid %d to job container: %m",
			      pid);
			return SLURM_ERROR;
		}
	}
	_end_container_thread();

#ifdef HAVE_NATIVE_CRAY
	// Set apid for this pid
	if (job_setapid(pid, SLURM_ID_HASH(job->jobid, job->stepid)) == -1) {
		error("Failed to set pid %d apid: %m", pid);
		return SLURM_ERROR;
	}

	// Explicitly mark pid as an application (/proc/<pid>/task_is_app)
	snprintf(fname, sizeof(fname), "/proc/%d/task_is_app", pid);
	fd = open(fname, O_WRONLY);
	if (fd == -1) {
		error("Failed to open %s: %m", fname);
		return SLURM_ERROR;
	}
	if (write(fd, "1", 1) < 1) {
		error("Failed to write to %s: %m", fname);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}
	TEMP_FAILURE_RETRY(close(fd));
#endif
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);

	return SLURM_SUCCESS;
}
/*
 * Parse an MPMD file and determine count and layout of each task for use
 * with Cray systems. Builds the mpmd_set structure in the job record.
 *
 * IN/OUT job - job step details, builds mpmd_set structure
 * IN gtid - Array of global task IDs, indexed by node_id and task
 */
extern void multi_prog_parse(stepd_step_rec_t *job, uint32_t **gtid)
{
	int i, j, line_num = 0, rank_id, total_ranks = 0;
	char *line = NULL, *local_data = NULL;
	char *end_ptr = NULL, *save_ptr = NULL, *tmp_str = NULL;
	char *rank_spec = NULL, *cmd_spec = NULL, *args_spec = NULL;
	char *p = NULL;
	char **tmp_args, **tmp_cmd, *one_rank;
	uint32_t *ranks_node_id = NULL;	/* Node ID for each rank */
	uint32_t *node_id2nid = NULL;	/* Map Slurm node ID to Cray NID name */
	bool last_line_break = false, line_break = false;
	char *last_rank_spec = NULL;
	int args_len, line_len;
	hostlist_t hl;

	tmp_args = xmalloc(sizeof(char *) * job->ntasks);
	tmp_cmd = xmalloc(sizeof(char *) * job->ntasks);
	node_id2nid = xmalloc(sizeof(uint32_t) * job->nnodes);
	ranks_node_id = xmalloc(sizeof(uint32_t) * job->ntasks);
	local_data = xstrdup(job->argv[1]);

	while (1) {
		if (line_num)
			line = strtok_r(NULL, "\n", &save_ptr);
		else
			line = strtok_r(local_data, "\n", &save_ptr);
		if (!line)
			break;
		line_num++;
		line_len = strlen(line);
		if ((line_len > 0) && (line[line_len - 1] == '\\'))
			line_break = true;
		else
			line_break = false;
		if (last_line_break && last_rank_spec) {
			/* Continuation of the previous line's arguments */
			xstrfmtcat(tmp_str, "[%s]", last_rank_spec);
			hl = hostlist_create(tmp_str);
			xfree(tmp_str);
			if (!hl)
				goto fail;

			while ((one_rank = hostlist_pop(hl))) {
				rank_id = strtol(one_rank, &end_ptr, 10);
				if ((end_ptr[0] != '\0') || (rank_id < 0) ||
				    (rank_id >= job->ntasks)) {
					free(one_rank);
					hostlist_destroy(hl);
					goto fail;
				}
				free(one_rank);

				/* NULL check must precede the strlen() call */
				if (!tmp_args[rank_id]) {
					hostlist_destroy(hl);
					goto fail;
				}
				args_len = strlen(tmp_args[rank_id]);
				if (!args_len ||
				    (tmp_args[rank_id][args_len - 1] != '\\')) {
					hostlist_destroy(hl);
					goto fail;
				}
				tmp_args[rank_id][args_len - 1] = '\0';
				xstrcat(tmp_args[rank_id], line);
			}
			hostlist_destroy(hl);
			last_line_break = line_break;
			continue;
		}
		last_line_break = line_break;
		p = line;
		while ((*p != '\0') && isspace(*p))	/* remove leading spaces */
			p++;
		if (*p == '#')	/* only whole-line comments handled */
			continue;
		if (*p == '\0')	/* blank line ignored */
			continue;
		rank_spec = p;	/* Rank specification for this line */
		while ((*p != '\0') && !isspace(*p))
			p++;
		if (*p == '\0')
			goto fail;
		*p++ = '\0';
		while ((*p != '\0') && isspace(*p))	/* remove leading spaces */
			p++;
		if (*p == '\0')	/* blank line ignored */
			continue;
		cmd_spec = p;	/* command only */
		while ((*p != '\0') && !isspace(*p))
			p++;
		if (isspace(*p))
			*p++ = '\0';
		while ((*p != '\0') && isspace(*p))	/* remove leading spaces */
			p++;
		if (*p == '\0')
			args_spec = NULL;	/* no arguments */
		else
			args_spec = p;		/* arguments string */
		xstrfmtcat(tmp_str, "[%s]", rank_spec);
		hl = hostlist_create(tmp_str);
		xfree(tmp_str);
		if (!hl)
			goto fail;
		while ((one_rank = hostlist_pop(hl))) {
			rank_id = strtol(one_rank, &end_ptr, 10);
			if ((end_ptr[0] != '\0') || (rank_id < 0) ||
			    (rank_id >= job->ntasks)) {
				free(one_rank);
				hostlist_destroy(hl);
				goto fail;
			}
			free(one_rank);
			if (tmp_args[rank_id])	/* duplicate record for rank */
				xfree(tmp_args[rank_id]);
			if (tmp_cmd[rank_id])	/* duplicate record for rank */
				xfree(tmp_cmd[rank_id]);
			else
				total_ranks++;
			tmp_args[rank_id] = xstrdup(args_spec);
			tmp_cmd[rank_id] = xstrdup(cmd_spec);
		}
		hostlist_destroy(hl);
		if (line_break)
			last_rank_spec = rank_spec;
	}
	if (total_ranks != job->ntasks)
		goto fail;

	if (job->msg->complete_nodelist &&
	    ((hl = hostlist_create(job->msg->complete_nodelist)))) {
		i = 0;
		while ((one_rank = hostlist_shift(hl))) {
			if (i >= job->nnodes) {
				error("MPMD more nodes in nodelist than count "
				      "(cnt:%u nodelist:%s)",
				      job->nnodes,
				      job->msg->complete_nodelist);
				free(one_rank);
				break;
			}
			for (j = 0; one_rank[j] && !isdigit(one_rank[j]); j++)
				;
			node_id2nid[i++] = strtol(one_rank + j, &end_ptr, 10);
			free(one_rank);
		}
		hostlist_destroy(hl);
	}

	for (i = 0; i < job->nnodes; i++) {
		if (!job->task_cnts) {
			error("MPMD job->task_cnts is NULL");
			break;
		}
		if (!job->task_cnts[i]) {
			error("MPMD job->task_cnts[%d] is NULL", i);
			break;
		}
		if (!gtid) {
			error("MPMD gtid is NULL");
			break;
		}
		if (!gtid[i]) {
			error("MPMD gtid[%d] is NULL", i);
			break;
		}
		for (j = 0; j < job->task_cnts[i]; j++) {
			if (gtid[i][j] >= job->ntasks) {
				error("MPMD gtid[%d][%d] is invalid (%u >= %u)",
				      i, j, gtid[i][j], job->ntasks);
				break;
			}
			ranks_node_id[gtid[i][j]] = i;
		}
	}

	job->mpmd_set = xmalloc(sizeof(mpmd_set_t));
	job->mpmd_set->apid = SLURM_ID_HASH(job->jobid, job->stepid);
	job->mpmd_set->args = xmalloc(sizeof(char *) * job->ntasks);
	job->mpmd_set->command = xmalloc(sizeof(char *) * job->ntasks);
	job->mpmd_set->first_pe = xmalloc(sizeof(int) * job->ntasks);
	job->mpmd_set->start_pe = xmalloc(sizeof(int) * job->ntasks);
	job->mpmd_set->total_pe = xmalloc(sizeof(int) * job->ntasks);
	job->mpmd_set->placement = xmalloc(sizeof(int) * job->ntasks);

	for (i = 0, j = 0; i < job->ntasks; i++) {
		job->mpmd_set->placement[i] = node_id2nid[ranks_node_id[i]];
		if (i == 0) {
			job->mpmd_set->num_cmds++;
			if (ranks_node_id[i] == job->nodeid)
				job->mpmd_set->first_pe[j] = i;
			else
				job->mpmd_set->first_pe[j] = -1;
			job->mpmd_set->args[j] = xstrdup(tmp_args[i]);
			job->mpmd_set->command[j] = xstrdup(tmp_cmd[i]);
			job->mpmd_set->start_pe[j] = i;
			job->mpmd_set->total_pe[j]++;
		} else if (!xstrcmp(tmp_cmd[i-1], tmp_cmd[i]) &&
			   !xstrcmp(tmp_args[i-1], tmp_args[i]) &&
			   !xstrchr(tmp_args[i-1], '%')) {
			if ((ranks_node_id[i] == job->nodeid) &&
			    (job->mpmd_set->first_pe[j] == -1))
				job->mpmd_set->first_pe[j] = i;
			job->mpmd_set->total_pe[j]++;
		} else {
			j++;
			if (ranks_node_id[i] == job->nodeid)
				job->mpmd_set->first_pe[j] = i;
			else
				job->mpmd_set->first_pe[j] = -1;
			job->mpmd_set->num_cmds++;
			job->mpmd_set->args[j] = xstrdup(tmp_args[i]);
			job->mpmd_set->command[j] = xstrdup(tmp_cmd[i]);
			job->mpmd_set->start_pe[j] = i;
			job->mpmd_set->total_pe[j]++;
		}
	}

#if _DEBUG
	info("MPMD Apid:%"PRIu64"", job->mpmd_set->apid);
	info("MPMD NumPEs:%u", job->ntasks);		/* Total rank count */
	info("MPMD NumPEsHere:%u", job->node_tasks);	/* Node's rank count */
	info("MPMD NumCmds:%d", job->mpmd_set->num_cmds);
	for (i = 0; i < job->mpmd_set->num_cmds; i++) {
		info("MPMD Cmd:%s Args:%s FirstPE:%d StartPE:%d TotalPEs:%d ",
		     job->mpmd_set->command[i], job->mpmd_set->args[i],
		     job->mpmd_set->first_pe[i], job->mpmd_set->start_pe[i],
		     job->mpmd_set->total_pe[i]);
	}
	for (i = 0; i < job->ntasks; i++) {
		info("MPMD Placement[%d]:nid%5.5d",
		     i, job->mpmd_set->placement[i]);
	}
#endif

fini:	for (i = 0; i < job->ntasks; i++) {
		xfree(tmp_args[i]);
		xfree(tmp_cmd[i]);
	}
	xfree(tmp_args);
	xfree(tmp_cmd);
	xfree(local_data);
	xfree(node_id2nid);
	xfree(ranks_node_id);
	return;

fail:	error("Invalid MPMD configuration line %d", line_num);
	goto fini;
}
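/*
 * Illustrative sketch (not part of the plugin): an example of the MPMD
 * configuration this parser accepts, following srun --multi-prog
 * conventions as implemented above.  Each line is a rank specification
 * (single ranks or ranges), a command, and optional arguments; '#' starts a
 * whole-line comment and a trailing backslash continues the argument list
 * on the next line.  The file contents below are hypothetical.
 *
 *   # rank spec   command     arguments
 *   0             ./master    --verbose
 *   1-4,6         ./worker    --input data.%t \
 *                             --log worker.log
 *   5,7           ./monitor
 */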
/*
 * task_term() is called after termination of application task.
 * It is preceded by --task-epilog (from srun command line)
 * followed by TaskEpilog program (from slurm.conf).
 */
extern int task_p_post_term (stepd_step_rec_t *job,
			     stepd_step_task_info_t *task)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	char status;
	int rv, fd;

	debug("task_p_post_term: %u.%u, task %d",
	      job->jobid, job->stepid, job->envtp->procid);

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Open the lli file.
	fd = open(llifile, O_RDONLY);
	if (fd == -1) {
		error("%s: open(%s) failed: %m", __func__, llifile);
		return SLURM_ERROR;
	}

	// Read the first byte (indicates starting)
	rv = read(fd, &status, sizeof(status));
	if (rv == -1) {
		error("%s: read failed: %m", __func__);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// If the first byte is 0, we either aren't an MPI app or
	// it didn't make it past pmi_init, in any case, return success
	if (status == 0) {
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_SUCCESS;
	}

	// Seek to the correct offset (job->envtp->localid + 1)
	rv = lseek(fd, job->envtp->localid + 1, SEEK_SET);
	if (rv == -1) {
		error("%s: lseek failed: %m", __func__);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Read the exiting byte
	rv = read(fd, &status, sizeof(status));
	TEMP_FAILURE_RETRY(close(fd));
	if (rv == -1) {
		error("%s: read failed: %m", __func__);
		return SLURM_SUCCESS;
	}

	// Check the result
	if (status == 0) {
		// Cancel the job step, since we didn't find the exiting msg
		fprintf(stderr, "Terminating job step, task %d improper exit\n",
			job->envtp->procid);
		slurm_terminate_job_step(job->jobid, job->stepid);
	}
#endif
	return SLURM_SUCCESS;
}