/*
 * Function: release_port
 * Description:
 *	Release a previously reserved port back to the free pool.
 *
 * IN real_port - actual TCP port number; must be in [MIN_PORT, MAX_PORT)
 *
 * Returns:
 *	0 on success and -1 on failure (port out of range or not reserved).
 */
int release_port(uint32_t real_port)
{
	uint32_t port;
	int rc = 0;

	if ((real_port < MIN_PORT) || (real_port >= MAX_PORT)) {
		CRAY_ERR("Port %" PRIu32 " outside of valid range %" PRIu32
			 " : %" PRIu32, real_port, MIN_PORT, MAX_PORT);
		return -1;
	}

	/* Scale down to an index into the reservation bitmap */
	port = real_port - MIN_PORT;

	pthread_mutex_lock(&port_mutex);
	if (bit_test(port_resv, port)) {
		bit_clear(port_resv, port);
	} else {
		/* PRIu32 instead of %d: real_port is a uint32_t */
		CRAY_ERR("Attempting to release port %" PRIu32 ","
			 " but it was not reserved.", real_port);
		rc = -1;
	}
	pthread_mutex_unlock(&port_mutex);
	return rc;
}
/* * Start the thread to extend cookie leases. */ extern int start_lease_extender(void) { pthread_attr_t attr_agent; pthread_t thread_agent; int retries = 0; // Start lease extender in the slurmctld if (!_in_slurmctld()) return SLURM_SUCCESS; /* spawn an agent */ slurm_attr_init(&attr_agent); if (pthread_attr_setdetachstate(&attr_agent, PTHREAD_CREATE_DETACHED)) { CRAY_ERR("pthread_attr_setdetachstate error %m"); } retries = 0; while (pthread_create(&thread_agent, &attr_agent, &_lease_extender, NULL)) { error("pthread_create error %m"); if (++retries > 1) { CRAY_ERR("Can't create pthread"); slurm_attr_destroy(&attr_agent); return SLURM_ERROR; } usleep(1000); /* sleep and retry */ } slurm_attr_destroy(&attr_agent); return SLURM_SUCCESS; }
/*
 * task_p_pre_launch() is called prior to exec of application task.
 * It is followed by TaskProlog program (from slurm.conf) and
 * --task-prolog (from srun command line).
 *
 * Publishes per-task environment variables consumed by Cray's PMI
 * layer and tools.  Returns SLURM_ERROR only when a variable that is
 * required for PMI startup cannot be set.
 */
extern int task_p_pre_launch (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	int rc;
	uint64_t apid;
	DEF_TIMERS;

	START_TIMER;

	/* apid identifies this job step to Cray's tools (hash of job+step) */
	apid = SLURM_ID_HASH(job->jobid, job->stepid);
	debug2("task_p_pre_launch: %u.%u, apid %"PRIu64", task %d",
	       job->jobid, job->stepid, apid, job->envtp->procid);

	/*
	 * Send the rank to the application's PMI layer via an environment
	 * variable.
	 */
	rc = env_array_overwrite_fmt(&job->env, ALPS_APP_PE_ENV,
				     "%d", job->envtp->procid);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", ALPS_APP_PE_ENV);
		return SLURM_ERROR;
	}

	/*
	 * Set the PMI_NO_FORK environment variable.
	 */
	rc = env_array_overwrite(&job->env, PMI_NO_FORK_ENV, "1");
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", PMI_NO_FORK_ENV);
		return SLURM_ERROR;
	}

	/*
	 * Notify the task which offset to use in the LLI status file
	 * (offset 0 is the "starting" byte, so local task ids shift by 1).
	 */
	rc = env_array_overwrite_fmt(&job->env, LLI_STATUS_OFFS_ENV,
				     "%d", job->envtp->localid + 1);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s",
			 LLI_STATUS_OFFS_ENV);
		return SLURM_ERROR;
	}

	/*
	 * Set the ALPS_APP_ID environment variable for use by
	 * Cray tools.  NOTE: failure here is logged but deliberately
	 * non-fatal -- only Cray tooling, not the launch, depends on it.
	 */
	rc = env_array_overwrite_fmt(&job->env, ALPS_APP_ID_ENV, "%"PRIu64,
				     apid);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s", ALPS_APP_ID_ENV);
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
/*
 * Function: _get_cpu_total
 * Description:
 *	Get the total number of online cpus on the node by parsing
 *	/sys/devices/system/cpu/online, a comma-separated list of single
 *	CPUs and ranges (e.g. "0-3,8,10-11").
 *
 * RETURNS
 *	Returns the number of online cpus on the node. On error, it returns
 *	-1.
 *
 * TODO:
 *	Danny suggests using xcgroup_get_param to read the CPU values instead
 *	of this function. Look at the way task/cgroup/task_cgroup_cpuset.c or
 *	jobacct_gather/cgroup/jobacct_gather_cgroup.c does it.
 */
static int _get_cpu_total(void)
{
	FILE *f = NULL;
	char *token = NULL, *lin = NULL, *saveptr = NULL;
	int total = 0;
	ssize_t lsz;
	/*
	 * getline() allocates the buffer itself only when *lineptr is NULL
	 * and *n is 0; sz was previously left uninitialized, which is
	 * undefined behavior per the POSIX getline() contract.
	 */
	size_t sz = 0;
	int matches;
	long int number1, number2;

	f = fopen("/sys/devices/system/cpu/online", "r");
	if (!f) {
		CRAY_ERR("Failed to open file"
			 " /sys/devices/system/cpu/online: %m");
		return -1;
	}

	while (!feof(f)) {
		lsz = getline(&lin, &sz, f);
		if (lsz > 0) {
			// Split into comma-separated tokens
			token = strtok_r(lin, ",", &saveptr);
			while (token) {
				// Check each token for a range
				matches = sscanf(token, "%ld-%ld",
						 &number1, &number2);
				if (matches <= 0) {
					// This token isn't numeric
					CRAY_ERR("Error parsing %s: %m",
						 token);
					free(lin);
					TEMP_FAILURE_RETRY(fclose(f));
					return -1;
				} else if (matches == 1) {
					// Single entry
					total++;
				} else if (number2 > number1) {
					// Range
					total += number2 - number1 + 1;
				} else {
					// Invalid range
					CRAY_ERR("Invalid range %s", token);
					free(lin);
					TEMP_FAILURE_RETRY(fclose(f));
					return -1;
				}
				token = strtok_r(NULL, ",", &saveptr);
			}
		}
	}
	free(lin);
	TEMP_FAILURE_RETRY(fclose(f));
	return total;
}
/*
 * Function: assign_port
 * Description:
 *	Looks for and assigns the next free port. This port is used by Cray's
 *	PMI for its communications to manage its control tree.
 *
 *	To avoid port conflicts, this function selects a large range of
 *	ports within the middle of the port range where it assumes no
 *	ports are used. No special precautions are taken to handle a
 *	selected port already in use by some other non-SLURM component
 *	on the node.
 *
 *	If there are no free ports, then it loops through the entire table
 *	ATTEMPTS number of times before declaring a failure.
 *
 * OUT real_port - the allocated port number, scaled up by MIN_PORT
 *
 * Returns:
 *	0 on success and -1 on failure.
 */
int assign_port(uint32_t *real_port)
{
	int port, tmp, attempts = 0;

	if (!real_port) {
		CRAY_ERR("real_port address was NULL.");
		return -1;
	}

	/*
	 * Ports is an index into the reserved port table.
	 * The ports range from 0 up to PORT_CNT.
	 * Start searching just past the most recently allocated port
	 * (last_alloc_port is shared state, so mutate it under the lock).
	 */
	pthread_mutex_lock(&port_mutex);
	port = ++last_alloc_port % PORT_CNT;

	/*
	 * Find an unreserved port to assign.
	 * Abandon the attempt if we've been through the available ports
	 * ATTEMPT number of times.
	 */
	while (bit_test(port_resv, port)) {
		tmp = ++port % PORT_CNT;
		port = tmp;
		attempts++;
		if ((attempts / PORT_CNT) >= ATTEMPTS) {
			CRAY_ERR("No free ports among %d ports. "
				 "Went through entire port list %d times",
				 PORT_CNT, ATTEMPTS);
			pthread_mutex_unlock(&port_mutex);
			return -1;
		} else if ((attempts % PORT_CNT) == 0) {
			/*
			 * Each time through give other threads a chance
			 * to release ports.  The lock is dropped for the
			 * sleep so release_port() can make progress.
			 */
			pthread_mutex_unlock(&port_mutex);
			sleep(1);
			pthread_mutex_lock(&port_mutex);
		}
	}

	bit_set(port_resv, port);
	last_alloc_port = port;
	pthread_mutex_unlock(&port_mutex);

	/*
	 * The port index must be scaled up by the MIN_PORT.
	 */
	*real_port = (port + MIN_PORT);
	return 0;
}
/*
 * switch functions for global state save/restore
 *
 * Serialize the switch/cray port-reservation state into
 * <dir_name>/switch_cray_state.  Returns SLURM_SUCCESS or SLURM_ERROR.
 */
int switch_p_libstate_save(char *dir_name)
{
#ifdef HAVE_NATIVE_CRAY
	Buf buffer;
	char *file_name;
	int ret = SLURM_SUCCESS;
	int state_fd;

	xassert(dir_name != NULL);

	if (debug_flags & DEBUG_FLAG_SWITCH)
		CRAY_INFO("save to %s", dir_name);

	/* Pack current state into a buffer, then persist it */
	buffer = init_buf(SWITCH_BUF_SIZE);
	_state_write_buf(buffer);
	file_name = xstrdup(dir_name);
	xstrcat(file_name, "/switch_cray_state");
	(void) unlink(file_name);
	state_fd = creat(file_name, 0600);
	if (state_fd < 0) {
		CRAY_ERR("Can't save state, error creating file %s %m",
			 file_name);
		ret = SLURM_ERROR;
	} else {
		char  *buf = get_buf_data(buffer);
		size_t len = get_buf_offset(buffer);
		/*
		 * write() may store fewer bytes than requested; loop until
		 * the whole buffer is flushed, retrying on EINTR.
		 */
		while (1) {
			int wrote = write(state_fd, buf, len);
			if ((wrote < 0) && (errno == EINTR))
				continue;
			if (wrote == 0)
				break;
			if (wrote < 0) {
				CRAY_ERR("Can't save switch state: %m");
				ret = SLURM_ERROR;
				break;
			}
			buf += wrote;
			len -= wrote;
		}
		close(state_fd);
	}
	xfree(file_name);

	if (buffer)
		free_buf(buffer);

	return ret;
#else
	return SLURM_SUCCESS;
#endif
}
/*
 * Determines the memory scaling amount (percentage of node memory
 * requested by the step, clamped to [MIN_SCALING, MAX_SCALING]).
 * Returns -1 on failure.
 */
int get_mem_scaling(stepd_step_rec_t *job)
{
	int scaling;
	uint32_t node_mem_kb;

	/* Total node memory in kilobytes, from /proc/meminfo */
	node_mem_kb = _get_mem_total();
	if (node_mem_kb == 0) {
		CRAY_ERR("Scanning /proc/meminfo results in MemTotal=0");
		return -1;
	}

	/*
	 * job->step_mem is in megabytes, so convert the node total from
	 * kilobytes to megabytes, take the percentage, and round to the
	 * nearest integer.
	 *
	 * Note: Because this has caused some confusion in the past,
	 * the MEM_PER_CPU flag indicates that job->step_mem is per-CPU
	 * rather than total, but slurmd reads and clears that flag before
	 * handing the value to slurmstepd; it arrives here already scaled,
	 * so no MEM_PER_CPU handling is needed in this function.
	 */
	scaling = (int) (((double) job->step_mem /
			  ((double) node_mem_kb / 1024)) *
			 (double) 100 + 0.5);

	/* Clamp an over-request down to 100% of the node */
	if (scaling > MAX_SCALING) {
		CRAY_INFO("Memory scaling out of bounds: %d. "
			  "Reducing to %d%%.", scaling, MAX_SCALING);
		scaling = MAX_SCALING;
	}

	/* Clamp an under-request up to the minimum allowed */
	if (scaling < MIN_SCALING) {
		CRAY_ERR("Memory scaling out of bounds: %d. "
			 "Increasing to %d%%", scaling, MIN_SCALING);
		scaling = MIN_SCALING;
	}

	return scaling;
}
/*
 * Determines the cpu scaling amount to use (percentage of the node's
 * CPUs requested by the step, clamped to [MIN_SCALING, MAX_SCALING]).
 * Returns -1 on failure.
 */
int get_cpu_scaling(stepd_step_rec_t *job)
{
	int total_cpus, num_app_cpus, cpu_scaling;

	/*
	 * Get the number of CPUs on the node
	 */
	total_cpus = _get_cpu_total();
	if (total_cpus <= 0) {
		CRAY_ERR("total_cpus <= 0: %d", total_cpus);
		return -1;
	}

	/*
	 * If the submission didn't come from srun (API style)
	 * perhaps they didn't fill in things correctly.
	 */
	if (!job->cpus_per_task) {
		job->cpus_per_task = 1;
	}

	/*
	 * Determine number of CPUs requested for the step; fall back to
	 * tasks * cpus_per_task when job->cpus is unset.
	 */
	num_app_cpus = job->cpus;
	if (num_app_cpus <= 0) {
		num_app_cpus = job->node_tasks * job->cpus_per_task;
		if (num_app_cpus <= 0) {
			CRAY_ERR("num_app_cpus <= 0: %d", num_app_cpus);
			return -1;
		}
	}

	/*
	 * Determine what percentage of the CPUs were requested,
	 * rounded to the nearest integer.
	 */
	cpu_scaling = (((double) num_app_cpus / (double) total_cpus) *
		       (double) 100) + 0.5;
	if (cpu_scaling > MAX_SCALING) {
		/*
		 * Use CRAY_INFO for consistency with get_mem_scaling(),
		 * which also reports the upper clamp at info level.
		 */
		CRAY_INFO("Cpu scaling out of bounds: %d. Reducing to %d%%",
			  cpu_scaling, MAX_SCALING);
		cpu_scaling = MAX_SCALING;
	} else if (cpu_scaling < MIN_SCALING) {
		CRAY_ERR("Cpu scaling out of bounds: %d. Increasing to %d%%",
			 cpu_scaling, MIN_SCALING);
		cpu_scaling = MIN_SCALING;
	}
	return cpu_scaling;
}
/*
 * Search the job's environment to determine if the
 * user requested the MPS to be on or off.
 * Returns 0 for off, 1 for on, 2 for not requested,
 * 3 for error.
 */
static int _get_mps_request(stepd_step_rec_t *job)
{
	char *val;

	/*
	 * Check CRAY_CUDA_MPS first, falling back to the older
	 * CRAY_CUDA_PROXY name.  If neither is set, do nothing.
	 */
	val = getenvp(job->env, CRAY_CUDA_MPS_ENV);
	if (val == NULL)
		val = getenvp(job->env, CRAY_CUDA_PROXY_ENV);
	if (val == NULL) {
		debug2("No GPU action requested");
		return 2;
	}

	if (!strcmp(val, "1") || !strcasecmp(val, "on")) {
		debug2("GPU mps requested on");
		return 1;
	}
	if (!strcmp(val, "0") || !strcasecmp(val, "off")) {
		debug2("GPU mps requested off");
		return 0;
	}

	CRAY_ERR("Couldn't parse %s value %s, expected on,off,0,1",
		 CRAY_CUDA_MPS_ENV, val);
	return 3;
}
/* * init() is called when the plugin is loaded, before any other functions * are called. Put global initialization here. */ extern int init (void) { debug("%s loaded.", plugin_name); #ifdef HAVE_NATIVE_CRAY int rc; struct stat st; debug_flags = slurm_get_debug_flags(); // Create the run directory errno = 0; rc = mkdir(TASK_CRAY_RUN_DIR, 0755); if (rc == -1 && errno != EEXIST) { CRAY_ERR("Couldn't create %s: %m", TASK_CRAY_RUN_DIR); return SLURM_ERROR; } // Determine whether to track app status with LLI rc = stat(LLI_SPOOL_DIR, &st); if (rc == -1) { debug("stat %s failed, disabling exit status tracking: %m", LLI_SPOOL_DIR); track_status = 0; } else { track_status = 1; } #endif return SLURM_SUCCESS; }
/*
 * Get the total amount of memory on the node, in kilobytes, by reading
 * the "MemTotal:" line of /proc/meminfo.
 * Returns 0 on failure.
 */
static uint32_t _get_mem_total(void)
{
	FILE *f = NULL;
	size_t sz = 0;
	ssize_t lsz = 0;
	char *lin = NULL;
	int meminfo_value;
	char meminfo_str[1024];
	uint32_t total_mem = 0;

	f = fopen("/proc/meminfo", "r");
	if (f == NULL) {
		CRAY_ERR("Failed to open /proc/meminfo: %m");
		return 0;
	}

	while (!feof(f)) {
		lsz = getline(&lin, &sz, f);
		if (lsz > 0) {
			/*
			 * Require both label and value to parse; previously
			 * the sscanf return was unchecked, so a non-matching
			 * line left meminfo_str holding stale or
			 * uninitialized data for the strcmp below.  The
			 * %1023s width also prevents overflowing
			 * meminfo_str.
			 */
			if (sscanf(lin, "%1023s %d", meminfo_str,
				   &meminfo_value) != 2)
				continue;
			if (!strcmp(meminfo_str, "MemTotal:")) {
				total_mem = meminfo_value;
				break;
			}
		}
	}
	free(lin);
	TEMP_FAILURE_RETRY(fclose(f));

	return total_mem;
}
/*
 * Build the peCmdMapArray (PE index -> MPMD command index).
 * Returns an xmalloc'd array of job->ntasks ints, or NULL on error.
 */
static int *_get_cmd_map(stepd_step_rec_t *job)
{
	size_t bytes = job->ntasks * sizeof(int);
	int *map = xmalloc(bytes);
	int idx, t, pe;

	if (!job->mpmd_set) {
		/* Single program: every PE runs command index 0 */
		memset(map, 0, bytes);
		return map;
	}

	/* Multiple programs: mark every slot unassigned first */
	for (t = 0; t < job->ntasks; t++)
		map[t] = -1;

	/* For each command, fill start_pe .. start_pe + total_pe - 1 */
	for (idx = 0; idx < job->mpmd_set->num_cmds; idx++) {
		pe = job->mpmd_set->start_pe[idx];
		for (t = 0; t < job->mpmd_set->total_pe[idx]; t++, pe++) {
			if (pe >= job->ntasks) {
				CRAY_ERR("PE index %d too large", pe);
				xfree(map);
				return NULL;
			}
			map[pe] = idx;
		}
	}

	/* Every PE must have been claimed by some command */
	for (pe = 0; pe < job->ntasks; pe++) {
		if (map[pe] == -1) {
			CRAY_ERR("No command on PE index %d", pe);
			xfree(map);
			return NULL;
		}
	}

	return map;
}
/*
 * If it wasn't created already, make the LLI_STATUS_FILE with given owner
 * and group, permissions 644, with given size (one byte per local task,
 * plus one leading "started" byte).
 * Returns SLURM_SUCCESS or SLURM_ERROR.
 */
static int _make_status_file(stepd_step_rec_t *job)
{
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rv, fd;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Make the file; O_EXCL so only the first task on the node creates it
	errno = 0;
	fd = open(llifile, O_CREAT|O_EXCL|O_WRONLY, 0644);
	if (fd == -1) {
		// Another task_p_pre_launch_priv already created it, ignore
		if (errno == EEXIST) {
			return SLURM_SUCCESS;
		}
		/* Message fixed: the call above is open(), not creat() */
		CRAY_ERR("open(%s) failed: %m", llifile);
		return SLURM_ERROR;
	}

	// Resize it: one status byte per local task plus the leading byte
	rv = ftruncate(fd, job->node_tasks + 1);
	if (rv == -1) {
		CRAY_ERR("ftruncate(%s) failed: %m", llifile);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Change owner/group so app can write to it
	rv = fchown(fd, job->uid, job->gid);
	if (rv == -1) {
		CRAY_ERR("chown(%s) failed: %m", llifile);
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}
	info("Created file %s", llifile);

	TEMP_FAILURE_RETRY(close(fd));
	return SLURM_SUCCESS;
}
/*
 * task_p_pre_launch() is called prior to exec of application task.
 * It is followed by TaskProlog program (from slurm.conf) and
 * --task-prolog (from srun command line).
 */
extern int task_p_pre_launch (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	int ret;

	debug("task_p_pre_launch: %u.%u, task %d",
	      job->jobid, job->stepid, job->envtp->procid);

	/* Expose this task's rank to the application's PMI layer */
	ret = env_array_overwrite_fmt(&job->env, ALPS_APP_PE_ENV,
				      "%d", job->envtp->procid);
	if (!ret) {
		CRAY_ERR("Failed to set env variable %s", ALPS_APP_PE_ENV);
		return SLURM_ERROR;
	}

	/* Tell the PMI layer not to fork */
	ret = env_array_overwrite(&job->env, PMI_NO_FORK_ENV, "1");
	if (!ret) {
		CRAY_ERR("Failed to set env variable %s", PMI_NO_FORK_ENV);
		return SLURM_ERROR;
	}

	/* Hand the task its offset in the LLI status file (0 is reserved) */
	ret = env_array_overwrite_fmt(&job->env, LLI_STATUS_OFFS_ENV,
				      "%d", job->envtp->localid + 1);
	if (!ret) {
		CRAY_ERR("Failed to set env variable %s",
			 LLI_STATUS_OFFS_ENV);
		return SLURM_ERROR;
	}
#endif
	return SLURM_SUCCESS;
}
/*
 * init() is called when the plugin is loaded, before any other functions
 * are called. Put global initialization here.
 *
 * Validates the compile-time PMI port range before any reservation is
 * attempted.  Returns SLURM_SUCCESS, or SLURM_ERROR on a bad range.
 */
int init(void)
{
	verbose("%s loaded.", plugin_name);
	debug_flags = slurm_get_debug_flags();

#ifdef HAVE_NATIVE_CRAY
	/* Sanity check: an inverted range would break the port bitmap */
	if (MAX_PORT < MIN_PORT) {
		CRAY_ERR("MAX_PORT: %d < MIN_PORT: %d", MAX_PORT, MIN_PORT);
		return SLURM_ERROR;
	}
#endif
	return SLURM_SUCCESS;
}
/*
 * Write the IAA (inter-application authentication) file and record its
 * name in the job's environment.  Returns SLURM_SUCCESS or SLURM_ERROR.
 */
int write_iaa_file(stepd_step_rec_t *job, slurm_cray_jobinfo_t *sw_job,
		   int *ptags, int num_ptags, alpsc_peInfo_t *alpsc_pe_info)
{
	int rc, ret = SLURM_ERROR;
	char *err_msg = NULL;
	char *fname = xstrdup_printf(CRAY_IAA_FILE, sw_job->apid);

	/* Write the file */
	rc = alpsc_write_iaa_info(&err_msg, fname, sw_job->num_cookies,
				  (const char **) sw_job->cookies,
				  num_ptags, ptags, alpsc_pe_info);
	ALPSC_CN_DEBUG("alpsc_write_iaa_info");
	if (rc != 1)
		goto cleanup;

	/* Hand ownership of the file to the job's user */
	rc = chown(fname, job->uid, job->gid);
	if (rc == -1) {
		CRAY_ERR("chown(%s, %d, %d) failed: %m",
			 fname, (int) job->uid, (int) job->gid);
		goto cleanup;
	}

	/* Point the application at the file via its environment */
	rc = env_array_overwrite(&job->env, CRAY_IAA_INFO_FILE_ENV, fname);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s",
			 CRAY_IAA_INFO_FILE_ENV);
		goto cleanup;
	}
	ret = SLURM_SUCCESS;

cleanup:
	xfree(fname);
	return ret;
}
/*
 * Restore the switch/cray port-reservation state from a packed buffer.
 * On any validation or unpack failure the state is simply not restored
 * (the function returns without touching what was already unpacked).
 *
 * NOTE: the safe_unpack* macros jump to the unpack_error label on a
 * short or malformed buffer.
 */
static void _state_read_buf(Buf buffer)
{
	uint16_t protocol_version = (uint16_t) NO_VAL;
	uint32_t min_port, max_port;
	int i;

	/* Validate state version */
	safe_unpack16(&protocol_version, buffer);
	debug3("Version in switch_cray header is %u", protocol_version);
	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) {
		error("******************************************************");
		error("Can't recover switch/cray state, incompatible version");
		error("******************************************************");
		return;
	}

	if (protocol_version >= SLURM_14_11_PROTOCOL_VERSION) {
		/* Current format: the reservation bitmap is packed whole */
		safe_unpack32(&min_port, buffer);
		safe_unpack32(&max_port, buffer);
		safe_unpack32(&last_alloc_port, buffer);
		unpack_bit_str(&port_resv, buffer);
	} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		/* Legacy format: one packed byte per port slot */
		uint8_t port_set = 0;
		safe_unpack32(&min_port, buffer);
		safe_unpack32(&max_port, buffer);
		safe_unpack32(&last_alloc_port, buffer);
		port_resv = bit_alloc(PORT_CNT);
		for (i = 0; i < PORT_CNT; i++) {
			safe_unpack8(&port_set, buffer);
			if (port_set)
				bit_set(port_resv, i);
		}
	}

	/*
	 * A saved state built against a different compile-time port range
	 * cannot be reused; the port indices would not line up.
	 */
	if ((min_port != MIN_PORT) || (max_port != MAX_PORT)) {
		error("******************************************************");
		error("Can not recover switch/cray state");
		error("Changed MIN_PORT (%u != %u) and/or MAX_PORT (%u != %u)",
		      min_port, MIN_PORT, max_port, MAX_PORT);
		error("******************************************************");
		return;
	}

	return;

unpack_error:
	/* Target of the safe_unpack* macros on buffer corruption */
	CRAY_ERR("unpack error");
	return;
}
/*
 * Get the command index. Note this is incompatible with MPMD so for now
 * we'll just return one of the command indices on this node.
 * Returns -1 if no command is found on this node.
 */
static int _get_cmd_index(stepd_step_rec_t *job)
{
	int i;

	/* Not an MPMD job: the single command has index 0 */
	if (!job->mpmd_set || !job->mpmd_set->first_pe)
		return 0;

	/* Return the first command that has a PE on this node */
	for (i = 0; i < job->mpmd_set->num_cmds; i++) {
		if (job->mpmd_set->first_pe[i] != -1)
			return i;
	}

	/* If we've made it here we didn't find any on this node */
	CRAY_ERR("No command found on this node");
	return -1;
}
/* * init() is called when the plugin is loaded, before any other functions * are called. Put global initialization here. */ extern int init (void) { debug("%s loaded.", plugin_name); char *task_plugin = slurm_get_task_plugin(); char *task_cgroup = strstr(task_plugin, "cgroup"); char *task_cray = strstr(task_plugin, "cray"); if (!task_cgroup || !task_cray || task_cgroup < task_cray) fatal("task/cgroup must be used with, and listed after, " "task/cray in TaskPlugin"); xfree(task_plugin); #ifdef HAVE_NATIVE_CRAY int rc; struct stat st; debug_flags = slurm_get_debug_flags(); // Create the run directory errno = 0; rc = mkdir(TASK_CRAY_RUN_DIR, 0755); if (rc == -1 && errno != EEXIST) { CRAY_ERR("Couldn't create %s: %m", TASK_CRAY_RUN_DIR); return SLURM_ERROR; } // Determine whether to track app status with LLI rc = stat(LLI_SPOOL_DIR, &st); if (rc == -1) { debug("stat %s failed, disabling exit status tracking: %m", LLI_SPOOL_DIR); track_status = 0; } else { track_status = 1; } #endif return SLURM_SUCCESS; }
/* * Update the number of running steps on the node * Set val to 1 to increment and -1 to decrement the value * Returns the new value, or -1 on error */ static int _update_num_steps(int val) { int rc, fd, num_steps = 0; ssize_t size; off_t offset; struct flock lock; // Sanity check the argument if (val != 1 && val != -1) { CRAY_ERR("invalid val %d", val); return -1; } // Open the file fd = open(NUM_STEPS_FILE, O_RDWR | O_CREAT, 0644); if (fd == -1) { CRAY_ERR("open failed: %m"); return -1; } // Exclusive lock on the first byte of the file // Automatically released when the file descriptor is closed lock.l_type = F_WRLCK; lock.l_whence = SEEK_SET; lock.l_start = 0; lock.l_len = sizeof(int); rc = fcntl(fd, F_SETLKW, &lock); if (rc == -1) { CRAY_ERR("fcntl failed: %m"); TEMP_FAILURE_RETRY(close(fd)); return -1; } // Read the value size = read(fd, &num_steps, sizeof(int)); if (size == -1) { CRAY_ERR("read failed: %m"); TEMP_FAILURE_RETRY(close(fd)); return -1; } else if (size == 0) { // Value doesn't exist, must be the first step num_steps = 0; } // Increment or decrement and check result num_steps += val; if (num_steps < 0) { CRAY_ERR("Less than 0 steps on the node"); TEMP_FAILURE_RETRY(close(fd)); return 0; } // Write the new value offset = lseek(fd, 0, SEEK_SET); if (offset == -1) { CRAY_ERR("fseek failed: %m"); TEMP_FAILURE_RETRY(close(fd)); return -1; } size = write(fd, &num_steps, sizeof(int)); if (size < sizeof(int)) { CRAY_ERR("write failed: %m"); TEMP_FAILURE_RETRY(close(fd)); return -1; } if (debug_flags & DEBUG_FLAG_TASK) { debug("Wrote %d steps to %s", num_steps, NUM_STEPS_FILE); } TEMP_FAILURE_RETRY(close(fd)); return num_steps; }
static int _get_cpu_masks(int num_numa_nodes, int32_t *numa_array, cpu_set_t **cpuMasks) { struct bitmask **remaining_numa_node_cpus = NULL, *collective; unsigned long **numa_node_cpus = NULL; int i, j, at_least_one_cpu = 0, rc = 0; cpu_set_t *cpusetptr; char *bitmask_str = NULL; if (numa_available()) { CRAY_ERR("Libnuma not available"); return -1; } /* * numa_node_cpus: The CPUs available to the NUMA node. * numa_all_cpus_ptr: all CPUs on which the calling task may execute. * remaining_numa_node_cpus: Bitwise-AND of the above two to get all of * the CPUs that the task can run on in this * NUMA node. * collective: Collects all of the CPUs as a precaution. */ remaining_numa_node_cpus = xmalloc(num_numa_nodes * sizeof(struct bitmask *)); collective = numa_allocate_cpumask(); numa_node_cpus = xmalloc(num_numa_nodes * sizeof(unsigned long*)); for (i = 0; i < num_numa_nodes; i++) { remaining_numa_node_cpus[i] = numa_allocate_cpumask(); numa_node_cpus[i] = xmalloc(sizeof(unsigned long) * NUM_INTS_TO_HOLD_ALL_CPUS); rc = numa_node_to_cpus(numa_array[i], numa_node_cpus[i], NUM_INTS_TO_HOLD_ALL_CPUS); if (rc) { CRAY_ERR("numa_node_to_cpus failed: Return code %d", rc); } for (j = 0; j < NUM_INTS_TO_HOLD_ALL_CPUS; j++) { (remaining_numa_node_cpus[i]->maskp[j]) = (numa_node_cpus[i][j]) & (numa_all_cpus_ptr->maskp[j]); collective->maskp[j] |= (remaining_numa_node_cpus[i]->maskp[j]); } } /* * Ensure that we have not masked off all of the CPUs. * If we have, just re-enable them all. Better to clear them all than * none of them. 
*/ for (j = 0; j < collective->size; j++) { if (numa_bitmask_isbitset(collective, j)) { at_least_one_cpu = 1; } } if (!at_least_one_cpu) { for (i = 0; i < num_numa_nodes; i++) { for (j = 0; j < (remaining_numa_node_cpus[i]->size / (sizeof(unsigned long) * 8)); j++) { (remaining_numa_node_cpus[i]->maskp[j]) = (numa_all_cpus_ptr->maskp[j]); } } } if (debug_flags & DEBUG_FLAG_TASK) { bitmask_str = NULL; for (i = 0; i < num_numa_nodes; i++) { for (j = 0; j < NUM_INTS_TO_HOLD_ALL_CPUS; j++) { xstrfmtcat(bitmask_str, "%6lx ", numa_node_cpus[i][j]); } } info("%sBitmask: Allowed CPUs for NUMA Node", bitmask_str); xfree(bitmask_str); bitmask_str = NULL; for (i = 0; i < num_numa_nodes; i++) { for (j = 0; j < NUM_INTS_TO_HOLD_ALL_CPUS; j++) { xstrfmtcat(bitmask_str, "%6lx ", numa_all_cpus_ptr->maskp[j]); } } info("%sBitmask: Allowed CPUs for cpuset", bitmask_str); xfree(bitmask_str); bitmask_str = NULL; for (i = 0; i < num_numa_nodes; i++) { for (j = 0; j < NUM_INTS_TO_HOLD_ALL_CPUS; j++) { xstrfmtcat(bitmask_str, "%6lx ", remaining_numa_node_cpus[i]-> maskp[j]); } } info("%sBitmask: Allowed CPUs between cpuset and NUMA Node", bitmask_str); xfree(bitmask_str); } // Convert bitmasks to cpu_set_t types cpusetptr = xmalloc(num_numa_nodes * sizeof(cpu_set_t)); for (i = 0; i < num_numa_nodes; i++) { CPU_ZERO(&cpusetptr[i]); for (j = 0; j < remaining_numa_node_cpus[i]->size; j++) { if (numa_bitmask_isbitset(remaining_numa_node_cpus[i], j)) { CPU_SET(j, &cpusetptr[i]); } } if (debug_flags & DEBUG_FLAG_TASK) { info("CPU_COUNT() of set: %d", CPU_COUNT(&cpusetptr[i])); } } *cpuMasks = cpusetptr; // Freeing Everything numa_free_cpumask(collective); for (i = 0; i < num_numa_nodes; i++) { xfree(numa_node_cpus[i]); numa_free_cpumask(remaining_numa_node_cpus[i]); } xfree(numa_node_cpus); xfree(numa_node_cpus); xfree(remaining_numa_node_cpus); return 0; }
/*
 * task_p_post_step() is called after termination of the step
 * (all the tasks)
 */
extern int task_p_post_step (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rc, cnt;
	char *err_msg = NULL, path[PATH_MAX];
	int32_t *numa_nodes;
	cpu_set_t *cpuMasks;

	if (track_status) {
		// Get the lli file name
		snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
			 SLURM_ID_HASH(job->jobid, job->stepid));

		// Unlink the file
		errno = 0;
		rc = unlink(llifile);
		if (rc == -1 && errno != ENOENT) {
			CRAY_ERR("unlink(%s) failed: %m", llifile);
		} else if (rc == 0) {
			info("Unlinked %s", llifile);
		}
	}

	/*
	 * Compact Memory
	 *
	 * Determine which NUMA nodes and CPUS an application is using.  It
	 * will be used to compact the memory.
	 *
	 * You'll find the information in the following location.
	 * For a normal job step:
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_<stepID>/
	 *
	 * For a batch job step (only on the head node and only for batch
	 * jobs):
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_batch/
	 *
	 * NUMA node: mems
	 * CPU Masks: cpus
	 */
	if (job->batch) {
		// Batch Job Step
		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%"
			      PRIu32 "/step_batch", job->uid, job->jobid);
		if (rc < 0) {
			CRAY_ERR("snprintf failed. Return code: %d", rc);
			return SLURM_ERROR;
		}
	} else {
		// Normal Job Step

		/* Only run epilogue on non-batch steps */
		_step_epilogue();

		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%"
			      PRIu32 "/step_%" PRIu32,
			      job->uid, job->jobid, job->stepid);
		if (rc < 0) {
			CRAY_ERR("snprintf failed. Return code: %d", rc);
			return SLURM_ERROR;
		}
	}

	rc = _get_numa_nodes(path, &cnt, &numa_nodes);
	if (rc < 0) {
		CRAY_ERR("get_numa_nodes failed. Return code: %d", rc);
		return SLURM_ERROR;
	}

	rc = _get_cpu_masks(cnt, numa_nodes, &cpuMasks);
	if (rc < 0) {
		CRAY_ERR("get_cpu_masks failed. Return code: %d", rc);
		/* Fix: numa_nodes was leaked on this error path */
		xfree(numa_nodes);
		return SLURM_ERROR;
	}

	/*
	 * Compact Memory
	 * The last argument which is a path to the cpuset directory has to
	 * be NULL because the CPUSET directory has already been cleaned up.
	 */
	rc = alpsc_compact_mem(&err_msg, cnt, numa_nodes, cpuMasks, NULL);
	_ALPSC_DEBUG("alpsc_compact_mem");

	xfree(numa_nodes);
	xfree(cpuMasks);

	if (rc != 1) {
		return SLURM_ERROR;
	}
#endif
	return SLURM_SUCCESS;
}
/*
 * Check the status file for the exit of the given local task id
 * and terminate the job step if an improper exit is found
 * (an apparently-clean exit that never reported through the LLI file).
 */
static int _check_status_file(stepd_step_rec_t *job)
{
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	char status;
	int rv, fd;
	stepd_step_task_info_t *task;
	char *reason;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Open the lli file.
	fd = open(llifile, O_RDONLY);
	if (fd == -1) {
		CRAY_ERR("open(%s) failed: %m", llifile);
		return SLURM_ERROR;
	}

	// Read the first byte (indicates starting)
	rv = read(fd, &status, sizeof(status));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		/* Fix: fd was leaked on this error path */
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// If the first byte is 0, we either aren't an MPI app or
	// it didn't make it past pmi_init, in any case, return success
	if (status == 0) {
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_SUCCESS;
	}

	// Seek to the correct offset (task's slot is localid + 1)
	rv = lseek(fd, job->envtp->localid + 1, SEEK_SET);
	if (rv == -1) {
		CRAY_ERR("lseek failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Read the exiting byte
	rv = read(fd, &status, sizeof(status));
	TEMP_FAILURE_RETRY(close(fd));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		return SLURM_SUCCESS;
	}

	// Check the result
	if (status == 0 && !terminated) {
		task = job->task[job->envtp->localid];
		if (task->killed_by_cmd) {
			// We've been killed by request. User already knows
			return SLURM_SUCCESS;
		} else if (task->aborted) {
			reason = "aborted";
		} else if (WIFSIGNALED(task->estatus)) {
			reason = "signaled";
		} else {
			reason = "exited";
		}

		// Cancel the job step, since we didn't find the exiting msg
		error("Terminating job step %"PRIu32".%"PRIu32
		      "; task %d exit code %d %s without notification",
		      job->jobid, job->stepid, task->gtid,
		      WEXITSTATUS(task->estatus), reason);
		terminated = 1;
		slurm_terminate_job_step(job->jobid, job->stepid);
	}
	return SLURM_SUCCESS;
}
/*
 * Fill in an alpsc_peInfo_t structure for the given job step.
 *
 * IN  job           - the step record (must have at least one task)
 * OUT alpsc_pe_info - populated structure; on success the three arrays
 *                     it holds are owned by the caller and released via
 *                     free_alpsc_pe_info()
 * OUT cmd_index     - MPMD command index for this node (0 if not MPMD)
 *
 * Returns SLURM_SUCCESS or SLURM_ERROR.  On error nothing is leaked:
 * any arrays already attached are freed by free_alpsc_pe_info().
 */
int build_alpsc_pe_info(stepd_step_rec_t *job,
			alpsc_peInfo_t *alpsc_pe_info, int *cmd_index)
{
	// Sanity check everything here so we don't need to
	// do it everywhere else
	if (job == NULL) {
		CRAY_ERR("NULL job pointer");
		return SLURM_ERROR;
	} else if (job->ntasks < 1) {
		CRAY_ERR("Not enough tasks %d", job->ntasks);
		return SLURM_ERROR;
	} else if (alpsc_pe_info == NULL) {
		CRAY_ERR("NULL alpsc_pe_info");
		return SLURM_ERROR;
	} else if (cmd_index == NULL) {
		CRAY_ERR("NULL cmd_index");
		return SLURM_ERROR;
	} else if (job->multi_prog) {
		/* MPMD steps additionally need a fully populated mpmd_set */
		if (job->mpmd_set == NULL) {
			CRAY_ERR("MPMD launch but no mpmd_set");
			return SLURM_ERROR;
		} else if (job->mpmd_set->first_pe == NULL) {
			CRAY_ERR("NULL first_pe");
			return SLURM_ERROR;
		} else if (job->mpmd_set->start_pe == NULL) {
			CRAY_ERR("NULL start_pe");
			return SLURM_ERROR;
		} else if (job->mpmd_set->total_pe == NULL) {
			CRAY_ERR("NULL total_pe");
			return SLURM_ERROR;
		} else if (job->mpmd_set->placement == NULL) {
			CRAY_ERR("NULL placement");
			return SLURM_ERROR;
		} else if (job->mpmd_set->num_cmds < 1) {
			CRAY_ERR("Not enough commands %d",
				 job->mpmd_set->num_cmds);
			return SLURM_ERROR;
		}
	}

	// Fill in the structure
	alpsc_pe_info->totalPEs = job->ntasks;
	alpsc_pe_info->firstPeHere = _get_first_pe(job);
	alpsc_pe_info->pesHere = job->node_tasks;
	alpsc_pe_info->peDepth = job->cpus_per_task;
	alpsc_pe_info->peNidArray = _get_pe_nid_map(job);
	alpsc_pe_info->peCmdMapArray = _get_cmd_map(job);
	alpsc_pe_info->nodeCpuArray = _get_node_cpu_map(job);

	// Get the command index
	*cmd_index = _get_cmd_index(job);

	// Check results: any helper may have returned NULL/-1 on failure
	if (alpsc_pe_info->peNidArray == NULL ||
	    alpsc_pe_info->peCmdMapArray == NULL ||
	    alpsc_pe_info->nodeCpuArray == NULL || *cmd_index == -1) {
		free_alpsc_pe_info(alpsc_pe_info);
		return SLURM_ERROR;
	}

	// Print pe info if debug flag is set
	if (debug_flags & DEBUG_FLAG_SWITCH) {
		_print_alpsc_pe_info(alpsc_pe_info, *cmd_index);
	}

	return SLURM_SUCCESS;
}
/*
 * Check the status file for the exit of the given local task id.
 * This variant runs at task post-termination: it only warns (does not
 * terminate the step) when a task exited 0 without calling
 * PMI_Finalize().
 */
static int _check_status_file(stepd_step_rec_t *job,
			      stepd_step_task_info_t *task)
{
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	char status;
	int rv, fd;

	debug("task_p_post_term: %u.%u, task %d",
	      job->jobid, job->stepid, job->envtp->procid);

	// We only need to special case termination with exit(0)
	// srun already handles abnormal exit conditions fine
	if (!WIFEXITED(task->estatus) || (WEXITSTATUS(task->estatus) != 0))
		return SLURM_SUCCESS;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Open the lli file.
	fd = open(llifile, O_RDONLY);
	if (fd == -1) {
		// There's a timing issue for large jobs; this file could
		// already be cleaned up by the time we get here.
		// However, this is during a normal cleanup so no big deal.
		debug("open(%s) failed: %m", llifile);
		return SLURM_SUCCESS;
	}

	// Read the first byte (indicates starting)
	rv = read(fd, &status, sizeof(status));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		/* Fix: fd was leaked on this error path */
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// If the first byte is 0, we either aren't an MPI app or
	// it didn't make it past pmi_init, in any case, return success
	if (status == 0) {
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_SUCCESS;
	}

	// Seek to the correct offset (task's slot is localid + 1)
	rv = lseek(fd, job->envtp->localid + 1, SEEK_SET);
	if (rv == -1) {
		CRAY_ERR("lseek failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Read the exiting byte
	rv = read(fd, &status, sizeof(status));
	TEMP_FAILURE_RETRY(close(fd));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		return SLURM_SUCCESS;
	}

	// Check the result
	if (status == 0) {
		if (task->killed_by_cmd) {
			// We've been killed by request. User already knows
			return SLURM_SUCCESS;
		}

		verbose("step %u.%u task %u exited without calling "
			"PMI_Finalize()",
			job->jobid, job->stepid, task->gtid);
	}
	return SLURM_SUCCESS;
}
/*
 * Restore switch/cray plugin state from <dir_name>/switch_cray_state.
 *
 * dir_name - state save directory (must not be NULL)
 * recover  - when false this is a clean start and nothing is read
 *
 * Always returns SLURM_SUCCESS; a missing state file or read error is
 * logged and treated as a clean start.  No-op unless HAVE_NATIVE_CRAY.
 */
int switch_p_libstate_restore(char *dir_name, bool recover)
{
#ifdef HAVE_NATIVE_CRAY
	char *data = NULL, *file_name;
	Buf buffer = NULL;
	int error_code = SLURM_SUCCESS;
	int state_fd, data_allocated = 0, data_read = 0, data_size = 0;

	xassert(dir_name != NULL);

	if (debug_flags & DEBUG_FLAG_SWITCH) {
		CRAY_INFO("restore from %s, recover %d",
			  dir_name, (int) recover);
	}

	if (!recover)		/* clean start, no recovery */
		return SLURM_SUCCESS;

	file_name = xstrdup(dir_name);
	xstrcat(file_name, "/switch_cray_state");
	state_fd = open (file_name, O_RDONLY);
	if (state_fd >= 0) {
		/* Slurp the whole file into an xmalloc'd buffer, growing
		 * it so there is always >= SWITCH_BUF_SIZE headroom for
		 * the next read. */
		data_allocated = SWITCH_BUF_SIZE;
		data = xmalloc(data_allocated);
		while (1) {
			data_read = read (state_fd, &data[data_size],
					  SWITCH_BUF_SIZE);
			if ((data_read < 0) && (errno == EINTR))
				continue;	/* interrupted; retry */
			if (data_read < 0) {
				CRAY_ERR("Read error on %s, %m", file_name);
				error_code = SLURM_ERROR;
				break;
			} else if (data_read == 0)	/* EOF */
				break;
			data_size += data_read;
			data_allocated += data_read;
			xrealloc(data, data_allocated);
		}
		close (state_fd);
		(void) unlink(file_name);	/* One chance to recover */
		xfree(file_name);
	} else {
		/* Missing file is not fatal: start with clean state */
		CRAY_ERR("No %s file for switch/cray state recovery",
			 file_name);
		CRAY_ERR("Starting switch/cray with clean state");
		xfree(file_name);
		return SLURM_SUCCESS;
	}

	if (error_code == SLURM_SUCCESS) {
		/* create_buf() takes ownership of data */
		buffer = create_buf (data, data_size);
		data = NULL;	/* now in buffer, don't xfree() */
		_state_read_buf(buffer);
	}

	if (buffer)
		free_buf(buffer);
	xfree(data);
#endif
	return SLURM_SUCCESS;
}
/*
 * Function: _get_numa_nodes
 * Description:
 *  Returns a count of the NUMA nodes that the application is running on.
 *
 *  Returns an array of NUMA nodes that the application is running on.
 *
 *
 *  IN char* path -- The path to the directory containing the files containing
 *                   information about NUMA nodes.
 *
 *  OUT *cnt -- The number of NUMA nodes in the array
 *  OUT **numa_array -- An integer array containing the NUMA nodes.
 *                      This array must be xfreed by the caller.
 *
 * RETURN
 *  0 on success and -1 (or SLURM_ERROR) on failure.
 */
static int _get_numa_nodes(char *path, int *cnt, int32_t **numa_array)
{
	struct bitmask *bm;
	int i, index, rc = 0;
	int lsz;
	/* sz must be 0 while lin is NULL: POSIX getline() requires *n
	 * to be valid (it was previously passed uninitialized) */
	size_t sz = 0;
	char buffer[PATH_MAX];
	FILE *f = NULL;
	char *lin = NULL;

	rc = snprintf(buffer, sizeof(buffer), "%s/%s", path, "mems");
	if (rc < 0) {
		CRAY_ERR("snprintf failed. Return code: %d", rc);
	}

	f = fopen(buffer, "r");
	if (f == NULL ) {
		CRAY_ERR("Failed to open file %s: %m", buffer);
		return -1;
	}

	lsz = getline(&lin, &sz, f);
	/* Fix: the stream was never closed before; only one line is read */
	fclose(f);
	if (lsz > 0) {
		/* Strip the trailing newline so libnuma can parse it */
		if (lin[strlen(lin) - 1] == '\n') {
			lin[strlen(lin) - 1] = '\0';
		}
		bm = numa_parse_nodestring(lin);
		if (bm == NULL ) {
			CRAY_ERR("Error numa_parse_nodestring:"
				 " Invalid node string: %s", lin);
			free(lin);
			return SLURM_ERROR;
		}
	} else {
		CRAY_ERR("Reading %s failed", buffer);
		/* Fix: getline() may allocate even on failure */
		free(lin);
		return SLURM_ERROR;
	}
	free(lin);

	*cnt = numa_bitmask_weight(bm);
	if (*cnt == 0) {
		CRAY_ERR("No NUMA Nodes found");
		/* Fix: the nodemask was previously leaked on this path */
		numa_free_nodemask(bm);
		return -1;
	}

	if (debug_flags & DEBUG_FLAG_TASK) {
		info("Bitmask %#lx size: %lu sizeof(*(bm->maskp)): %zd"
		     " weight: %u",
		     *(bm->maskp), bm->size, sizeof(*(bm->maskp)), *cnt);
	}

	/* Collect the indices of the set bits into the output array */
	*numa_array = xmalloc(*cnt * sizeof(int32_t));

	index = 0;
	for (i = 0; i < bm->size; i++) {
		if (*(bm->maskp) & ((long unsigned) 1 << i)) {
			if (debug_flags & DEBUG_FLAG_TASK) {
				info("(%s: %d: %s) NUMA Node %d is present",
				     THIS_FILE, __LINE__, __FUNCTION__, i);
			}
			(*numa_array)[index++] = i;
		}
	}

	numa_free_nodemask(bm);
	return 0;
}
/*
 * Unpack saved switch/cray state (port reservations) from buffer.
 *
 * On any unpack failure the safe_unpack* macros jump to unpack_error;
 * a 'locked' flag guarantees port_mutex is released on that path
 * (previously the error path returned with the mutex still held,
 * deadlocking any later port operation).
 */
static void _state_read_buf(Buf buffer)
{
	uint16_t protocol_version = (uint16_t) NO_VAL;
	uint32_t min_port, max_port;
	int i;
	bool locked = false;	/* true while port_mutex is held */

	/* Validate state version */
	safe_unpack16(&protocol_version, buffer);
	debug3("Version in switch_cray header is %u", protocol_version);
	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) {
		error("******************************************************");
		error("Can't recover switch/cray state, incompatible version");
		error("******************************************************");
		return;
	}

	pthread_mutex_lock(&port_mutex);
	locked = true;
	if (protocol_version >= SLURM_14_11_PROTOCOL_VERSION) {
		/* Modern format: bitmap saved as a hex string */
		safe_unpack32(&min_port, buffer);
		safe_unpack32(&max_port, buffer);
		safe_unpack32(&last_alloc_port, buffer);
		/* make sure we are NULL here */
		FREE_NULL_BITMAP(port_resv);
		unpack_bit_str_hex(&port_resv, buffer);
	} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		/* Legacy format: one byte per port */
		uint8_t port_set = 0;
		safe_unpack32(&min_port, buffer);
		safe_unpack32(&max_port, buffer);
		safe_unpack32(&last_alloc_port, buffer);
		/* make sure we are NULL here */
		FREE_NULL_BITMAP(port_resv);
		port_resv = bit_alloc(PORT_CNT);
		for (i = 0; i < PORT_CNT; i++) {
			safe_unpack8(&port_set, buffer);
			if (port_set)
				bit_set(port_resv, i);
		}
	}
	/* Repair the bitmap if the recovered size is wrong */
	if (!port_resv || (bit_size(port_resv) != PORT_CNT)) {
		error("_state_read_buf: Reserve Port size was %d not %d, "
		      "reallocating", port_resv ? bit_size(port_resv) : -1,
		      PORT_CNT);
		port_resv = bit_realloc(port_resv, PORT_CNT);
	}
	pthread_mutex_unlock(&port_mutex);
	locked = false;

	/* Reject state saved under a different port range */
	if ((min_port != MIN_PORT) || (max_port != MAX_PORT)) {
		error("******************************************************");
		error("Can not recover switch/cray state");
		error("Changed MIN_PORT (%u != %u) and/or MAX_PORT (%u != %u)",
		      min_port, MIN_PORT, max_port, MAX_PORT);
		error("******************************************************");
		return;
	}
	return;

unpack_error:
	/* Fix: release the mutex if an unpack failed while it was held */
	if (locked)
		pthread_mutex_unlock(&port_mutex);
	CRAY_ERR("unpack error");
	return;
}
/*
 * Get the pe to nid map, or NULL on error
 *
 * Returns an xmalloc'd array of job->ntasks ints, where entry i is the
 * nid the i-th PE (task) runs on.  Caller must xfree it.
 */
static int *_get_pe_nid_map(stepd_step_rec_t *job)
{
	size_t size;
	int *pe_nid_map = NULL;
	int cnt = 0, task, i, j, rc;
	int32_t *nodes = NULL;
	int tasks_to_launch_sum, nid;

	size = job->ntasks * sizeof(int);
	pe_nid_map = xmalloc(size);

	// If we have it, just copy the mpmd set information
	if (job->mpmd_set && job->mpmd_set->placement) {
		// mpmd_set->placement is an int * too so this works
		memcpy(pe_nid_map, job->mpmd_set->placement, size);
	} else {
		// Initialize to -1 so we can tell if we missed any
		for (i = 0; i < job->ntasks; i++) {
			pe_nid_map[i] = -1;
		}

		// Convert the node list to an array of nids
		rc = list_str_to_array(job->msg->complete_nodelist, &cnt,
				       &nodes);
		if (rc < 0) {
			// NOTE(review): assumes list_str_to_array leaves
			// nodes unallocated on failure -- confirm
			xfree(pe_nid_map);
			return NULL;
		} else if (job->nnodes != cnt) {
			CRAY_ERR("list_str_to_array cnt %d expected %u",
				 cnt, job->nnodes);
			xfree(pe_nid_map);
			xfree(nodes);
			return NULL;
		}

		// Search the task id map for the values we need
		tasks_to_launch_sum = 0;
		for (i = 0; i < job->nnodes; i++) {
			tasks_to_launch_sum += job->msg->tasks_to_launch[i];
			for (j = 0; j < job->msg->tasks_to_launch[i]; j++) {
				task = job->msg->global_task_ids[i][j];
				pe_nid_map[task] = nodes[i];
			}
		}

		// If this is LAM/MPI only one task per node is launched,
		// NOT job->ntasks. So fill in the rest of the tasks
		// assuming a block distribution
		if (tasks_to_launch_sum == job->nnodes &&
		    job->nnodes < job->ntasks) {
			nid = nodes[0]; // failsafe value
			// Walk forward, carrying the last known nid into
			// the unassigned (-1) entries after it
			for (i = 0; i < job->ntasks; i++) {
				if (pe_nid_map[i] > -1) {
					nid = pe_nid_map[i];
				} else {
					pe_nid_map[i] = nid;
				}
			}
		}
		xfree(nodes);

		// Make sure we didn't miss any tasks
		for (i = 0; i < job->ntasks; i++) {
			if (pe_nid_map[i] == -1) {
				CRAY_ERR("No NID for PE index %d", i);
				xfree(pe_nid_map);
				return NULL;
			}
		}
	}
	return pe_nid_map;
}