/* * Remove a cookie from the tracked cookie list */ static void _remove_cookie(int32_t cookie_id) { int32_t i; int found = 0; // Lock the mutex slurm_mutex_lock(&cookie_id_mutex); // Find a match in the list for (i = 0; i < cookie_id_list_size; i++) { if (cookie_id_list[i] == cookie_id) { // Copy the last id to this spot if (i < cookie_id_list_size - 1) { cookie_id_list[i] = cookie_id_list[cookie_id_list_size - 1]; } found = 1; cookie_id_list_size--; break; } } if (!found) { CRAY_INFO("Cookie %"PRId32" not found in tracked cookie list", cookie_id); } // Unlock the mutex slurm_mutex_unlock(&cookie_id_mutex); }
/*
 * switch functions for global state save/restore
 */

/*
 * Save the plugin's global state to "<dir_name>/switch_cray_state".
 *
 * The state is packed into a buffer by _state_write_buf(), any existing
 * state file is unlinked, and a new file is created with mode 0600 and
 * filled with the packed bytes.
 *
 * dir_name: directory in which the state file is written (must not be NULL)
 *
 * Returns SLURM_SUCCESS on success, SLURM_ERROR if the file cannot be
 * created or fully written. Without HAVE_NATIVE_CRAY this is a no-op that
 * returns SLURM_SUCCESS.
 */
int switch_p_libstate_save(char *dir_name)
{
#ifdef HAVE_NATIVE_CRAY
	Buf buffer;
	char *file_name;
	int ret = SLURM_SUCCESS;
	int state_fd;

	xassert(dir_name != NULL);

	if (debug_flags & DEBUG_FLAG_SWITCH)
		CRAY_INFO("save to %s", dir_name);

	buffer = init_buf(SWITCH_BUF_SIZE);
	_state_write_buf(buffer);

	file_name = xstrdup(dir_name);
	xstrcat(file_name, "/switch_cray_state");
	(void) unlink(file_name);
	state_fd = creat(file_name, 0600);
	if (state_fd < 0) {
		CRAY_ERR("Can't save state, error creating file %s %m",
			 file_name);
		ret = SLURM_ERROR;
	} else {
		char *buf = get_buf_data(buffer);
		size_t len = get_buf_offset(buffer);

		// Keep writing until every packed byte has been handed to
		// the kernel; write() may accept fewer bytes than requested.
		while (len > 0) {
			ssize_t wrote = write(state_fd, buf, len);

			if (wrote < 0) {
				if (errno == EINTR)
					continue;	// interrupted, retry
				CRAY_ERR("Can't save switch state: %m");
				ret = SLURM_ERROR;
				break;
			}
			if (wrote == 0) {
				// No progress with data still pending: report
				// it rather than leave a silently truncated
				// state file behind.
				CRAY_ERR("Can't save switch state: "
					 "write returned 0 with data remaining");
				ret = SLURM_ERROR;
				break;
			}
			buf += wrote;
			len -= (size_t) wrote;
		}
		close(state_fd);
	}

	xfree(file_name);
	if (buffer)
		free_buf(buffer);
	return ret;
#else
	return SLURM_SUCCESS;
#endif
}
/*
 * Work out the memory scaling percentage for a job step.
 *
 * The step's memory request (job->step_mem) is expressed as a percentage
 * of the total memory returned by _get_mem_total(), rounded to the nearest
 * integer and then clamped to the range [MIN_SCALING, MAX_SCALING]. Both
 * clamping cases are logged.
 *
 * Returns the clamped percentage, or -1 if the total memory is reported
 * as zero.
 */
int get_mem_scaling(stepd_step_rec_t *job)
{
	uint32_t node_mem_kb;
	int scaling_pct;

	/* Total memory on the node; zero means the scan failed */
	node_mem_kb = _get_mem_total();
	if (node_mem_kb == 0) {
		CRAY_ERR("Scanning /proc/meminfo results in MemTotal=0");
		return -1;
	}

	/*
	 * The total is in kilobytes while the request is in megabytes,
	 * so the total is divided by 1024 before taking the ratio.
	 * Adding 0.5 before the conversion to int rounds to the nearest
	 * integer.
	 *
	 * A request above the maximum is reduced to MAX_SCALING; a
	 * request below the minimum (including a request of zero) is
	 * raised to MIN_SCALING rather than rejected.
	 *
	 * Note: Because this has caused some confusion in the past,
	 * the MEM_PER_CPU flag marks job->step_mem as a per-CPU amount
	 * rather than a total. That flag is read and cleared in slurmd
	 * before the value is handed to slurmstepd, so the value seen
	 * here is already properly scaled and the flag does not need to
	 * be checked in this function.
	 */
	scaling_pct = ((((double) job->step_mem /
			 ((double) node_mem_kb / 1024)) * (double) 100)) + 0.5;

	if (scaling_pct > MAX_SCALING) {
		CRAY_INFO("Memory scaling out of bounds: %d. "
			  "Reducing to %d%%.",
			  scaling_pct, MAX_SCALING);
		scaling_pct = MAX_SCALING;
	}

	if (scaling_pct < MIN_SCALING) {
		CRAY_ERR("Memory scaling out of bounds: %d. "
			 "Increasing to %d%%",
			 scaling_pct, MIN_SCALING);
		scaling_pct = MIN_SCALING;
	}

	return scaling_pct;
}
static void *_lease_extender(void *args) { int rc; char *err_msg = NULL; CRAY_INFO("Leasing cookies for %ds, renewing every %ds", COOKIE_LEASE_TIME, COOKIE_LEASE_INTERVAL); lease_extender_running = true; while (lease_extender_running) { // Lock the mutex slurm_mutex_lock(&cookie_id_mutex); // If there are cookies, extend their leases if (cookie_id_list_size > 0) { // Extend the cookie leases CRAY_INFO("Extending leases for %"PRId32" cookies", cookie_id_list_size); rc = alpsc_set_cookie_lease(&err_msg, cookie_id_list, cookie_id_list_size, COOKIE_LEASE_TIME); ALPSC_SN_DEBUG("alpsc_set_cookie_lease"); // Just ignore errors, not much we can do about them } // Unlock the mutex slurm_mutex_unlock(&cookie_id_mutex); // Wait until we want to extend leases again sleep(COOKIE_LEASE_INTERVAL); } return NULL; }
/* * Add a cookie to the tracked cookie list */ static void _add_cookie(int32_t cookie_id) { int32_t i; // Lock the mutex slurm_mutex_lock(&cookie_id_mutex); // If the cookie is already in the list, skip for (i = 0; i < cookie_id_list_size; i++) { if (cookie_id_list[i] == cookie_id) { slurm_mutex_unlock(&cookie_id_mutex); CRAY_INFO("Duplicate cookie %"PRId32" found in tracked" " cookie list", cookie_id); return; } } // Extend id list if necessary if (cookie_id_list_size + 1 > cookie_id_list_capacity) { if (cookie_id_list_capacity == 0) { cookie_id_list_capacity = 2048; } else { cookie_id_list_capacity *= 2; } cookie_id_list = xrealloc(cookie_id_list, (cookie_id_list_capacity * sizeof(int32_t))); } // Set value cookie_id_list[cookie_id_list_size] = cookie_id; cookie_id_list_size++; // Unlock the mutex slurm_mutex_unlock(&cookie_id_mutex); }
int switch_p_libstate_restore(char *dir_name, bool recover) { #ifdef HAVE_NATIVE_CRAY char *data = NULL, *file_name; Buf buffer = NULL; int error_code = SLURM_SUCCESS; int state_fd, data_allocated = 0, data_read = 0, data_size = 0; xassert(dir_name != NULL); if (debug_flags & DEBUG_FLAG_SWITCH) { CRAY_INFO("restore from %s, recover %d", dir_name, (int) recover); } if (!recover) /* clean start, no recovery */ return SLURM_SUCCESS; file_name = xstrdup(dir_name); xstrcat(file_name, "/switch_cray_state"); state_fd = open (file_name, O_RDONLY); if (state_fd >= 0) { data_allocated = SWITCH_BUF_SIZE; data = xmalloc(data_allocated); while (1) { data_read = read (state_fd, &data[data_size], SWITCH_BUF_SIZE); if ((data_read < 0) && (errno == EINTR)) continue; if (data_read < 0) { CRAY_ERR("Read error on %s, %m", file_name); error_code = SLURM_ERROR; break; } else if (data_read == 0) break; data_size += data_read; data_allocated += data_read; xrealloc(data, data_allocated); } close (state_fd); (void) unlink(file_name); /* One chance to recover */ xfree(file_name); } else { CRAY_ERR("No %s file for switch/cray state recovery", file_name); CRAY_ERR("Starting switch/cray with clean state"); xfree(file_name); return SLURM_SUCCESS; } if (error_code == SLURM_SUCCESS) { buffer = create_buf (data, data_size); data = NULL; /* now in buffer, don't xfree() */ _state_read_buf(buffer); } if (buffer) free_buf(buffer); xfree(data); #endif return SLURM_SUCCESS; }