Ejemplo n.º 1
0
/*
 * Drop a cookie id from the tracked cookie list.
 *
 * The list is unordered: the removed slot is back-filled with the last
 * entry and the list is shortened by one.  If the id is not present, an
 * informational message is logged and the list is left untouched.
 * The whole operation runs with cookie_id_mutex held.
 */
static void _remove_cookie(int32_t cookie_id)
{
	int32_t idx;

	slurm_mutex_lock(&cookie_id_mutex);

	/* Linear scan; idx == cookie_id_list_size afterwards means "absent" */
	for (idx = 0; idx < cookie_id_list_size; idx++) {
		if (cookie_id_list[idx] == cookie_id)
			break;
	}

	if (idx < cookie_id_list_size) {
		/* Back-fill the hole unless the match is already the tail */
		if (idx < cookie_id_list_size - 1) {
			cookie_id_list[idx] =
				cookie_id_list[cookie_id_list_size - 1];
		}
		cookie_id_list_size--;
	} else {
		CRAY_INFO("Cookie %"PRId32" not found in tracked cookie list",
			  cookie_id);
	}

	slurm_mutex_unlock(&cookie_id_mutex);
}
Ejemplo n.º 2
0
/*
 * switch functions for global state save/restore
 *
 * Save the plugin state to "<dir_name>/switch_cray_state".
 *
 * The state is packed into a buffer by _state_write_buf(), any previous
 * state file is unlinked, and a new file is created with mode 0600 and
 * filled with the buffer contents.
 *
 * IN dir_name - directory in which to place the state file; must not be NULL
 * RET SLURM_SUCCESS, or SLURM_ERROR if the file could not be created,
 *     completely written or closed.  Without HAVE_NATIVE_CRAY this is a
 *     no-op that returns SLURM_SUCCESS.
 */
int switch_p_libstate_save(char *dir_name)
{
#ifdef HAVE_NATIVE_CRAY
    Buf buffer;
    char *file_name;
    int ret = SLURM_SUCCESS;
    int state_fd;

    xassert(dir_name != NULL);

    if (debug_flags & DEBUG_FLAG_SWITCH)
        CRAY_INFO("save to %s", dir_name);

    buffer = init_buf(SWITCH_BUF_SIZE);
    _state_write_buf(buffer);
    file_name = xstrdup(dir_name);
    xstrcat(file_name, "/switch_cray_state");
    (void) unlink(file_name);
    state_fd = creat(file_name, 0600);
    if (state_fd < 0) {
        CRAY_ERR("Can't save state, error creating file %s %m",
                 file_name);
        ret = SLURM_ERROR;
    } else {
        char  *buf = get_buf_data(buffer);
        size_t len = get_buf_offset(buffer);

        /* write() may be short or interrupted; loop until all is out */
        while (len > 0) {
            ssize_t wrote = write(state_fd, buf, len);

            if (wrote < 0) {
                if (errno == EINTR)
                    continue;
                CRAY_ERR("Can't save switch state: %m");
                ret = SLURM_ERROR;
                break;
            }
            if (wrote == 0) {
                /*
                 * No progress with data still pending: report it
                 * rather than leaving a silently truncated file.
                 */
                CRAY_ERR("Can't save switch state: "
                         "no data written to %s", file_name);
                ret = SLURM_ERROR;
                break;
            }
            buf += wrote;
            len -= (size_t) wrote;
        }

        /* A failing close() can be the only report of a lost write */
        if ((close(state_fd) < 0) && (ret == SLURM_SUCCESS)) {
            CRAY_ERR("Can't save state, error closing file %s %m",
                     file_name);
            ret = SLURM_ERROR;
        }
    }
    xfree(file_name);

    if (buffer)
        free_buf(buffer);

    return ret;
#else
    return SLURM_SUCCESS;
#endif
}
Ejemplo n.º 3
0
/*
 * Work out the memory scaling value for a job step: the step's memory
 * request expressed as a percentage of the node's total memory.
 *
 * Returns the scaling value, clamped to [MIN_SCALING, MAX_SCALING],
 * or -1 if the node's total memory is reported as zero.
 */
int get_mem_scaling(stepd_step_rec_t *job)
{
	uint32_t node_mem_kb;
	double node_mem_mb;
	double percent;
	int scaling;

	/* Total memory on the node; zero means the lookup failed */
	node_mem_kb = _get_mem_total();
	if (!node_mem_kb) {
		CRAY_ERR("Scanning /proc/meminfo results in MemTotal=0");
		return -1;
	}

	/*
	 * The node total is in kilobytes while the step's request is in
	 * megabytes, so convert the total to megabytes before taking the
	 * ratio.  Adding 0.5 before the conversion to int rounds to the
	 * nearest integer.
	 *
	 * About MEM_PER_CPU, since it has caused confusion before: that
	 * flag marks job->step_mem as a per-CPU amount rather than a total,
	 * but slurmd reads and clears it before handing the value to
	 * slurmstepd.  The value arrives here already properly scaled, so
	 * the flag is not checked in this function.
	 */
	node_mem_mb = (double) node_mem_kb / 1024;
	percent = ((double) job->step_mem / node_mem_mb) * (double) 100;
	scaling = (int) (percent + 0.5);

	/* Requests above the upper bound are reduced to it */
	if (scaling > MAX_SCALING) {
		CRAY_INFO("Memory scaling out of bounds: %d. "
			  "Reducing to %d%%.",
			  scaling, MAX_SCALING);
		scaling = MAX_SCALING;
	}

	/* Requests below the lower bound are raised to it (logged as error) */
	if (scaling < MIN_SCALING) {
		CRAY_ERR("Memory scaling out of bounds: %d. "
			 "Increasing to %d%%",
			 scaling, MIN_SCALING);
		scaling = MIN_SCALING;
	}

	return scaling;
}
Ejemplo n.º 4
0
/*
 * Periodically renew the leases of all tracked cookies.
 *
 * Sets lease_extender_running to true, then loops for as long as that flag
 * stays true.  Each pass takes cookie_id_mutex, asks
 * alpsc_set_cookie_lease() to set a lease of COOKIE_LEASE_TIME on every id
 * in cookie_id_list (skipped when the list is empty), releases the mutex and
 * sleeps for COOKIE_LEASE_INTERVAL seconds.
 *
 * IN args - unused
 * RET always NULL
 *
 * NOTE(review): the void *(void *) signature suggests this is run as a
 * thread start routine, and the loop presumably ends when another thread
 * clears lease_extender_running -- confirm against the caller.
 */
static void *_lease_extender(void *args)
{
	int rc;
	char *err_msg = NULL;

	CRAY_INFO("Leasing cookies for %ds, renewing every %ds",
		  COOKIE_LEASE_TIME, COOKIE_LEASE_INTERVAL);

	lease_extender_running = true;

	while (lease_extender_running) {
		// Lock the mutex so the cookie list cannot change mid-call
		slurm_mutex_lock(&cookie_id_mutex);

		// If there are cookies, extend their leases
		if (cookie_id_list_size > 0) {
			// Extend the cookie leases
			CRAY_INFO("Extending leases for %"PRId32" cookies",
				  cookie_id_list_size);

			rc = alpsc_set_cookie_lease(&err_msg, cookie_id_list,
						    cookie_id_list_size,
						    COOKIE_LEASE_TIME);
			// NOTE(review): rc and err_msg are not referenced
			// again in this function; presumably this macro
			// inspects them by name -- confirm before renaming.
			ALPSC_SN_DEBUG("alpsc_set_cookie_lease");

			// Just ignore errors, not much we can do about them
		}

		// Unlock the mutex
		slurm_mutex_unlock(&cookie_id_mutex);

		// Wait until we want to extend leases again
		// (the mutex is not held while sleeping)
		sleep(COOKIE_LEASE_INTERVAL);
	}
	return NULL;
}
Ejemplo n.º 5
0
/*
 * Start tracking a cookie id.
 *
 * Appends cookie_id to cookie_id_list under cookie_id_mutex.  An id that
 * is already tracked is not added again; an informational message is
 * logged instead.  The backing array starts at 2048 entries and doubles
 * whenever it is full.
 */
static void _add_cookie(int32_t cookie_id)
{
	int32_t pos = 0;

	slurm_mutex_lock(&cookie_id_mutex);

	/* Look for an existing entry; pos stops short of the end on a hit */
	while ((pos < cookie_id_list_size) &&
	       (cookie_id_list[pos] != cookie_id)) {
		pos++;
	}

	if (pos < cookie_id_list_size) {
		/* Already tracked: release the lock before logging */
		slurm_mutex_unlock(&cookie_id_mutex);
		CRAY_INFO("Duplicate cookie %"PRId32" found in tracked"
			  " cookie list", cookie_id);
		return;
	}

	/* Make room for one more entry when the array is full */
	if (cookie_id_list_capacity < cookie_id_list_size + 1) {
		cookie_id_list_capacity = (cookie_id_list_capacity == 0) ?
			2048 : (cookie_id_list_capacity * 2);
		cookie_id_list = xrealloc(cookie_id_list,
					  (cookie_id_list_capacity
					   * sizeof(int32_t)));
	}

	/* Append at the tail */
	cookie_id_list[cookie_id_list_size++] = cookie_id;

	slurm_mutex_unlock(&cookie_id_mutex);
}
Ejemplo n.º 6
0
/*
 * Restore the plugin state from "<dir_name>/switch_cray_state".
 *
 * When recover is false nothing is read.  Otherwise the whole state file
 * is read into memory, the file is unlinked (so it is only ever used for
 * one recovery attempt), and, if the read succeeded, the data is handed
 * to _state_read_buf().  A missing file or a read error is logged but not
 * reported to the caller.
 *
 * IN dir_name - directory holding the state file; must not be NULL
 * IN recover  - false for a clean start, true to load the saved state
 * RET always SLURM_SUCCESS
 */
int switch_p_libstate_restore(char *dir_name, bool recover)
{
#ifdef HAVE_NATIVE_CRAY
    Buf state_buf = NULL;
    char *raw = NULL;
    char *path;
    int fd;
    int capacity = 0, chunk = 0, used = 0;
    int rc = SLURM_SUCCESS;

    xassert(dir_name != NULL);

    if (debug_flags & DEBUG_FLAG_SWITCH) {
        CRAY_INFO("restore from %s, recover %d",
                  dir_name,  (int) recover);
    }

    /* Clean start: any saved state is ignored */
    if (!recover)
        return SLURM_SUCCESS;

    path = xstrdup(dir_name);
    xstrcat(path, "/switch_cray_state");

    fd = open(path, O_RDONLY);
    if (fd < 0) {
        CRAY_ERR("No %s file for switch/cray state recovery",
                 path);
        CRAY_ERR("Starting switch/cray with clean state");
        xfree(path);
        return SLURM_SUCCESS;
    }

    /*
     * Read the file in chunks of up to SWITCH_BUF_SIZE.  The allocation
     * grows by exactly what each read returned, so SWITCH_BUF_SIZE bytes
     * of room are always available past the data already stored.
     */
    capacity = SWITCH_BUF_SIZE;
    raw = xmalloc(capacity);
    for (;;) {
        chunk = read(fd, &raw[used], SWITCH_BUF_SIZE);
        if (chunk > 0) {
            used     += chunk;
            capacity += chunk;
            xrealloc(raw, capacity);
            continue;
        }
        if (chunk == 0)		/* end of file */
            break;
        if (errno == EINTR)	/* interrupted, try again */
            continue;
        CRAY_ERR("Read error on %s, %m", path);
        rc = SLURM_ERROR;
        break;
    }
    close(fd);
    (void) unlink(path);	/* One chance to recover */
    xfree(path);

    if (rc == SLURM_SUCCESS) {
        state_buf = create_buf(raw, used);
        raw = NULL;		/* now in state_buf, don't xfree() */
        _state_read_buf(state_buf);
    }

    if (state_buf)
        free_buf(state_buf);
    xfree(raw);
#endif
    return SLURM_SUCCESS;
}