Ejemplo n.º 1
0
Archivo: ports.c Proyecto: BYUHPC/slurm
/*
 * Function: release_port
 * Description:
 *  Release a port previously reserved in the port_resv table.
 *
 * Parameters:
 *  real_port - the actual port number; it must satisfy
 *              MIN_PORT <= real_port < MAX_PORT.
 *
 * Returns:
 *  0 on success and -1 on failure (port outside the valid range, or the
 *  port was not reserved).
 */
int release_port(uint32_t real_port)
{
	uint32_t port;
	int rc = 0;

	if ((real_port < MIN_PORT) || (real_port >= MAX_PORT)) {
		CRAY_ERR("Port %" PRIu32 " outside of valid range %" PRIu32
			 " : %" PRIu32, real_port, MIN_PORT, MAX_PORT);
		return -1;
	}

	/* Convert the real port number into an index into port_resv */
	port = real_port - MIN_PORT;

	pthread_mutex_lock(&port_mutex);
	if (bit_test(port_resv, port)) {
		bit_clear(port_resv, port);
	} else {
		/* real_port is a uint32_t, so print it with PRIu32 */
		CRAY_ERR("Attempting to release port %" PRIu32 ","
			 " but it was not reserved.", real_port);
		rc = -1;
	}
	pthread_mutex_unlock(&port_mutex);

	return rc;
}
Ejemplo n.º 2
0
/*
 * Launch the detached thread that extends cookie leases.
 *
 * Does nothing (and reports success) when not running inside slurmctld.
 * Thread creation is attempted at most twice, with a short sleep between
 * the attempts.
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR if the thread cannot be created.
 */
extern int start_lease_extender(void)
{
	pthread_attr_t attr;
	pthread_t tid;
	int failures = 0;
	int status = SLURM_SUCCESS;

	/* The lease extender only runs in the slurmctld */
	if (!_in_slurmctld())
		return SLURM_SUCCESS;

	/* Set up attributes for a detached agent thread */
	slurm_attr_init(&attr);
	if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED))
		CRAY_ERR("pthread_attr_setdetachstate error %m");

	for (;;) {
		if (pthread_create(&tid, &attr, &_lease_extender, NULL) == 0)
			break;

		error("pthread_create error %m");
		failures++;
		if (failures > 1) {
			CRAY_ERR("Can't create pthread");
			status = SLURM_ERROR;
			break;
		}

		/* Brief pause before the retry */
		usleep(1000);
	}

	slurm_attr_destroy(&attr);
	return status;
}
Ejemplo n.º 3
0
/*
 * task_p_pre_launch() runs just before the application task is exec'ed.
 *	The TaskProlog program (from slurm.conf) and --task-prolog (from
 *	the srun command line) run after it.
 *
 *	On native Cray builds it exports the task rank, PMI_NO_FORK, the
 *	status-file offset and the application id into the task environment.
 *	Failing to set any of the first three aborts with SLURM_ERROR; a
 *	failure to set the application id is only logged.
 */
extern int task_p_pre_launch (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	uint64_t app_id;
	DEF_TIMERS;

	START_TIMER;
	app_id = SLURM_ID_HASH(job->jobid, job->stepid);
	debug2("task_p_pre_launch: %u.%u, apid %"PRIu64", task %d",
	       job->jobid, job->stepid, app_id, job->envtp->procid);

	/* Tell the application's PMI layer its rank */
	if (!env_array_overwrite_fmt(&job->env, ALPS_APP_PE_ENV,
				     "%d", job->envtp->procid)) {
		CRAY_ERR("Failed to set env variable %s", ALPS_APP_PE_ENV);
		return SLURM_ERROR;
	}

	/* Export PMI_NO_FORK */
	if (!env_array_overwrite(&job->env, PMI_NO_FORK_ENV, "1")) {
		CRAY_ERR("Failed to set env variable %s", PMI_NO_FORK_ENV);
		return SLURM_ERROR;
	}

	/* Tell the task which status-file offset belongs to it */
	if (!env_array_overwrite_fmt(&job->env, LLI_STATUS_OFFS_ENV,
				     "%d", job->envtp->localid + 1)) {
		CRAY_ERR("Failed to set env variable %s",
			 LLI_STATUS_OFFS_ENV);
		return SLURM_ERROR;
	}

	/* Export ALPS_APP_ID for Cray tools; a failure here is not fatal */
	if (!env_array_overwrite_fmt(&job->env, ALPS_APP_ID_ENV,
				     "%"PRIu64, app_id)) {
		CRAY_ERR("Failed to set env variable %s",
			 ALPS_APP_ID_ENV);
	}

	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY)
		INFO_LINE("call took: %s", TIME_STR);
#endif
	return SLURM_SUCCESS;
}
Ejemplo n.º 4
0
/*
 * Function: _get_cpu_total
 * Description:
 *  Get the total number of online cpus on the node by parsing
 *  /sys/devices/system/cpu/online, a comma-separated list of single cpu
 *  numbers and "first-last" ranges.
 *
 * RETURNS
 *  Returns the number of online cpus on the node.  On error (file cannot
 *  be opened or read, or an entry cannot be parsed), it returns -1.
 *
 * TODO:
 * 	Danny suggests using xcgroup_get_param to read the CPU values instead of
 * 	this function.  Look at the way task/cgroup/task_cgroup_cpuset.c or
 * 	jobacct_gather/cgroup/jobacct_gather_cgroup.c does it.
 */
static int _get_cpu_total(void)
{
	FILE *f = NULL;
	char *token = NULL, *lin = NULL, *saveptr = NULL;
	int total = 0, rc = -1;
	size_t sz = 0;
	int matches;
	long int number1, number2;

	f = fopen("/sys/devices/system/cpu/online", "r");

	if (!f) {
		CRAY_ERR("Failed to open file"
			 " /sys/devices/system/cpu/online: %m");
		return -1;
	}

	/*
	 * Loop on getline()'s own result rather than feof(): a read error
	 * makes getline() return -1 without setting the EOF indicator, which
	 * would otherwise spin forever.
	 */
	while (getline(&lin, &sz, f) > 0) {
		// Split into comma-separated tokens
		for (token = strtok_r(lin, ",", &saveptr); token;
		     token = strtok_r(NULL, ",", &saveptr)) {
			// Check each token for a range
			matches = sscanf(token, "%ld-%ld",
					 &number1, &number2);
			if (matches <= 0) {
				// This token isn't numeric
				CRAY_ERR("Error parsing %s: %m", token);
				goto cleanup;
			} else if (matches == 1) {
				// Single entry
				total++;
			} else if (number2 > number1) {
				// Range
				total += number2 - number1 + 1;
			} else {
				// Invalid range
				CRAY_ERR("Invalid range %s", token);
				goto cleanup;
			}
		}
	}

	if (ferror(f)) {
		CRAY_ERR("Failed to read file"
			 " /sys/devices/system/cpu/online: %m");
		goto cleanup;
	}

	rc = total;

cleanup:
	free(lin);
	TEMP_FAILURE_RETRY(fclose(f));
	return rc;
}
Ejemplo n.º 5
0
Archivo: ports.c Proyecto: BYUHPC/slurm
/*
 * Function: assign_port
 * Description:
 *  Finds the next unreserved port, reserves it and hands it back to the
 *  caller.  Cray's PMI uses this port for the communications that manage
 *  its control tree.
 *
 *  The ports handed out come from a large range in the middle of the
 *  port space that is assumed to be otherwise unused.  Nothing is done
 *  to detect a port that is already in use by a non-SLURM component on
 *  the node.
 *
 *  The search starts just after the most recently allocated port.  When
 *  no port is free, the whole table is walked ATTEMPTS times (releasing
 *  the lock and sleeping one second after each full pass) before giving
 *  up.
 *
 * Parameters:
 *  real_port - output; receives the assigned port number.
 *
 * Returns:
 *  0 on success and -1 on failure.
 */
int assign_port(uint32_t *real_port)
{
	int idx, tries = 0;

	if (real_port == NULL) {
		CRAY_ERR("real_port address was NULL.");
		return -1;
	}

	/*
	 * idx is an index into the reserved port table, in the range
	 * 0 up to PORT_CNT.
	 */
	pthread_mutex_lock(&port_mutex);
	idx = ++last_alloc_port % PORT_CNT;

	for (;;) {
		if (!bit_test(port_resv, idx))
			break;	/* found a free slot */

		idx = (idx + 1) % PORT_CNT;
		tries++;

		if ((tries / PORT_CNT) >= ATTEMPTS) {
			CRAY_ERR("No free ports among %d ports. "
				 "Went through entire port list %d times",
				 PORT_CNT, ATTEMPTS);
			pthread_mutex_unlock(&port_mutex);
			return -1;
		}

		if ((tries % PORT_CNT) == 0) {
			/*
			 * A full pass found nothing; drop the lock for a
			 * moment so other threads can release ports.
			 */
			pthread_mutex_unlock(&port_mutex);
			sleep(1);
			pthread_mutex_lock(&port_mutex);
		}
	}

	bit_set(port_resv, idx);
	last_alloc_port = idx;
	pthread_mutex_unlock(&port_mutex);

	/* Scale the table index up by MIN_PORT to get the real port */
	*real_port = (idx + MIN_PORT);
	return 0;
}
Ejemplo n.º 6
0
/*
 * Save the global switch state.
 *
 * On native Cray builds the state is packed into a buffer and written to
 * "<dir_name>/switch_cray_state", replacing any existing file.  On other
 * builds this is a no-op.
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR if the file cannot be created or
 * written.
 */
int switch_p_libstate_save(char *dir_name)
{
#ifdef HAVE_NATIVE_CRAY
    Buf state_buf;
    char *path;
    int fd;
    int status = SLURM_SUCCESS;

    xassert(dir_name != NULL);

    if (debug_flags & DEBUG_FLAG_SWITCH)
        CRAY_INFO("save to %s", dir_name);

    /* Pack the current state */
    state_buf = init_buf(SWITCH_BUF_SIZE);
    _state_write_buf(state_buf);

    /* Build the state file name and start from a fresh file */
    path = xstrdup(dir_name);
    xstrcat(path, "/switch_cray_state");
    (void) unlink(path);
    fd = creat(path, 0600);

    if (fd >= 0) {
        char  *data = get_buf_data(state_buf);
        size_t remaining = get_buf_offset(state_buf);

        for (;;) {
            int n = write(fd, data, remaining);

            if (n > 0) {
                /* Partial or full write: advance and go again */
                data += n;
                remaining -= n;
                continue;
            }
            if (n == 0)
                break;
            if (errno == EINTR)
                continue;

            CRAY_ERR("Can't save switch state: %m");
            status = SLURM_ERROR;
            break;
        }
        close(fd);
    } else {
        CRAY_ERR("Can't save state, error creating file %s %m",
                 path);
        status = SLURM_ERROR;
    }
    xfree(path);

    if (state_buf)
        free_buf(state_buf);

    return status;
#else
    return SLURM_SUCCESS;
#endif
}
Ejemplo n.º 7
0
/*
 * Work out the memory scaling percentage for a job step.
 *
 * The step's memory request (job->step_mem, in megabytes) is expressed as
 * a percentage of the node's total memory, rounded to the nearest integer
 * and clamped to the range [MIN_SCALING, MAX_SCALING].
 *
 * Returns the scaling percentage, or -1 if the node's total memory could
 * not be determined.
 */
int get_mem_scaling(stepd_step_rec_t *job)
{
	uint32_t node_mem_kb;
	double node_mem_mb, percent;
	int scaling;

	/* Total memory on the node, in kilobytes */
	node_mem_kb = _get_mem_total();
	if (!node_mem_kb) {
		CRAY_ERR("Scanning /proc/meminfo results in MemTotal=0");
		return -1;
	}

	/*
	 * The node total is in kilobytes while the step request is in
	 * megabytes, so convert the total before taking the ratio.
	 *
	 * Note: Because this has caused some confusion in the past:
	 * the MEM_PER_CPU flag marks job->step_mem as a per-CPU amount
	 * rather than a total, but slurmd reads and clears that flag
	 * before handing the value to slurmstepd.  The value therefore
	 * arrives here already properly scaled and the flag does not
	 * need to be checked.
	 */
	node_mem_mb = (double) node_mem_kb / 1024;
	percent = ((double) job->step_mem / node_mem_mb) * (double) 100;

	/* Adding 0.5 before truncation rounds to the nearest integer */
	scaling = percent + 0.5;

	if (scaling > MAX_SCALING) {
		CRAY_INFO("Memory scaling out of bounds: %d. "
			  "Reducing to %d%%.",
			  scaling, MAX_SCALING);
		scaling = MAX_SCALING;
	}

	if (scaling < MIN_SCALING) {
		CRAY_ERR("Memory scaling out of bounds: %d. "
			 "Increasing to %d%%",
			 scaling, MIN_SCALING);
		scaling = MIN_SCALING;
	}

	return scaling;
}
Ejemplo n.º 8
0
/*
 * Work out the cpu scaling percentage for a job step.
 *
 * The number of CPUs requested by the step is expressed as a percentage
 * of the online CPUs on the node, rounded to the nearest integer and
 * clamped to the range [MIN_SCALING, MAX_SCALING].
 *
 * Side effect: a zero job->cpus_per_task is replaced with 1.
 *
 * Returns the scaling percentage, or -1 on failure.
 */
int get_cpu_scaling(stepd_step_rec_t *job)
{
	int node_cpus, step_cpus, scaling;

	/* Number of CPUs on the node */
	node_cpus = _get_cpu_total();
	if (node_cpus <= 0) {
		CRAY_ERR("total_cpus <= 0: %d", node_cpus);
		return -1;
	}

	/*
	 * Submissions that did not come from srun (API style) may not
	 * have filled this in.
	 */
	if (job->cpus_per_task == 0)
		job->cpus_per_task = 1;

	/*
	 * Number of CPUs requested for the step; fall back to
	 * tasks * cpus_per_task when job->cpus is not usable.
	 */
	step_cpus = job->cpus;
	if (step_cpus <= 0)
		step_cpus = job->node_tasks * job->cpus_per_task;
	if (step_cpus <= 0) {
		CRAY_ERR("num_app_cpus <= 0: %d", step_cpus);
		return -1;
	}

	/* Requested share of the node's CPUs, rounded to nearest percent */
	scaling = (((double) step_cpus / (double) node_cpus) *
		   (double) 100) + 0.5;

	if (scaling > MAX_SCALING) {
		debug("Cpu scaling out of bounds: %d. Reducing to %d%%",
			 scaling, MAX_SCALING);
		return MAX_SCALING;
	}
	if (scaling < MIN_SCALING) {
		CRAY_ERR("Cpu scaling out of bounds: %d. Increasing to %d%%",
			 scaling, MIN_SCALING);
		return MIN_SCALING;
	}
	return scaling;
}
Ejemplo n.º 9
0
Archivo: gpu.c Proyecto: BYUHPC/slurm
/*
 * Search the job's environment to determine if the
 * user requested the MPS to be on or off.
 *
 * CRAY_CUDA_MPS_ENV is consulted first; CRAY_CUDA_PROXY_ENV is only used
 * when the former is not set.  Accepted values are "on"/"off" (case
 * insensitive) and "1"/"0".
 *
 * Returns 0 for off, 1 for on, 2 for not requested,
 * 3 for error (value could not be parsed).
 */
static int _get_mps_request(stepd_step_rec_t *job)
{
	/* Remember which variable supplied the value for error reporting */
	const char *envname = CRAY_CUDA_MPS_ENV;
	char *envval;

	// Determine what user wants the mps to be set at by the
	// CRAY_CUDA_MPS and CRAY_CUDA_PROXY variables. If not set,
	// do nothing.
	envval = getenvp(job->env, envname);
	if (!envval) {
		envname = CRAY_CUDA_PROXY_ENV;
		envval = getenvp(job->env, envname);
	}
	if (!envval) {
		debug2("No GPU action requested");
		return 2;
	}

	if (!strcasecmp(envval, "on") || !strcmp(envval, "1")) {
		debug2("GPU mps requested on");
		return 1;
	} else if (!strcasecmp(envval, "off") || !strcmp(envval, "0")) {
		debug2("GPU mps requested off");
		return 0;
	}

	// Name the variable the bad value actually came from
	CRAY_ERR("Couldn't parse %s value %s, expected on,off,0,1",
		 envname, envval);
	return 3;
}
/*
 * init() runs when the plugin is loaded, before any other plugin
 *	function.  Global initialization belongs here.
 *
 *	On native Cray builds it makes sure the run directory exists and
 *	enables exit status tracking only if the LLI spool directory can be
 *	stat'ed.
 */
extern int init (void)
{
	debug("%s loaded.", plugin_name);

#ifdef HAVE_NATIVE_CRAY
	struct stat spool_stat;

	debug_flags = slurm_get_debug_flags();

	// Create the run directory; an already existing one is fine
	errno = 0;
	if ((mkdir(TASK_CRAY_RUN_DIR, 0755) == -1) && (errno != EEXIST)) {
		CRAY_ERR("Couldn't create %s: %m", TASK_CRAY_RUN_DIR);
		return SLURM_ERROR;
	}

	// Track app status with LLI only when its spool directory exists
	if (stat(LLI_SPOOL_DIR, &spool_stat) != -1) {
		track_status = 1;
	} else {
		debug("stat %s failed, disabling exit status tracking: %m",
			LLI_SPOOL_DIR);
		track_status = 0;
	}
#endif

	return SLURM_SUCCESS;
}
Ejemplo n.º 11
0
/*
 * Get the total amount of memory on the node.
 *
 * Scans /proc/meminfo for the "MemTotal:" line and returns the number
 * that follows it (the caller treats it as kilobytes).  Values that do
 * not fit in a uint32_t are clamped to UINT32_MAX.
 *
 * Returns 0 on failure (file cannot be opened, or no MemTotal line with
 * a value was found).
 */
static uint32_t _get_mem_total(void)
{
	FILE *f = NULL;
	size_t sz = 0;
	char *lin = NULL;
	unsigned long long meminfo_value;
	char meminfo_str[1024];
	uint32_t total_mem = 0;

	f = fopen("/proc/meminfo", "r");
	if (f == NULL ) {
		CRAY_ERR("Failed to open /proc/meminfo: %m");
		return 0;
	}

	/*
	 * Loop on getline()'s result instead of feof() so that a read
	 * error terminates the loop rather than spinning forever.
	 */
	while (getline(&lin, &sz, f) > 0) {
		/*
		 * Bound the %s conversion to the buffer size and skip any
		 * line that does not yield both a key and a value, so the
		 * comparison below never sees uninitialized data.
		 */
		if (sscanf(lin, "%1023s %llu", meminfo_str,
			   &meminfo_value) != 2)
			continue;

		if (!strcmp(meminfo_str, "MemTotal:")) {
			if (meminfo_value > UINT32_MAX)
				total_mem = UINT32_MAX;
			else
				total_mem = (uint32_t) meminfo_value;
			break;
		}
	}
	free(lin);
	TEMP_FAILURE_RETRY(fclose(f));
	return total_mem;
}
Ejemplo n.º 12
0
/*
 * Build the peCmdMapArray: one entry per task giving the index of the
 * command that task runs.
 *
 * Returns a newly xmalloc'ed array of job->ntasks ints, or NULL on error
 * (an MPMD command refers to a PE beyond ntasks, or some PE is left
 * without a command).
 */
static int *_get_cmd_map(stepd_step_rec_t *job)
{
	size_t map_bytes;
	int cmd, n, pe;
	int *map = NULL;

	map_bytes = job->ntasks * sizeof(int);
	map = xmalloc(map_bytes);

	if (!job->mpmd_set) {
		// Single program: every PE runs command index 0
		memset(map, 0, map_bytes);
		return map;
	}

	// Multiple programs: start with every PE marked as unassigned
	for (pe = 0; pe < job->ntasks; pe++)
		map[pe] = -1;

	// Walk the MPMD commands
	for (cmd = 0; cmd < job->mpmd_set->num_cmds; cmd++) {
		// Assign PEs start_pe .. start_pe+total_pe to this command
		pe = job->mpmd_set->start_pe[cmd];
		n = 0;
		while (n < job->mpmd_set->total_pe[cmd]) {
			if (pe >= job->ntasks) {
				CRAY_ERR("PE index %d too large", pe);
				xfree(map);
				return NULL;
			}
			map[pe] = cmd;
			n++;
			pe++;
		}
	}

	// Every PE must have been given a command
	for (pe = 0; pe < job->ntasks; pe++) {
		if (map[pe] != -1)
			continue;
		CRAY_ERR("No command on PE index %d", pe);
		xfree(map);
		return NULL;
	}

	return map;
}
/*
 * Create the LLI_STATUS_FILE for this step unless it already exists.
 * The file is created with permissions 644, sized to node_tasks + 1
 * bytes and handed over to the job's owner and group.
 *
 * Returns SLURM_SUCCESS (also when the file already existed) or
 * SLURM_ERROR.
 */
static int _make_status_file(stepd_step_rec_t *job)
{
	char path[LLI_STATUS_FILE_BUF_SIZE];
	int fd;
	int status = SLURM_ERROR;

	// Build the lli file name
	snprintf(path, sizeof(path), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Create the file, failing if it is already there
	errno = 0;
	fd = open(path, O_CREAT|O_EXCL|O_WRONLY, 0644);
	if (fd == -1) {
		// Another task_p_pre_launch_priv already created it, ignore
		if (errno == EEXIST)
			return SLURM_SUCCESS;

		CRAY_ERR("creat(%s) failed: %m", path);
		return SLURM_ERROR;
	}

	if (ftruncate(fd, job->node_tasks + 1) == -1) {
		// Could not resize it
		CRAY_ERR("ftruncate(%s) failed: %m", path);
	} else if (fchown(fd, job->uid, job->gid) == -1) {
		// Could not change owner/group so the app can write to it
		CRAY_ERR("chown(%s) failed: %m", path);
	} else {
		info("Created file %s", path);
		status = SLURM_SUCCESS;
	}

	TEMP_FAILURE_RETRY(close(fd));
	return status;
}
/*
 * task_p_pre_launch() runs just before the application task is exec'ed.
 *	The TaskProlog program (from slurm.conf) and --task-prolog (from
 *	the srun command line) run after it.
 *
 *	On native Cray builds it exports the task rank, PMI_NO_FORK and the
 *	status-file offset into the task environment; any failure returns
 *	SLURM_ERROR.
 */
extern int task_p_pre_launch (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	debug("task_p_pre_launch: %u.%u, task %d",
	      job->jobid, job->stepid, job->envtp->procid);

	/* Tell the application's PMI layer its rank */
	if (!env_array_overwrite_fmt(&job->env, ALPS_APP_PE_ENV,
				     "%d", job->envtp->procid)) {
		CRAY_ERR("Failed to set env variable %s", ALPS_APP_PE_ENV);
		return SLURM_ERROR;
	}

	/* Export PMI_NO_FORK */
	if (!env_array_overwrite(&job->env, PMI_NO_FORK_ENV, "1")) {
		CRAY_ERR("Failed to set env variable %s", PMI_NO_FORK_ENV);
		return SLURM_ERROR;
	}

	/* Tell the task which status-file offset belongs to it */
	if (!env_array_overwrite_fmt(&job->env, LLI_STATUS_OFFS_ENV,
				     "%d", job->envtp->localid + 1)) {
		CRAY_ERR("Failed to set env variable %s",
			 LLI_STATUS_OFFS_ENV);
		return SLURM_ERROR;
	}
#endif
	return SLURM_SUCCESS;
}
Ejemplo n.º 15
0
/*
 * init() runs when the plugin is loaded, before any other plugin
 * function.  Global initialization belongs here.
 *
 * On native Cray builds it rejects a port range whose upper bound lies
 * below its lower bound.
 */
int init(void)
{
    int status = SLURM_SUCCESS;

    verbose("%s loaded.", plugin_name);
    debug_flags = slurm_get_debug_flags();
#ifdef HAVE_NATIVE_CRAY
    if (MIN_PORT > MAX_PORT) {
        CRAY_ERR("MAX_PORT: %d < MIN_PORT: %d", MAX_PORT, MIN_PORT);
        status = SLURM_ERROR;
    }
#endif
    return status;
}
Ejemplo n.º 16
0
/*
 * Write the IAA file, hand it over to the job's user and publish its
 * name in the job's environment.
 *
 * Returns SLURM_SUCCESS only when all three steps succeed, otherwise
 * SLURM_ERROR.
 */
int write_iaa_file(stepd_step_rec_t *job, slurm_cray_jobinfo_t *sw_job,
		   int *ptags, int num_ptags, alpsc_peInfo_t *alpsc_pe_info)
{
	char *err_msg = NULL;
	char *iaa_path;
	int rc, status = SLURM_ERROR;

	iaa_path = xstrdup_printf(CRAY_IAA_FILE, sw_job->apid);

	// Step 1: write the file
	rc = alpsc_write_iaa_info(&err_msg, iaa_path, sw_job->num_cookies,
				  (const char **)sw_job->cookies,
				  num_ptags, ptags, alpsc_pe_info);
	ALPSC_CN_DEBUG("alpsc_write_iaa_info");
	if (rc != 1)
		goto done;

	// Step 2: make the job user the owner of the file
	rc = chown(iaa_path, job->uid, job->gid);
	if (rc == -1) {
		CRAY_ERR("chown(%s, %d, %d) failed: %m",
			 iaa_path, (int)job->uid, (int)job->gid);
		goto done;
	}

	// Step 3: export the file name through the environment
	rc = env_array_overwrite(&job->env, CRAY_IAA_INFO_FILE_ENV,
				 iaa_path);
	if (rc == 0) {
		CRAY_ERR("Failed to set env variable %s",
			 CRAY_IAA_INFO_FILE_ENV);
		goto done;
	}

	status = SLURM_SUCCESS;

done:
	xfree(iaa_path);
	return status;
}
Ejemplo n.º 17
0
/*
 * Restore the switch/cray port state from a packed buffer.
 *
 * Reads the protocol version, the saved min/max port, last_alloc_port and
 * the port reservation table (port_resv) from buffer.  State saved with a
 * protocol version older than SLURM_MIN_PROTOCOL_VERSION is rejected.
 * An error is logged when the saved port range does not match the
 * compiled-in MIN_PORT/MAX_PORT.
 *
 * Nothing is returned; failures are only reported through the log.
 */
static void _state_read_buf(Buf buffer)
{
    uint16_t protocol_version = (uint16_t) NO_VAL;
    uint32_t min_port, max_port;
    int i;

    /* Validate state version */
    /* presumably the safe_unpack* macros jump to unpack_error on failure
     * -- TODO confirm against their definition */
    safe_unpack16(&protocol_version, buffer);
    debug3("Version in switch_cray header is %u", protocol_version);
    if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) {
        error("******************************************************");
        error("Can't recover switch/cray state, incompatible version");
        error("******************************************************");
        return;
    }
    if (protocol_version >= SLURM_14_11_PROTOCOL_VERSION) {
        /* Newer format: the reservation table is stored as a bit string */
        safe_unpack32(&min_port, buffer);
        safe_unpack32(&max_port, buffer);
        safe_unpack32(&last_alloc_port, buffer);
        unpack_bit_str(&port_resv, buffer);
    } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
        /* Older format: one byte per port, non-zero meaning reserved.
         * This condition always holds here because older versions
         * returned above. */
        uint8_t port_set = 0;
        safe_unpack32(&min_port, buffer);
        safe_unpack32(&max_port, buffer);
        safe_unpack32(&last_alloc_port, buffer);
        port_resv = bit_alloc(PORT_CNT);
        for (i = 0; i < PORT_CNT; i++) {
            safe_unpack8(&port_set, buffer);
            if (port_set)
                bit_set(port_resv, i);
        }
    }
    /* NOTE(review): last_alloc_port and port_resv have already been
     * overwritten by the time this range check runs -- confirm that is
     * intended. */
    if ((min_port != MIN_PORT) || (max_port != MAX_PORT)) {
        error("******************************************************");
        error("Can not recover switch/cray state");
        error("Changed MIN_PORT (%u != %u) and/or MAX_PORT (%u != %u)",
              min_port, MIN_PORT, max_port, MAX_PORT);
        error("******************************************************");
        return;
    }

    return;

unpack_error:
    CRAY_ERR("unpack error");
    return;
}
Ejemplo n.º 18
0
/*
 * Pick a command index for this node.  This does not really fit MPMD
 * launches, so for now the first command that has a PE on this node
 * (first_pe entry other than -1) is returned.
 *
 * Returns 0 for non-MPMD jobs, and -1 if no command is found on this
 * node.
 */
static int _get_cmd_index(stepd_step_rec_t *job)
{
	int idx;

	// Not an MPMD job: the single command has index 0
	if (!job->mpmd_set || !job->mpmd_set->first_pe)
		return 0;

	// Take the first command present on this node
	for (idx = 0; idx < job->mpmd_set->num_cmds; idx++) {
		if (job->mpmd_set->first_pe[idx] != -1)
			return idx;
	}

	// Nothing in the list runs on this node
	CRAY_ERR("No command found on this node");
	return -1;
}
Ejemplo n.º 19
0
/*
 * init() runs when the plugin is loaded, before any other plugin
 *	function.  Global initialization belongs here.
 *
 *	Aborts via fatal() unless TaskPlugin names both task/cray and
 *	task/cgroup, with cgroup listed after cray.  On native Cray builds
 *	it then makes sure the run directory exists and enables exit status
 *	tracking only if the LLI spool directory can be stat'ed.
 */
extern int init (void)
{
	debug("%s loaded.", plugin_name);

	char *plugin_list = slurm_get_task_plugin();
	char *cgroup_pos = strstr(plugin_list, "cgroup");
	char *cray_pos = strstr(plugin_list, "cray");

	// Both must be present, and "cgroup" must not precede "cray"
	if (!(cgroup_pos && cray_pos && (cgroup_pos >= cray_pos)))
		fatal("task/cgroup must be used with, and listed after, "
		      "task/cray in TaskPlugin");

	xfree(plugin_list);

#ifdef HAVE_NATIVE_CRAY
	struct stat spool_stat;

	debug_flags = slurm_get_debug_flags();

	// Create the run directory; an already existing one is fine
	errno = 0;
	if ((mkdir(TASK_CRAY_RUN_DIR, 0755) == -1) && (errno != EEXIST)) {
		CRAY_ERR("Couldn't create %s: %m", TASK_CRAY_RUN_DIR);
		return SLURM_ERROR;
	}

	// Track app status with LLI only when its spool directory exists
	if (stat(LLI_SPOOL_DIR, &spool_stat) != -1) {
		track_status = 1;
	} else {
		debug("stat %s failed, disabling exit status tracking: %m",
			LLI_SPOOL_DIR);
		track_status = 0;
	}
#endif

	return SLURM_SUCCESS;
}
/*
 * Update the number of running steps on the node, which is kept as a
 * raw int at the start of NUM_STEPS_FILE and protected by an fcntl
 * write lock.
 *
 * Set val to 1 to increment and -1 to decrement the value.
 *
 * Returns the new value, or -1 on error.  If the update would make the
 * count negative, an error is logged, the file is left untouched and 0
 * is returned.
 */
static int _update_num_steps(int val)
{
	int fd, num_steps = 0, result = -1;
	ssize_t size;
	/* Zero-initialize the members that are not set explicitly */
	struct flock lock = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = sizeof(int),
	};

	// Sanity check the argument
	if (val != 1 && val != -1) {
		CRAY_ERR("invalid val %d", val);
		return -1;
	}

	// Open the file
	fd = open(NUM_STEPS_FILE, O_RDWR | O_CREAT, 0644);
	if (fd == -1) {
		CRAY_ERR("open failed: %m");
		return -1;
	}

	// Exclusive lock on the first sizeof(int) bytes of the file
	// Automatically released when the file descriptor is closed
	if (fcntl(fd, F_SETLKW, &lock) == -1) {
		CRAY_ERR("fcntl failed: %m");
		goto done;
	}

	// Read the value
	size = read(fd, &num_steps, sizeof(int));
	if (size == -1) {
		CRAY_ERR("read failed: %m");
		goto done;
	} else if (size == 0) {
		// Value doesn't exist, must be the first step
		num_steps = 0;
	}

	// Increment or decrement and check result
	num_steps += val;
	if (num_steps < 0) {
		CRAY_ERR("Less than 0 steps on the node");
		result = 0;
		goto done;
	}

	// Write the new value
	if (lseek(fd, 0, SEEK_SET) == -1) {
		CRAY_ERR("lseek failed: %m");
		goto done;
	}
	size = write(fd, &num_steps, sizeof(int));
	// Compare as signed values: with an unsigned comparison a -1 from
	// write() turns into a huge number and the failure goes unnoticed
	if (size != (ssize_t) sizeof(int)) {
		CRAY_ERR("write failed: %m");
		goto done;
	}
	if (debug_flags & DEBUG_FLAG_TASK) {
		debug("Wrote %d steps to %s", num_steps, NUM_STEPS_FILE);
	}
	result = num_steps;

done:
	TEMP_FAILURE_RETRY(close(fd));
	return result;
}
/*
 * Build one cpu_set_t per NUMA node holding the CPUs of that node which
 * are also present in numa_all_cpus_ptr.
 *
 * num_numa_nodes - number of entries in numa_array
 * numa_array     - NUMA node ids to look up
 * cpuMasks       - output; set to a newly xmalloc'ed array of
 *                  num_numa_nodes cpu_set_t entries (the caller is
 *                  responsible for releasing it)
 *
 * If the intersection is empty for every node, all masks fall back to
 * the contents of numa_all_cpus_ptr.
 *
 * Returns 0 on success, or -1 if libnuma is not available.
 */
static int _get_cpu_masks(int num_numa_nodes, int32_t *numa_array,
			  cpu_set_t **cpuMasks) {

	struct bitmask **remaining_numa_node_cpus = NULL, *collective;
	unsigned long **numa_node_cpus = NULL;
	int i, j, at_least_one_cpu = 0, rc = 0;
	cpu_set_t *cpusetptr;
	char *bitmask_str = NULL;

	if (numa_available()) {
		CRAY_ERR("Libnuma not available");
		return -1;
	}

	/*
	 * numa_node_cpus: The CPUs available to the NUMA node.
	 * numa_all_cpus_ptr: all CPUs on which the calling task may execute.
	 * remaining_numa_node_cpus: Bitwise-AND of the above two to get all of
	 *                           the CPUs that the task can run on in this
	 *                           NUMA node.
	 * collective: Collects all of the CPUs as a precaution.
	 */
	remaining_numa_node_cpus = xmalloc(num_numa_nodes *
					   sizeof(struct bitmask *));
	collective = numa_allocate_cpumask();
	numa_node_cpus = xmalloc(num_numa_nodes * sizeof(unsigned long*));
	for (i = 0; i < num_numa_nodes; i++) {
		remaining_numa_node_cpus[i] = numa_allocate_cpumask();
		numa_node_cpus[i] = xmalloc(sizeof(unsigned long) *
					    NUM_INTS_TO_HOLD_ALL_CPUS);
		rc = numa_node_to_cpus(numa_array[i], numa_node_cpus[i],
				       NUM_INTS_TO_HOLD_ALL_CPUS);
		if (rc) {
			/* Logged only; processing continues for this node */
			CRAY_ERR("numa_node_to_cpus failed: Return code %d",
				 rc);
		}
		for (j = 0; j < NUM_INTS_TO_HOLD_ALL_CPUS; j++) {
			(remaining_numa_node_cpus[i]->maskp[j]) =
				(numa_node_cpus[i][j]) &
				(numa_all_cpus_ptr->maskp[j]);
			collective->maskp[j] |=
				(remaining_numa_node_cpus[i]->maskp[j]);
		}
	}

	/*
	 * Ensure that we have not masked off all of the CPUs.
	 * If we have, just re-enable them all.  Better to clear them all than
	 * none of them.
	 */
	for (j = 0; j < collective->size; j++) {
		if (numa_bitmask_isbitset(collective, j)) {
			at_least_one_cpu = 1;
			/* One set bit is all we need to know */
			break;
		}
	}

	if (!at_least_one_cpu) {
		for (i = 0; i < num_numa_nodes; i++) {
			for (j = 0; j <
				     (remaining_numa_node_cpus[i]->size /
				      (sizeof(unsigned long) * 8));
			     j++) {
				(remaining_numa_node_cpus[i]->maskp[j]) =
					(numa_all_cpus_ptr->maskp[j]);
			}
		}
	}

	if (debug_flags & DEBUG_FLAG_TASK) {
		bitmask_str = NULL;
		for (i = 0; i < num_numa_nodes; i++) {
			for (j = 0; j < NUM_INTS_TO_HOLD_ALL_CPUS; j++) {
				xstrfmtcat(bitmask_str, "%6lx ",
					   numa_node_cpus[i][j]);
			}
		}
		info("%sBitmask: Allowed CPUs for NUMA Node", bitmask_str);
		xfree(bitmask_str);
		bitmask_str = NULL;

		for (i = 0; i < num_numa_nodes; i++) {
			for (j = 0; j < NUM_INTS_TO_HOLD_ALL_CPUS; j++) {
				xstrfmtcat(bitmask_str, "%6lx ",
					  numa_all_cpus_ptr->maskp[j]);
			}
		}
		info("%sBitmask: Allowed CPUs for cpuset", bitmask_str);
		xfree(bitmask_str);
		bitmask_str = NULL;

		for (i = 0; i < num_numa_nodes; i++) {
			for (j = 0; j < NUM_INTS_TO_HOLD_ALL_CPUS; j++) {
				xstrfmtcat(bitmask_str, "%6lx ",
					   remaining_numa_node_cpus[i]->
					   maskp[j]);
			}
		}
		info("%sBitmask: Allowed CPUs between cpuset and NUMA Node",
		     bitmask_str);
		xfree(bitmask_str);
	}


	// Convert bitmasks to cpu_set_t types
	cpusetptr = xmalloc(num_numa_nodes * sizeof(cpu_set_t));

	for (i = 0; i < num_numa_nodes; i++) {
		CPU_ZERO(&cpusetptr[i]);
		for (j = 0; j < remaining_numa_node_cpus[i]->size; j++) {
			if (numa_bitmask_isbitset(remaining_numa_node_cpus[i],
						  j)) {
				CPU_SET(j, &cpusetptr[i]);
			}
		}
		if (debug_flags & DEBUG_FLAG_TASK) {
			info("CPU_COUNT() of set: %d",
			     CPU_COUNT(&cpusetptr[i]));
		}
	}

	*cpuMasks = cpusetptr;

	// Free the temporary per-node data; cpusetptr now belongs to caller
	numa_free_cpumask(collective);
	for (i = 0; i < num_numa_nodes; i++) {
		xfree(numa_node_cpus[i]);
		numa_free_cpumask(remaining_numa_node_cpus[i]);
	}
	xfree(numa_node_cpus);
	xfree(remaining_numa_node_cpus);

	return 0;
}
/*
 * task_p_post_step() is called after termination of the step
 * (all the tasks)
 *
 * On native Cray builds it removes the step's LLI status file (when
 * status tracking is enabled), runs the step epilogue for non-batch
 * steps, and asks ALPS to compact the memory of the NUMA nodes the step
 * was using.
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR if the NUMA/CPU information
 * cannot be gathered or the compaction call fails.
 */
extern int task_p_post_step (stepd_step_rec_t *job)
{
#ifdef HAVE_NATIVE_CRAY
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	int rc, cnt;
	char *err_msg = NULL, path[PATH_MAX];
	int32_t *numa_nodes;
	cpu_set_t *cpuMasks;

	if (track_status) {
		// Get the lli file name
		snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
			 SLURM_ID_HASH(job->jobid, job->stepid));

		// Unlink the file; a file that is already gone is not an error
		errno = 0;
		rc = unlink(llifile);
		if (rc == -1 && errno != ENOENT) {
			CRAY_ERR("unlink(%s) failed: %m", llifile);
		} else if (rc == 0) {
			info("Unlinked %s", llifile);
		}
	}

	/*
	 * Compact Memory
	 *
	 * Determine which NUMA nodes and CPUS an application is using.  It will
	 * be used to compact the memory.
	 *
	 * You'll find the information in the following location.
	 * For a normal job step:
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_<stepID>/
	 *
	 * For a batch job step (only on the head node and only for batch jobs):
	 * /dev/cpuset/slurm/uid_<uid>/job_<jobID>/step_batch/
	 *
	 * NUMA node: mems
	 * CPU Masks: cpus
	 */
	if (job->batch) {
		// Batch Job Step
		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%"
			      PRIu32 "/step_batch", job->uid, job->jobid);
		if (rc < 0) {
			CRAY_ERR("snprintf failed. Return code: %d", rc);
			return SLURM_ERROR;
		}
	} else {
		// Normal Job Step

		/* Only run epilogue on non-batch steps */
		_step_epilogue();

		rc = snprintf(path, sizeof(path),
			      "/dev/cpuset/slurm/uid_%d/job_%"
			      PRIu32 "/step_%" PRIu32,
			      job->uid, job->jobid, job->stepid);
		if (rc < 0) {
			CRAY_ERR("snprintf failed. Return code: %d", rc);
			return SLURM_ERROR;
		}
	}

	rc = _get_numa_nodes(path, &cnt, &numa_nodes);
	if (rc < 0) {
		CRAY_ERR("get_numa_nodes failed. Return code: %d", rc);
		return SLURM_ERROR;
	}

	rc = _get_cpu_masks(cnt, numa_nodes, &cpuMasks);
	if (rc < 0) {
		CRAY_ERR("get_cpu_masks failed. Return code: %d", rc);
		// Don't leak the NUMA node list obtained above
		xfree(numa_nodes);
		return SLURM_ERROR;
	}

	/*
	 * Compact Memory
	 * The last argument which is a path to the cpuset directory has to be
	 * NULL because the CPUSET directory has already been cleaned up.
	 */
	rc = alpsc_compact_mem(&err_msg, cnt, numa_nodes, cpuMasks, NULL);
	_ALPSC_DEBUG("alpsc_compact_mem");

	xfree(numa_nodes);
	xfree(cpuMasks);

	if (rc != 1) {
		return SLURM_ERROR;
	}
#endif
	return SLURM_SUCCESS;
}
Ejemplo n.º 23
0
/*
 * Check the status file for the exit of the given local task id
 * and terminate the job step if an improper exit is found
 *
 * The first byte of the file tells whether the application started
 * reporting at all; the byte at offset localid + 1 tells whether this
 * task announced its exit.  If the application started but the task's
 * byte is still 0 (and the step has not been terminated already), the
 * job step is terminated.
 *
 * Returns SLURM_ERROR if the file cannot be opened, the first read fails
 * or the seek fails; SLURM_SUCCESS otherwise.
 */
static int _check_status_file(stepd_step_rec_t *job)
{
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	// Start at 0 so an empty file (read() returning 0) is handled as
	// "application never started" instead of reading garbage
	char status = 0;
	int rv, fd;
	stepd_step_task_info_t *task;
	char *reason;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Open the lli file.
	fd = open(llifile, O_RDONLY);
	if (fd == -1) {
		CRAY_ERR("open(%s) failed: %m", llifile);
		return SLURM_ERROR;
	}

	// Read the first byte (indicates starting)
	rv = read(fd, &status, sizeof(status));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		// Close the descriptor on this error path as well
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// If the first byte is 0, we either aren't an MPI app or
	// it didn't make it past pmi_init, in any case, return success
	if (status == 0) {
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_SUCCESS;
	}

	// Seek to the correct offset
	rv = lseek(fd, job->envtp->localid + 1, SEEK_SET);
	if (rv == -1) {
		CRAY_ERR("lseek failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Read the exiting byte
	rv = read(fd, &status, sizeof(status));
	TEMP_FAILURE_RETRY(close(fd));
	if (rv == -1) {
		CRAY_ERR("read failed: %m");
		return SLURM_SUCCESS;
	}

	// Check the result
	if (status == 0 && !terminated) {
		task = job->task[job->envtp->localid];
		if (task->killed_by_cmd) {
			// We've been killed by request. User already knows
			return SLURM_SUCCESS;
		} else if (task->aborted) {
			reason = "aborted";
		} else if (WIFSIGNALED(task->estatus)) {
			reason = "signaled";
		} else {
			reason = "exited";
		}

		// Cancel the job step, since we didn't find the exiting msg
		error("Terminating job step %"PRIu32".%"PRIu32
			"; task %d exit code %d %s without notification",
			job->jobid, job->stepid, task->gtid,
			WEXITSTATUS(task->estatus), reason);
		terminated = 1;
		slurm_terminate_job_step(job->jobid, job->stepid);
	}
	return SLURM_SUCCESS;
}
Ejemplo n.º 24
0
/*
 * Populate an alpsc_peInfo_t structure from the job step record.
 *
 * IN  job           -- step record to describe
 * OUT alpsc_pe_info -- structure to fill in
 * OUT cmd_index     -- receives the command index for this step
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when an argument fails validation
 * or one of the lookup helpers fails (alpsc_pe_info is released through
 * free_alpsc_pe_info() in the latter case).
 */
int build_alpsc_pe_info(stepd_step_rec_t *job,
			alpsc_peInfo_t *alpsc_pe_info, int *cmd_index)
{
	// Validate every input once, here, so the helpers below
	// do not have to repeat the checks
	if (!job) {
		CRAY_ERR("NULL job pointer");
		return SLURM_ERROR;
	}
	if (job->ntasks < 1) {
		CRAY_ERR("Not enough tasks %d", job->ntasks);
		return SLURM_ERROR;
	}
	if (!alpsc_pe_info) {
		CRAY_ERR("NULL alpsc_pe_info");
		return SLURM_ERROR;
	}
	if (!cmd_index) {
		CRAY_ERR("NULL cmd_index");
		return SLURM_ERROR;
	}

	// MPMD launches additionally need a fully populated mpmd_set
	if (job->multi_prog) {
		if (!job->mpmd_set) {
			CRAY_ERR("MPMD launch but no mpmd_set");
			return SLURM_ERROR;
		}
		if (!job->mpmd_set->first_pe) {
			CRAY_ERR("NULL first_pe");
			return SLURM_ERROR;
		}
		if (!job->mpmd_set->start_pe) {
			CRAY_ERR("NULL start_pe");
			return SLURM_ERROR;
		}
		if (!job->mpmd_set->total_pe) {
			CRAY_ERR("NULL total_pe");
			return SLURM_ERROR;
		}
		if (!job->mpmd_set->placement) {
			CRAY_ERR("NULL placement");
			return SLURM_ERROR;
		}
		if (job->mpmd_set->num_cmds < 1) {
			CRAY_ERR("Not enough commands %d",
				 job->mpmd_set->num_cmds);
			return SLURM_ERROR;
		}
	}

	// Scalar fields come straight from the step record,
	// the arrays are built by the helper routines
	alpsc_pe_info->totalPEs = job->ntasks;
	alpsc_pe_info->firstPeHere = _get_first_pe(job);
	alpsc_pe_info->pesHere = job->node_tasks;
	alpsc_pe_info->peDepth = job->cpus_per_task;
	alpsc_pe_info->peNidArray = _get_pe_nid_map(job);
	alpsc_pe_info->peCmdMapArray = _get_cmd_map(job);
	alpsc_pe_info->nodeCpuArray = _get_node_cpu_map(job);

	*cmd_index = _get_cmd_index(job);

	// Any helper failure invalidates the whole structure
	if (!alpsc_pe_info->peNidArray ||
	    !alpsc_pe_info->peCmdMapArray ||
	    !alpsc_pe_info->nodeCpuArray ||
	    (*cmd_index == -1)) {
		free_alpsc_pe_info(alpsc_pe_info);
		return SLURM_ERROR;
	}

	// Dump the result when switch debugging is enabled
	if (debug_flags & DEBUG_FLAG_SWITCH)
		_print_alpsc_pe_info(alpsc_pe_info, *cmd_index);

	return SLURM_SUCCESS;
}
/*
 * Check the status file for the exit of the given local task id
 * and report a task that exited cleanly without its exit byte being set.
 *
 * IN job  -- step record; supplies the job/step ids and the local task id
 * IN task -- task whose exit status is being examined
 *
 * Returns SLURM_ERROR if the first read or the seek fails and
 * SLURM_SUCCESS otherwise (including when the file cannot be opened).
 * The descriptor is closed on every path once it has been opened.
 */
static int _check_status_file(stepd_step_rec_t *job,
			      stepd_step_task_info_t *task)
{
	char llifile[LLI_STATUS_FILE_BUF_SIZE];
	// Start at 0 so that an empty file (read() returning 0) is handled
	// like a "never started" marker instead of reading garbage
	char status = 0;
	int rv, fd;

	debug("task_p_post_term: %u.%u, task %d",
	      job->jobid, job->stepid, job->envtp->procid);

	// We only need to special case termination with exit(0)
	// srun already handles abnormal exit conditions fine
	if (!WIFEXITED(task->estatus) || (WEXITSTATUS(task->estatus) != 0))
		return SLURM_SUCCESS;

	// Get the lli file name
	snprintf(llifile, sizeof(llifile), LLI_STATUS_FILE,
		 SLURM_ID_HASH(job->jobid, job->stepid));

	// Open the lli file.
	fd = open(llifile, O_RDONLY);
	if (fd == -1) {
		// There's a timing issue for large jobs; this file could
		// already be cleaned up by the time we get here.
		// However, this is during a normal cleanup so no big deal.
		debug("open(%s) failed: %m", llifile);
		return SLURM_SUCCESS;
	}

	// Read the first byte (indicates starting)
	rv = read(fd, &status, sizeof(status));
	if (rv == -1) {
		// Log first: %m relies on errno, which close() could change
		CRAY_ERR("read failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// If the first byte is 0, we either aren't an MPI app or
	// it didn't make it past pmi_init, in any case, return success
	if (status == 0) {
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_SUCCESS;
	}

	// Seek to this task's byte: offset 0 is the starting byte, so the
	// per-task bytes begin at offset 1
	rv = lseek(fd, job->envtp->localid + 1, SEEK_SET);
	if (rv == -1) {
		CRAY_ERR("lseek failed: %m");
		TEMP_FAILURE_RETRY(close(fd));
		return SLURM_ERROR;
	}

	// Read the exiting byte; the file is not needed after this
	rv = read(fd, &status, sizeof(status));
	TEMP_FAILURE_RETRY(close(fd));
	if (rv == -1) {
		// Logged, but the function still reports success here
		CRAY_ERR("read failed: %m");
		return SLURM_SUCCESS;
	}

	// Check the result
	if (status == 0) {
		if (task->killed_by_cmd) {
			// We've been killed by request. User already knows
			return SLURM_SUCCESS;
		}

		verbose("step %u.%u task %u exited without calling "
			"PMI_Finalize()",
			job->jobid, job->stepid, task->gtid);
	}
	return SLURM_SUCCESS;
}
Ejemplo n.º 26
0
/*
 * Restore switch/cray state from "<dir_name>/switch_cray_state".
 *
 * Nothing is read unless "recover" is set. The state file is unlinked after
 * it has been read, and the function returns SLURM_SUCCESS on every path.
 */
int switch_p_libstate_restore(char *dir_name, bool recover)
{
#ifdef HAVE_NATIVE_CRAY
    char *state_data = NULL, *state_path;
    Buf state_buf = NULL;
    int status = SLURM_SUCCESS;
    int fd, capacity = 0, nread = 0, used = 0;

    xassert(dir_name != NULL);

    if (debug_flags & DEBUG_FLAG_SWITCH) {
        CRAY_INFO("restore from %s, recover %d",
                  dir_name,  (int) recover);
    }

    /* clean start, no recovery */
    if (!recover)
        return SLURM_SUCCESS;

    state_path = xstrdup(dir_name);
    xstrcat(state_path, "/switch_cray_state");

    fd = open(state_path, O_RDONLY);
    if (fd < 0) {
        CRAY_ERR("No %s file for switch/cray state recovery",
                 state_path);
        CRAY_ERR("Starting switch/cray with clean state");
        xfree(state_path);
        return SLURM_SUCCESS;
    }

    /*
     * Read the whole file. The buffer always keeps SWITCH_BUF_SIZE
     * bytes free beyond what has been consumed so far, so each read()
     * has room for a full chunk.
     */
    capacity = SWITCH_BUF_SIZE;
    state_data = xmalloc(capacity);
    for (;;) {
        nread = read(fd, &state_data[used], SWITCH_BUF_SIZE);
        if (nread < 0) {
            if (errno == EINTR)
                continue;	/* interrupted, try again */
            CRAY_ERR("Read error on %s, %m", state_path);
            status = SLURM_ERROR;
            break;
        }
        if (nread == 0)
            break;		/* end of file */
        used     += nread;
        capacity += nread;
        xrealloc(state_data, capacity);
    }
    close(fd);
    (void) unlink(state_path);	/* One chance to recover */
    xfree(state_path);

    if (status == SLURM_SUCCESS) {
        state_buf = create_buf(state_data, used);
        state_data = NULL;	/* now in buffer, don't xfree() */
        _state_read_buf(state_buf);
    }

    if (state_buf)
        free_buf(state_buf);
    xfree(state_data);
#endif
    return SLURM_SUCCESS;
}
/*
 * Function: _get_numa_nodes
 * Description:
 *  Reads the first line of the file "<path>/mems", parses it as a NUMA
 *  node string, and returns both the number of NUMA nodes listed there
 *  and an array holding their ids.
 *
 *  IN char* path -- The path to the directory containing the "mems" file.
 *
 *  OUT *cnt -- The number of NUMA nodes in the array
 *  OUT **numa_array -- An integer array containing the NUMA nodes.
 *                      This array must be xfreed by the caller.
 *                      Only set when the function succeeds.
 *
 * RETURN
 *  0 on success and -1 on failure.
 */
static int _get_numa_nodes(char *path, int *cnt, int32_t **numa_array) {
	struct bitmask *bm = NULL;
	int i, index, rc;
	ssize_t lsz;
	size_t sz = 0;
	char buffer[PATH_MAX];
	FILE *f = NULL;
	char *lin = NULL;

	rc = snprintf(buffer, sizeof(buffer), "%s/%s", path, "mems");
	if (rc < 0 || (size_t) rc >= sizeof(buffer)) {
		// Error or truncated path: opening it would be meaningless
		CRAY_ERR("snprintf failed. Return code: %d", rc);
		return -1;
	}

	f = fopen(buffer, "r");
	if (f == NULL ) {
		CRAY_ERR("Failed to open file %s: %m", buffer);
		return -1;
	}

	// Only the first line is needed, so the stream can be closed
	// right after reading it
	lsz = getline(&lin, &sz, f);
	fclose(f);
	if (lsz <= 0) {
		CRAY_ERR("Reading %s failed", buffer);
		// getline() may have allocated a buffer even on failure
		free(lin);
		return SLURM_ERROR;
	}

	// Strip the trailing newline, if any
	if (lin[lsz - 1] == '\n') {
		lin[lsz - 1] = '\0';
	}

	bm = numa_parse_nodestring(lin);
	if (bm == NULL ) {
		CRAY_ERR("Error numa_parse_nodestring:"
			 " Invalid node string: %s", lin);
		free(lin);
		return SLURM_ERROR;
	}
	free(lin);

	*cnt = numa_bitmask_weight(bm);
	if (*cnt == 0) {
		CRAY_ERR("No NUMA Nodes found");
		numa_free_nodemask(bm);
		return -1;
	}

	if (debug_flags & DEBUG_FLAG_TASK) {
		info("Bitmask %#lx size: %lu sizeof(*(bm->maskp)): %zd"
		     " weight: %u",
		     *(bm->maskp), bm->size, sizeof(*(bm->maskp)), *cnt);
	}

	*numa_array = xmalloc(*cnt * sizeof(int32_t));

	// Test each bit through the library accessor: the mask may span
	// more than one word, so shifting a single word by i is not valid
	// for every i below bm->size.
	index = 0;
	for (i = 0; ((unsigned long) i < bm->size) && (index < *cnt); i++) {
		if (numa_bitmask_isbitset(bm, i)) {
			if (debug_flags & DEBUG_FLAG_TASK) {
				info("(%s: %d: %s) NUMA Node %d is present",
				     THIS_FILE,	__LINE__, __FUNCTION__, i);
			}
			(*numa_array)[index++] = i;
		}
	}

	numa_free_nodemask(bm);

	return 0;
}
Ejemplo n.º 28
0
/*
 * Unpack saved switch/cray port state from "buffer" into the file-level
 * last_alloc_port and port_resv variables.
 *
 * port_mutex is held while those variables are updated and is released on
 * every exit path, including the unpack_error path that the safe_unpack*
 * calls jump to on a failed unpack.
 */
static void _state_read_buf(Buf buffer)
{
	uint16_t protocol_version = (uint16_t) NO_VAL;
	uint32_t min_port = 0, max_port = 0;
	int i;
	int locked = 0;	/* set while this function holds port_mutex */

	/* Validate state version */
	safe_unpack16(&protocol_version, buffer);
	debug3("Version in switch_cray header is %u", protocol_version);
	if (protocol_version < SLURM_MIN_PROTOCOL_VERSION) {
		error("******************************************************");
		error("Can't recover switch/cray state, incompatible version");
		error("******************************************************");
		return;
	}

	pthread_mutex_lock(&port_mutex);
	locked = 1;
	if (protocol_version >= SLURM_14_11_PROTOCOL_VERSION) {
		safe_unpack32(&min_port, buffer);
		safe_unpack32(&max_port, buffer);
		safe_unpack32(&last_alloc_port, buffer);
		/* make sure we are NULL here */
		FREE_NULL_BITMAP(port_resv);
		unpack_bit_str_hex(&port_resv, buffer);
	} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		/* Older format: one byte per port instead of a hex string */
		uint8_t port_set = 0;
		safe_unpack32(&min_port, buffer);
		safe_unpack32(&max_port, buffer);
		safe_unpack32(&last_alloc_port, buffer);
		/* make sure we are NULL here */
		FREE_NULL_BITMAP(port_resv);
		port_resv = bit_alloc(PORT_CNT);
		for (i = 0; i < PORT_CNT; i++) {
			safe_unpack8(&port_set, buffer);
			if (port_set)
				bit_set(port_resv, i);
		}
	}

	if (!port_resv || (bit_size(port_resv) != PORT_CNT)) {
		error("_state_read_buf: Reserve Port size was %d not %d, "
		      "reallocating",
		      port_resv ? bit_size(port_resv) : -1, PORT_CNT);
		port_resv = bit_realloc(port_resv, PORT_CNT);
	}
	pthread_mutex_unlock(&port_mutex);
	locked = 0;

	if ((min_port != MIN_PORT) || (max_port != MAX_PORT)) {
		error("******************************************************");
		error("Can not recover switch/cray state");
		error("Changed MIN_PORT (%u != %u) and/or MAX_PORT (%u != %u)",
		      min_port, MIN_PORT, max_port, MAX_PORT);
		error("******************************************************");
		return;
	}

	return;

unpack_error:
	/*
	 * This label is reached both before the lock is taken (version
	 * unpack) and while it is held (port data unpack).
	 */
	if (locked)
		pthread_mutex_unlock(&port_mutex);
	CRAY_ERR("unpack error");
	return;
}
Ejemplo n.º 29
0
/*
 * Get the pe to nid map, or NULL on error
 *
 * The returned array holds job->ntasks entries, one NID per PE, and is
 * allocated with xmalloc(); the caller owns it. When the job carries an
 * mpmd_set with a placement array, that array is copied as is. Otherwise
 * the map is built from the launch message's node list and per-node
 * global task ids.
 */
static int *_get_pe_nid_map(stepd_step_rec_t *job)
{
	size_t size;
	int *pe_nid_map = NULL;
	int cnt = 0, task, i, j, rc;
	int32_t *nodes = NULL;
	int tasks_to_launch_sum, nid;

	size = job->ntasks * sizeof(int);
	pe_nid_map = xmalloc(size);

	// If we have it, just copy the mpmd set information
	if (job->mpmd_set && job->mpmd_set->placement) {
		// mpmd_set->placement is an int * too so this works
		memcpy(pe_nid_map, job->mpmd_set->placement, size);
	} else {
		// Initialize to -1 so we can tell if we missed any
		for (i = 0; i < job->ntasks; i++) {
			pe_nid_map[i] = -1;
		}

		// Convert the node list to an array of nids
		rc = list_str_to_array(job->msg->complete_nodelist, &cnt,
				       &nodes);
		if (rc < 0) {
			xfree(pe_nid_map);
			return NULL;
		} else if (job->nnodes != cnt) {
			CRAY_ERR("list_str_to_array cnt %d expected %u",
				 cnt, job->nnodes);
			xfree(pe_nid_map);
			xfree(nodes);
			return NULL;
		}

		// Search the task id map for the values we need
		tasks_to_launch_sum = 0;
		for (i = 0; i < job->nnodes; i++) {
			tasks_to_launch_sum += job->msg->tasks_to_launch[i];
			for (j = 0; j < job->msg->tasks_to_launch[i]; j++) {
				task = job->msg->global_task_ids[i][j];
				// The id indexes pe_nid_map, so reject
				// anything outside [0, ntasks) rather than
				// writing past the allocation
				if (task < 0 || task >= job->ntasks) {
					CRAY_ERR("Invalid global task id %d"
						 " on node index %d",
						 task, i);
					xfree(pe_nid_map);
					xfree(nodes);
					return NULL;
				}
				pe_nid_map[task] = nodes[i];
			}
		}

		// If this is LAM/MPI only one task per node is launched,
		// NOT job->ntasks. So fill in the rest of the tasks
		// assuming a block distribution.
		// The nnodes > 0 test keeps nodes[0] from being read when
		// the node list is empty.
		if (job->nnodes > 0
			&& tasks_to_launch_sum == job->nnodes
			&& job->nnodes < job->ntasks) {
			nid = nodes[0]; // failsafe value
			for (i = 0; i < job->ntasks; i++) {
				if (pe_nid_map[i] > -1) {
					nid = pe_nid_map[i];
				} else {
					pe_nid_map[i] = nid;
				}
			}
		}
		xfree(nodes);

		// Make sure we didn't miss any tasks
		for (i = 0; i < job->ntasks; i++) {
			if (pe_nid_map[i] == -1) {
				CRAY_ERR("No NID for PE index %d", i);
				xfree(pe_nid_map);
				return NULL;
			}
		}
	}
	return pe_nid_map;
}