Пример #1
0
/*
 * Set the count of specialized cores at job start.
 *
 * IN cont_id    - id handed to the job_* calls (job_set_corespec,
 *                 job_getprimepid, job_set_affinity)
 * IN core_count - specialization count; NO_VAL16 means "not set" and the
 *                 CORE_SPEC_THREAD bit marks the value as a thread count
 *
 * Without HAVE_NATIVE_CRAY only the debug/timing logging runs.
 * The early returns on the Cray path bypass the timing log at the end.
 *
 * Return SLURM_SUCCESS on success (or when nothing needs to be specialized),
 * SLURM_ERROR if one of the job_* calls fails.
 */
extern int core_spec_p_set(uint64_t cont_id, uint16_t core_count)
{
	DEF_TIMERS;
	START_TIMER;
#if _DEBUG
	/* Decode the request into a unit name and a plain count for logging */
	const char *unit_name = "Cores";
	int unit_count = 0;
	if (core_count != NO_VAL16) {
		if (core_count & CORE_SPEC_THREAD) {
			unit_name  = "Threads";
			unit_count = core_count & (~CORE_SPEC_THREAD);
		} else {
			unit_count = core_count;
		}
	}
	info("core_spec_p_set(%"PRIu64") to %d %s",
	     cont_id, unit_count, unit_name);
#endif

#ifdef HAVE_NATIVE_CRAY
	struct job_set_affinity_info aff;
	pid_t prime_pid;
	int status;
	int attempt = 0;

	/* Nothing to do when no count was given or zero threads were asked */
	if ((core_count == NO_VAL16) || (core_count == CORE_SPEC_THREAD))
		return SLURM_SUCCESS;

	/* Drop the thread flag; only the numeric count is passed on */
	core_count &= (~CORE_SPEC_THREAD);

	/*
	 * Set the core spec information. EINVAL is retried (one second
	 * apart) because there is a small timing window during preemption
	 * when two core spec jobs can be running at once. Any other errno,
	 * or success, ends the loop immediately.
	 *
	 * NOTE(review): status is only assigned inside this loop, so this
	 * relies on CORE_SPEC_RETRIES being at least 1 -- confirm.
	 */
	while (attempt < CORE_SPEC_RETRIES) {
		if (attempt > 0)
			sleep(1);

		errno = 0;
		status = job_set_corespec(cont_id, core_count, NULL);
		if ((status == 0) || (errno != EINVAL))
			break;
		attempt++;
	}
	if (status != 0) {
		debug("job_set_corespec(%"PRIu64", %"PRIu16") failed: %m",
		      cont_id, core_count);
		return SLURM_ERROR;
	}

	/* job_set_affinity needs a pid that belongs to the job */
	prime_pid = job_getprimepid(cont_id);
	if (prime_pid < 0) {
		error("job_getprimepid(%"PRIu64") returned %d: %m",
		      cont_id, (int)prime_pid);
		return SLURM_ERROR;
	}

	/*
	 * Apply the core specialization with job_set_affinity.
	 * JOB_AFFINITY_NONE tells the kernel to not alter the process'
	 * affinity unless required (the process is only allowed to run
	 * on cores that will be specialized).
	 */
	memset(&aff, 0, sizeof(aff));
	aff.cpu_list = JOB_AFFINITY_NONE;
	status = job_set_affinity(cont_id, prime_pid, &aff);
	if (status == 0) {
		/* Success may still carry a message; log it and release it */
		if (aff.message != NULL) {
			info("job_set_affinity(%"PRIu64", %zu): %s",
			     cont_id, (size_t)prime_pid, aff.message);
			free(aff.message);
		}
	} else {
		if (aff.message == NULL) {
			error("job_set_affinity(%"PRIu64", %zu) failed: %m",
			      cont_id, (size_t)prime_pid);
		} else {
			error("job_set_affinity(%"PRIu64", %zu) failed %s: %m",
			      cont_id, (size_t)prime_pid, aff.message);
			free(aff.message);
		}
		return SLURM_ERROR;
	}
#endif
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_TIME_CRAY) {
		INFO_LINE("call took: %s", TIME_STR);
	}

	/*
	 * The code that used to live here is now performed by
	 * switch_p_job_step_{pre,post}_suspend()
	 */
	return SLURM_SUCCESS;
}
Пример #2
0
/*
 * Set the count of specialized cores at job start.
 *
 * IN cont_id    - id handed to the job_* calls (job_set_corespec,
 *                 job_attachpid, job_set_affinity)
 * IN core_count - number of cores to specialize; (uint16_t) NO_VAL or 0
 *                 means there is nothing to set up
 *
 * Without HAVE_NATIVE_CRAY this only (optionally) logs the request.
 *
 * Return SLURM_SUCCESS on success (or when nothing needs to be specialized),
 * SLURM_ERROR if one of the checked job_* calls fails.
 */
extern int core_spec_p_set(uint64_t cont_id, uint16_t core_count)
{
#if _DEBUG
	info("core_spec_p_set(%"PRIu64") to %u", cont_id, core_count);
#endif

#ifdef HAVE_NATIVE_CRAY
	struct job_set_affinity_info aff;
	pid_t self_pid;
	int status;
	int attempt = 0;

	/* Skip core spec setup when no specialized cores were requested */
	if ((core_count == 0) || (core_count == (uint16_t) NO_VAL))
		return SLURM_SUCCESS;

	/*
	 * Set the core spec information. EINVAL is retried (one second
	 * apart) because there is a small timing window during preemption
	 * when two core spec jobs can be running at once. Any other errno,
	 * or success, ends the loop immediately.
	 *
	 * NOTE(review): status is only assigned inside this loop, so this
	 * relies on CORE_SPEC_RETRIES being at least 1 -- confirm.
	 */
	while (attempt < CORE_SPEC_RETRIES) {
		if (attempt > 0)
			sleep(1);

		errno = 0;
		status = job_set_corespec(cont_id, core_count, NULL);
		if ((status == 0) || (errno != EINVAL))
			break;
		attempt++;
	}
	if (status != 0) {
		error("job_set_corespec(%"PRIu64", %"PRIu16") failed: %m",
		      cont_id, core_count);
		return SLURM_ERROR;
	}

	/*
	 * Slurm detaches the slurmstepd from the job, so this process is
	 * temporarily reattached so that job_set_affinity doesn't mess up
	 * one of the task's affinity settings.
	 */
	self_pid = getpid();
	if (job_attachpid(self_pid, cont_id) == (jid_t)-1) {
		error("job_attachpid(%zu, %"PRIu64") failed: %m",
		      (size_t)self_pid, cont_id);
		return SLURM_ERROR;
	}

	/*
	 * Apply the core specialization with job_set_affinity. The cpu list
	 * is NONE because Slurm handles its own task->cpu binding.
	 */
	memset(&aff, 0, sizeof(aff));
	aff.cpu_list = JOB_AFFINITY_NONE;
	status = job_set_affinity(cont_id, self_pid, &aff);
	if (status != 0) {
		/* Log first so that %m still reflects job_set_affinity */
		if (aff.message == NULL) {
			error("job_set_affinity(%"PRIu64", %zu) failed: %m",
			      cont_id, (size_t)self_pid);
		} else {
			error("job_set_affinity(%"PRIu64", %zu) failed %s: %m",
			      cont_id, (size_t)self_pid, aff.message);
			free(aff.message);
		}
	} else if (aff.message != NULL) {
		/* Success may still carry a message; log it and release it */
		info("job_set_affinity(%"PRIu64", %zu): %s",
		     cont_id, (size_t)self_pid, aff.message);
		free(aff.message);
	}

	/* Undo the temporary attach on both the success and failure paths */
	job_detachpid(self_pid);
	if (status != 0)
		return SLURM_ERROR;
#endif
	/*
	 * The code that used to live here is now performed by
	 * switch_p_job_step_{pre,post}_suspend()
	 */
	return SLURM_SUCCESS;
}