Example #1
int slurm_getaffinity(pid_t pid, size_t size, cpu_set_t *mask)
{
	int rval;
	char mstr[1 + CPU_SETSIZE / 4];

	CPU_ZERO(mask);
#ifdef SCHED_GETAFFINITY_THREE_ARGS
	rval = sched_getaffinity(pid, size, mask);
#else
	rval = sched_getaffinity(pid, mask);
#endif
	if (rval) {
		verbose("sched_getaffinity(%d,%zd,0x%s) failed with status %d",
				pid, size, cpuset_to_str(mask, mstr), rval);
	} else {
		debug3("sched_getaffinity(%d) = 0x%s",
		       pid, cpuset_to_str(mask, mstr));
	}
	return (rval);
}
/*
 * init() is called when the plugin is loaded, before any other functions
 *	are called.  Put global initialization here.
 */
extern int init (void)
{
	cpu_set_t cur_mask;
	char mstr[1 + CPU_SETSIZE / 4];

	slurm_getaffinity(0, sizeof(cur_mask), &cur_mask);
	cpuset_to_str(&cur_mask, mstr);
	verbose("%s loaded with CPU mask %s", plugin_name, mstr);

	return SLURM_SUCCESS;
}
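All of these snippets pass cpuset_to_str() a buffer declared as char mstr[1 + CPU_SETSIZE / 4]: one hex digit covers four CPUs, plus one byte for the terminating NUL. As a rough, hypothetical stand-in (SLURM's actual helper also suppresses leading zeroes), the conversion looks like this:

#define _GNU_SOURCE
#include <sched.h>

/* Hypothetical stand-in for cpuset_to_str(): render the mask as hex,
 * most significant nibble first, into a caller-supplied buffer of at
 * least 1 + CPU_SETSIZE / 4 bytes.  Returning the buffer lets it be
 * used directly inside printf()-style calls, as in the examples. */
static char *mask_to_hex(const cpu_set_t *mask, char *str)
{
	char *p = str;
	int base;

	for (base = CPU_SETSIZE - 4; base >= 0; base -= 4) {
		int val = 0;
		if (CPU_ISSET(base, mask))
			val |= 1;
		if (CPU_ISSET(base + 1, mask))
			val |= 2;
		if (CPU_ISSET(base + 2, mask))
			val |= 4;
		if (CPU_ISSET(base + 3, mask))
			val |= 8;
		*p++ = "0123456789ABCDEF"[val];
	}
	*p = '\0';
	return str;
}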
Example #3
int set_affinity(pid_t pid, cpu_set_t *mask)
{
	int ret;
	CPUSET_HEXSTRING(aff_hex);

	if ((ret = sched_setaffinity(pid, sizeof(cpu_set_t), mask)) == -1) {
		decode_error("could not set PID %d to affinity 0x%s",
			     pid,
			     cpuset_to_str(mask, aff_hex)
			    );
		return(ret);
	}
	return(0);
}
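For comparison, here is a minimal self-contained sketch of what set_affinity() boils down to, using only glibc calls (decode_error and CPUSET_HEXSTRING are schedtool helpers; perror stands in for them here):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t mask;

	/* Pin the calling process (pid 0 means "self") to CPU 0. */
	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
		perror("sched_setaffinity");
		return 1;
	}
	return 0;
}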
Example #4
int slurm_setaffinity(pid_t pid, size_t size, const cpu_set_t *mask)
{
	int rval;
	char mstr[1 + CPU_SETSIZE / 4];

#ifdef SCHED_GETAFFINITY_THREE_ARGS
	rval = sched_setaffinity(pid, size, mask);
#else
	rval = sched_setaffinity(pid, mask);
#endif
	if (rval) {
		verbose("sched_setaffinity(%d,%zd,0x%s) failed: %m",
			pid, size, cpuset_to_str(mask, mstr));
	}
	return (rval);
}
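One caveat around slurm_setaffinity(): sched_setaffinity() can succeed even when the kernel adjusts the requested mask (for instance when some requested CPUs are offline), which is why callers often read the mask back afterwards, as main() in Example #6 does. A sketch of that set-then-verify round trip (set_and_verify is a hypothetical helper, not a SLURM function):

#define _GNU_SOURCE
#include <sched.h>

/* Request a mask, then read back what the kernel actually applied.
 * Returns 0 if the mask took effect as requested, 1 if the kernel
 * adjusted it, -1 on error. */
static int set_and_verify(pid_t pid, const cpu_set_t *want)
{
	cpu_set_t got;

	if (sched_setaffinity(pid, sizeof(*want), want))
		return -1;
	if (sched_getaffinity(pid, sizeof(got), &got))
		return -1;
	return CPU_EQUAL(want, &got) ? 0 : 1;
}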
/* The job has specialized cores, synchronize user mask with available cores */
static void _validate_mask(launch_tasks_request_msg_t *req, char *avail_mask)
{
	char *new_mask = NULL, *save_ptr = NULL, *tok;
	cpu_set_t avail_cpus, task_cpus;
	bool superset = true;

	CPU_ZERO(&avail_cpus);
	(void) str_to_cpuset(&avail_cpus, avail_mask);
	tok = strtok_r(req->cpu_bind, ",", &save_ptr);
	while (tok) {
		int i, overlaps = 0;
		char mask_str[1 + CPU_SETSIZE / 4];
		CPU_ZERO(&task_cpus);
		(void) str_to_cpuset(&task_cpus, tok);
		for (i = 0; i < CPU_SETSIZE; i++) {
			if (!CPU_ISSET(i, &task_cpus))
				continue;
			if (CPU_ISSET(i, &avail_cpus)) {
				overlaps++;
			} else {
				CPU_CLR(i, &task_cpus);
				superset = false;
			}
		}
		if (overlaps == 0) {
			/* The task's CPU mask is completely invalid.
			 * Give it all allowed CPUs. */
			for (i = 0; i < CPU_SETSIZE; i++) {
				if (CPU_ISSET(i, &avail_cpus))
					CPU_SET(i, &task_cpus);
			}
		}
		cpuset_to_str(&task_cpus, mask_str);
		if (new_mask)
			xstrcat(new_mask, ",");
		xstrcat(new_mask, mask_str);
		tok = strtok_r(NULL, ",", &save_ptr);
	}

	if (!superset) {
		info("task/affinity: Ignoring user CPU binding outside of job "
		     "step allocation");
	}

	xfree(req->cpu_bind);
	req->cpu_bind = new_mask;
}
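The per-CPU loop in _validate_mask() is essentially a set intersection with a fallback. A condensed sketch of the same logic using glibc's CPU_AND and CPU_COUNT (clamp_to_avail is a hypothetical helper for illustration, not part of the plugin):

#define _GNU_SOURCE
#include <sched.h>

/* Clamp *task to the CPUs in *avail; if nothing survives, grant the
 * whole available set instead, mirroring _validate_mask() above. */
static void clamp_to_avail(cpu_set_t *task, const cpu_set_t *avail)
{
	cpu_set_t tmp;

	CPU_AND(&tmp, task, avail);
	if (CPU_COUNT(&tmp) == 0)
		tmp = *avail;	/* mask entirely invalid: all allowed CPUs */
	*task = tmp;
}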
Example #6
void slurm_chkaffinity(cpu_set_t *mask, slurmd_job_t *job, int statval)
{
	char *bind_type, *action, *status, *units;
	char mstr[1 + CPU_SETSIZE / 4];
	int task_gid = job->envtp->procid;
	int task_lid = job->envtp->localid;
	pid_t mypid = job->envtp->task_pid;

	if (!(job->cpu_bind_type & CPU_BIND_VERBOSE))
		return;

	if (statval)
		status = " FAILED";
	else
		status = "";

	if (job->cpu_bind_type & CPU_BIND_NONE) {
		action = "";
		units  = "";
		bind_type = "NONE";
	} else {
		action = " set";
		if (job->cpu_bind_type & CPU_BIND_TO_THREADS)
			units = "_threads";
		else if (job->cpu_bind_type & CPU_BIND_TO_CORES)
			units = "_cores";
		else if (job->cpu_bind_type & CPU_BIND_TO_SOCKETS)
			units = "_sockets";
		else if (job->cpu_bind_type & CPU_BIND_TO_LDOMS)
			units = "_ldoms";
		else
			units = "";
		if (job->cpu_bind_type & CPU_BIND_RANK) {
			bind_type = "RANK";
		} else if (job->cpu_bind_type & CPU_BIND_MAP) {
			bind_type = "MAP ";
		} else if (job->cpu_bind_type & CPU_BIND_MASK) {
			bind_type = "MASK";
		} else if (job->cpu_bind_type & CPU_BIND_LDRANK) {
			bind_type = "LDRANK";
		} else if (job->cpu_bind_type & CPU_BIND_LDMAP) {
			bind_type = "LDMAP ";
		} else if (job->cpu_bind_type & CPU_BIND_LDMASK) {
			bind_type = "LDMASK";
		} else if (job->cpu_bind_type & (~CPU_BIND_VERBOSE)) {
			bind_type = "UNK ";
		} else {
			action = "";
			bind_type = "NULL";
		}
	}

	fprintf(stderr, "cpu_bind%s=%s - "
			"%s, task %2u %2u [%u]: mask 0x%s%s%s\n",
			units, bind_type,
			conf->hostname,
			task_gid,
			task_lid,
			mypid,
			cpuset_to_str(mask, mstr),
			action,
			status);
}
int main(int argc, char *argv[])
{
	cpu_set_t new_mask, cur_mask;
	pid_t pid = 0;
	int opt, err;
	char mstr[1 + CPU_SETSIZE / 4];
	char cstr[7 * CPU_SETSIZE];
	int c_opt = 0;

	struct option longopts[] = {
		{ "pid",	0, NULL, 'p' },
		{ "cpu-list",	0, NULL, 'c' },
		{ "help",	0, NULL, 'h' },
		{ "version",	0, NULL, 'V' },
		{ NULL,		0, NULL, 0 }
	};

	while ((opt = getopt_long(argc, argv, "+pchV", longopts, NULL)) != -1) {
		int ret = 1;

		switch (opt) {
		case 'p':
			pid = atoi(argv[argc - 1]);
			break;
		case 'c':
			c_opt = 1;
			break;
		case 'V':
			printf("taskset version " VERSION "\n");
			return 0;
		case 'h':
			ret = 0;
		default:
			show_usage(argv[0]);
			return ret;
		}
	}

	if ((!pid && argc - optind < 2)
			|| (pid && (argc - optind < 1 || argc - optind > 2))) {
		show_usage(argv[0]);
		return 1;
	}

	if (pid) {
		if (sched_getaffinity(pid, sizeof (cur_mask), &cur_mask) < 0) {
			perror("sched_getaffinity");
			fprintf(stderr, "failed to get pid %d's affinity\n",
				pid);
			return 1;
		}
		if (c_opt)
			printf("pid %d's current affinity list: %s\n", pid,
				cpuset_to_cstr(&cur_mask, cstr));
		else
			printf("pid %d's current affinity mask: %s\n", pid,
				cpuset_to_str(&cur_mask, mstr));

		if (argc - optind == 1)
			return 0;
	}

	if (c_opt)
		err = cstr_to_cpuset(&new_mask, argv[optind]);
	else
		err = str_to_cpuset(&new_mask, argv[optind]);

	if (err) {
		if (c_opt)
			fprintf(stderr, "failed to parse CPU list %s\n",
				argv[optind]);
		else
			fprintf(stderr, "failed to parse CPU mask %s\n",
				argv[optind]);
		return 1;
	}

	if (sched_setaffinity(pid, sizeof (new_mask), &new_mask)) {
		perror("sched_setaffinity");
		fprintf(stderr, "failed to set pid %d's affinity.\n", pid);
		return 1;
	}

	if (sched_getaffinity(pid, sizeof (cur_mask), &cur_mask) < 0) {
		perror("sched_getaffinity");
		fprintf(stderr, "failed to get pid %d's affinity.\n", pid);
		return 1;
	}

	if (pid) {
		if (c_opt)
			printf("pid %d's new affinity list: %s\n", pid, 
		               cpuset_to_cstr(&cur_mask, cstr));
		else
			printf("pid %d's new affinity mask: %s\n", pid, 
		               cpuset_to_str(&cur_mask, mstr));
	} else {
		argv += optind + 1;
		execvp(argv[0], argv);
		perror("execvp");
		fprintf(stderr, "failed to execute %s\n", argv[0]);
		return 1;
	}

	return 0;
}
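The -c branches above use the list form ("0-3,8" rather than a hex mask), which is what cpuset_to_cstr() produces and why cstr is sized generously at 7 * CPU_SETSIZE bytes: roughly seven characters per possible CPU covers the worst case of many short comma-separated entries. A rough, hypothetical sketch of such a formatter:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/* Hypothetical cpuset_to_cstr()-style formatter: render the mask as a
 * comma-separated CPU list, collapsing consecutive runs into "a-b". */
static char *mask_to_list(const cpu_set_t *mask, char *str)
{
	char *p = str;
	int cpu = 0;

	*p = '\0';
	while (cpu < CPU_SETSIZE) {
		if (!CPU_ISSET(cpu, mask)) {
			cpu++;
			continue;
		}
		int first = cpu;
		while (cpu + 1 < CPU_SETSIZE && CPU_ISSET(cpu + 1, mask))
			cpu++;
		if (p != str)
			*p++ = ',';
		if (first == cpu)
			p += sprintf(p, "%d", first);
		else
			p += sprintf(p, "%d-%d", first, cpu);
		cpu++;
	}
	return str;
}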
/* Affinity should be set with sched_setaffinity() so that the user does
 * not have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	char mstr[1 + CPU_SETSIZE / 4];
	cpu_bind_type_t bind_type;
	cpu_set_t ts;
	hwloc_obj_t obj;
	hwloc_obj_type_t socket_or_node;
	hwloc_topology_t topology;
	hwloc_bitmap_t cpuset;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;
	int bind_verbose = 0;
	int rc = SLURM_SUCCESS, match;
	pid_t    pid = job->envtp->task_pid;
	size_t tssize;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus;

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);
	cpuset = hwloc_bitmap_alloc();

	int spec_threads = 0;

	if (job->batch) {
		jnpus = job->cpus;
		job->cpus_per_task = job->cpus;
	} else
		jnpus = jntasks * job->cpus_per_task;

	bind_type = job->cpu_bind_type;
	if ((conf->task_plugin_param & CPU_BIND_VERBOSE) ||
	    (bind_type & CPU_BIND_VERBOSE))
		bind_verbose = 1;

	if ( hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
	     hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET) ) {
		/* One socket contains multiple NUMA nodes,
		 * as on the AMD Opteron 6000 series etc.
		 * In such a case, use the NUMA node instead of the socket. */
		socket_or_node = HWLOC_OBJ_NODE;
	} else {
		socket_or_node = HWLOC_OBJ_SOCKET;
	}

	if (bind_type & CPU_BIND_NONE) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		/* free the hwloc objects allocated above before the early return */
		hwloc_bitmap_free(cpuset);
		hwloc_topology_destroy(topology);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding",taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding",taskid);
		req_hwtype = socket_or_node;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding",taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else if (bind_type & CPU_BIND_TO_BOARDS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "board level binding",taskid);
		req_hwtype = HWLOC_OBJ_GROUP;
	} else if (bind_type & bind_mode_ldom) {
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (bind_verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/*
	 * Perform the topology detection. It will only see allowed PUs.
	 * At the same time, detect the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available: with hyperthread support, the
	 * ntasks-per-core parameter can give each task access to more
	 * threads per core.
	 * Revert to machine granularity if no finer-grained granularity
	 * matching the request is found; this results in no affinity
	 * being applied.
	 * The detected granularity is used to find where to best place
	 * each task, then the cpu_bind option is used to relax the
	 * affinity constraint and use more PUs (i.e. use a core
	 * granularity to dispatch the tasks across the sockets, then give
	 * each task access to all the cores of its socket).
	 */
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       socket_or_node);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_NODE);
	//info("PU:%d CORE:%d SOCK:%d LDOM:%d", npus, ncores, nsockets, nldoms);

	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if ((job->job_core_spec != (uint16_t) NO_VAL) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)) {
		spec_threads = job->job_core_spec & (~CORE_SPEC_THREAD);
	}
	if (npus >= (jnpus + spec_threads) || bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks &&
	    bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = socket_or_node;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources). So there is no guarantee that each ldom will be
	 * populated with usable sockets. Add a simple check that at least
	 * ensures we have as many sockets as ldoms before moving to ldom
	 * granularity.
	 */
	if (nldoms >= jntasks &&
	    nsockets >= nldoms &&
	    bind_type & (CPU_BIND_TO_LDOMS | bind_mode_ldom)) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * If not enough objects to do the job, revert to no affinity mode
	 */
	if (hwloc_compare_types(hwtype, HWLOC_OBJ_MACHINE) == 0) {
		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity",taskid, hwloc_obj_type_string(hwtype));

	} else if ((hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) &&
		   (nobj < jnpus)) {
		info("task/cgroup: task[%u] not enough %s objects (%d < %d), "
		     "disabling affinity",
		     taskid, hwloc_obj_type_string(hwtype), nobj, jnpus);

	} else if (bind_type & bind_mode) {
		/* Explicit binding mode specified by the user
		 * Bind the taskid in accordance with the specified mode
		 */
		obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_MACHINE, 0);
		match = hwloc_bitmap_isequal(obj->complete_cpuset,
					     obj->allowed_cpuset);
		if ((job->job_core_spec == (uint16_t) NO_VAL) && !match) {
			info("task/cgroup: entire node must be allocated, "
			     "disabling affinity, task[%u]", taskid);
			fprintf(stderr, "Requested cpu_bind option requires "
				"entire node to be allocated; disabling "
				"affinity\n");
		} else {
			if (bind_verbose) {
				info("task/cgroup: task[%u] is requesting "
				     "explicit binding mode", taskid);
			}
			_get_sched_cpuset(topology, hwtype, req_hwtype, &ts,
					  job);
			tssize = sizeof(cpu_set_t);
			fstatus = SLURM_SUCCESS;
			if (job->job_core_spec != (uint16_t) NO_VAL)
				_validate_mask(taskid, obj, &ts);
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "mask 0x%s", taskid,
				      cpuset_to_str(&ts, mstr));
				error("sched_setaffinity rc = %d", rc);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] mask 0x%s",
				     taskid, cpuset_to_str(&ts, mstr));
			}
			_slurm_chkaffinity(&ts, job, rc);
		}
	} else {
		/* Bind the detected object to the taskid, respecting the
		 * granularity, using the designated or default distribution
		 * method (block or cyclic). */
		char *str;

		if (bind_verbose) {
			info("task/cgroup: task[%u] using %s granularity dist %u",
			     taskid, hwloc_obj_type_string(hwtype),
			     job->task_dist);
		}

		/* See srun man page for detailed information on --distribution
		 * option.
		 *
		 * You can see the equivalent code for the
		 * task/affinity plugin in
		 * src/plugins/task/affinity/dist_tasks.c, around line 368
		 */
		switch (job->task_dist & SLURM_DIST_NODESOCKMASK) {
		case SLURM_DIST_BLOCK_BLOCK:
		case SLURM_DIST_CYCLIC_BLOCK:
		case SLURM_DIST_PLANE:
			/* tasks are distributed in blocks within a plane */
			_task_cgroup_cpuset_dist_block(topology,
				hwtype, req_hwtype,
				nobj, job, bind_verbose, cpuset);
			break;
		case SLURM_DIST_ARBITRARY:
		case SLURM_DIST_BLOCK:
		case SLURM_DIST_CYCLIC:
		case SLURM_DIST_UNKNOWN:
			if (slurm_get_select_type_param()
			    & CR_CORE_DEFAULT_DIST_BLOCK) {
				_task_cgroup_cpuset_dist_block(topology,
					hwtype, req_hwtype,
					nobj, job, bind_verbose, cpuset);
				break;
			}
			/* We want to fall through here if we aren't doing a
			   default dist block.
			*/
		default:
			_task_cgroup_cpuset_dist_cyclic(topology,
				hwtype, req_hwtype,
				job, bind_verbose, cpuset);
			break;
		}

		hwloc_bitmap_asprintf(&str, cpuset);

		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset,
							 &ts, tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'", taskid, str);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] set taskset '%s'",
				     taskid, str);
			}
			_slurm_chkaffinity(&ts, job, rc);
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'",taskid,str);
			fstatus = SLURM_ERROR;
		}
		free(str);
	}

	/* Destroy hwloc objects */
	hwloc_bitmap_free(cpuset);
	hwloc_topology_destroy(topology);

	return fstatus;
#endif

}
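Stripped of the SLURM-specific plumbing, the hwloc core of this function is: load the topology, compute a target hwloc bitmap, convert it to a glibc cpu_set_t with hwloc_cpuset_to_glibc_sched_affinity(), and apply it with sched_setaffinity(). A minimal sketch of that skeleton, binding the calling process to the PUs of the first core, with error handling reduced to a single perror():

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <hwloc.h>
#include <hwloc/glibc-sched.h>

int main(void)
{
	hwloc_topology_t topology;
	hwloc_obj_t core;
	cpu_set_t ts;
	int rc = 1;

	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);

	/* Bind to all PUs of the first core, as a stand-in for the
	 * distribution logic above. */
	core = hwloc_get_obj_by_type(topology, HWLOC_OBJ_CORE, 0);
	if (core &&
	    hwloc_cpuset_to_glibc_sched_affinity(topology, core->cpuset,
						 &ts, sizeof(ts)) == 0 &&
	    sched_setaffinity(0, sizeof(ts), &ts) == 0)
		rc = 0;
	else
		perror("binding to first core failed");

	hwloc_topology_destroy(topology);
	return rc;
}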
Example #9
/* Affinity should be set with sched_setaffinity() so that the user does
 * not have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	char mstr[1 + CPU_SETSIZE / 4];
	cpu_bind_type_t bind_type;
	cpu_set_t ts;
	hwloc_obj_t obj;
	hwloc_obj_type_t socket_or_node;
	hwloc_topology_t topology;
	hwloc_bitmap_t cpuset;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;
	int bind_verbose = 0;
	int rc = SLURM_SUCCESS;
	pid_t    pid = job->envtp->task_pid;
	size_t tssize;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus = jntasks * job->cpus_per_task;

	bind_type = job->cpu_bind_type;
	if (conf->task_plugin_param & CPU_BIND_VERBOSE ||
	    bind_type & CPU_BIND_VERBOSE)
		bind_verbose = 1;

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);
	cpuset = hwloc_bitmap_alloc();

	if ( hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
	     hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET) ) {
		/* One socket contains multiple NUMA nodes,
		 * as on the AMD Opteron 6000 series etc.
		 * In such a case, use the NUMA node instead of the socket. */
		socket_or_node = HWLOC_OBJ_NODE;
	} else {
		socket_or_node = HWLOC_OBJ_SOCKET;
	}

	if (bind_type & CPU_BIND_NONE) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		/* free the hwloc objects allocated above before the early return */
		hwloc_bitmap_free(cpuset);
		hwloc_topology_destroy(topology);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding",taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding",taskid);
		req_hwtype = socket_or_node;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding",taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else if (bind_type & CPU_BIND_TO_BOARDS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "board level binding",taskid);
		req_hwtype = HWLOC_OBJ_GROUP;
	} else if (bind_type & bind_mode_ldom) {
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (bind_verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/*
	 * Perform the topology detection. It will only see allowed PUs.
	 * At the same time, detect the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available: with hyperthread support, the
	 * ntasks-per-core parameter can give each task access to more
	 * threads per core.
	 * Revert to machine granularity if no finer-grained granularity
	 * matching the request is found; this results in no affinity
	 * being applied.
	 * The detected granularity is used to find where to best place
	 * each task, then the cpu_bind option is used to relax the
	 * affinity constraint and use more PUs (i.e. use a core
	 * granularity to dispatch the tasks across the sockets, then give
	 * each task access to all the cores of its socket).
	 */
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       socket_or_node);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_NODE);

	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks &&
	    bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = socket_or_node;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources). So there is no guarantee that each ldom will be
	 * populated with usable sockets. Add a simple check that at least
	 * ensures we have as many sockets as ldoms before moving to ldom
	 * granularity.
	 */
	if (nldoms >= jntasks &&
	    nsockets >= nldoms &&
	    bind_type & (CPU_BIND_TO_LDOMS | bind_mode_ldom)) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * If not enough objects to do the job, revert to no affinity mode
	 */
	if (hwloc_compare_types(hwtype,HWLOC_OBJ_MACHINE) == 0) {

		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity",taskid,hwloc_obj_type_string(hwtype));

	} else if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0 &&
		   jnpus > nobj) {

		info("task/cgroup: task[%u] not enough %s objects, disabling "
		     "affinity",taskid,hwloc_obj_type_string(hwtype));

	} else if (bind_type & bind_mode) {
		/* Explicit binding mode specified by the user
		 * Bind the taskid in accordance with the specified mode
		 */
		obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_MACHINE, 0);
		if (!hwloc_bitmap_isequal(obj->complete_cpuset,
				obj->allowed_cpuset)) {
			info("task/cgroup: entire node must be allocated, "
			     "disabling affinity, task[%u]", taskid);
			fprintf(stderr, "Requested cpu_bind option requires "
				"entire node to be allocated; disabling "
				"affinity\n");
		} else {
			if (bind_verbose)
				info("task/cgroup: task[%u] is requesting "
					"explicit binding mode",taskid);
			_get_sched_cpuset(topology, hwtype, req_hwtype, &ts,
					job);
			tssize = sizeof(cpu_set_t);
			fstatus = SLURM_SUCCESS;
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
					"mask 0x%s", taskid,
					cpuset_to_str(&ts, mstr));
				fstatus = SLURM_ERROR;
			} else if (bind_verbose)
				info("task/cgroup: task[%u] mask 0x%s",
					taskid, cpuset_to_str(&ts, mstr));
			slurm_chkaffinity(&ts, job, rc);
		}
	} else {
		/* Bind the detected object to the taskid, respecting the
		 * granularity, using the designated or default distribution
		 * method (block or cyclic). */
		char *str;

		if (bind_verbose) {
			info("task/cgroup: task[%u] using %s granularity",
			     taskid,hwloc_obj_type_string(hwtype));
		}

		/* There are two "distributions,"  controlled by the
		 * -m option of srun and friends. The first is the
		 * distribution of tasks to nodes.  The second is the
		 * distribution of allocated cpus to tasks for
		 * binding.  This code is handling the second
		 * distribution.  Here's how the values get set, based
		 * on the value of -m
		 *
		 * SLURM_DIST_CYCLIC = srun -m cyclic
		 * SLURM_DIST_BLOCK = srun -m block
		 * SLURM_DIST_CYCLIC_CYCLIC = srun -m cyclic:cyclic
		 * SLURM_DIST_BLOCK_CYCLIC = srun -m block:cyclic
		 *
		 * In the first two cases, the user only specified the
		 * first distribution.  The second distribution
		 * defaults to cyclic.  In the second two cases, the
		 * user explicitly requested a second distribution of
		 * cyclic.  All four of these cases therefore correspond to a
		 * second distribution of cyclic, so we want to call
		 * _task_cgroup_cpuset_dist_cyclic.
		 *
		 * If the user explicitly specifies a second
		 * distribution of block, or if
		 * CR_CORE_DEFAULT_DIST_BLOCK is configured and the
		 * user does not explicitly specify a second
		 * distribution of cyclic, the second distribution is
		 * block, and we need to call
		 * _task_cgroup_cpuset_dist_block. In these cases,
		 * task_dist would be set to SLURM_DIST_CYCLIC_BLOCK
		 * or SLURM_DIST_BLOCK_BLOCK.
		 *
		 * You can see the equivalent code for the
		 * task/affinity plugin in
		 * src/plugins/task/affinity/dist_tasks.c, around line 384.
		 */
		switch (job->task_dist) {
		case SLURM_DIST_CYCLIC:
		case SLURM_DIST_BLOCK:
		case SLURM_DIST_CYCLIC_CYCLIC:
		case SLURM_DIST_BLOCK_CYCLIC:
			_task_cgroup_cpuset_dist_cyclic(
				topology, hwtype, req_hwtype,
				job, bind_verbose, cpuset);
			break;
		default:
			_task_cgroup_cpuset_dist_block(
				topology, hwtype, req_hwtype,
				nobj, job, bind_verbose, cpuset);
		}

		hwloc_bitmap_asprintf(&str, cpuset);

		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology,cpuset,
							 &ts,tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if ((rc = sched_setaffinity(pid,tssize,&ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'",taskid,str);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] taskset '%s' is set"
				     ,taskid,str);
			}
			slurm_chkaffinity(&ts, job, rc);
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'",taskid,str);
			fstatus = SLURM_ERROR;
		}
		free(str);
	}

	/* Destroy hwloc objects */
	hwloc_bitmap_free(cpuset);
	hwloc_topology_destroy(topology);

	return fstatus;
#endif

}
Example #10
/*
 Be more careful with at least the affinity call; someone may use an
 affinity-compiled version on a non-affinity kernel.
 This is getting more and more fu-gly.
 */
void print_process(pid_t pid)
{
	int policy, nice;
	struct sched_param_ex p;
	cpu_set_t aff_mask;
	CPUSET_HEXSTRING(aff_mask_hex);


	CPU_ZERO(&aff_mask);

	/* strict error checking not needed - it works or not. */
	errno = 0;
	if( ((policy=sched_getscheduler(pid)) < 0)
	    || (sched_getparam_ex(pid, sizeof(p), &p) < 0)
	    /* getpriority may successfully return negative values, so errno needs to be checked */
	    || ((nice=getpriority(PRIO_PROCESS, pid)) && errno)
	  ) {
		decode_error("could not get scheduling-information for PID %d", pid);

	} else {

		/* do custom output for unknown policy */
		if(! CHECK_RANGE_POLICY(policy)) {
			printf("PID %5d: PRIO %3d, POLICY %-5d <UNKNOWN>, NICE %3d",
			       pid,
			       p.sched_priority,
			       policy,
			       nice
			      );
		} else {

			printf("PID %5d: PRIO %3d, POLICY %-17s, NICE %3d",
			       pid,
			       p.sched_priority,
			       TAB[policy],
			       nice
			      );

			if (policy == SCHED_DEADLINE) {
				printf(", RUNTIME %Ldus DEADLINE %Ldus FLAGS 0x%04x"
				       ", CURR. RUNTIME %Ldus USED RUNTIME %Ldus",
				       tspec_to_us(&p.sched_runtime),
				       tspec_to_us(&p.sched_deadline),
				       p.sched_flags,
				       tspec_to_us(&p.curr_runtime),
				       tspec_to_us(&p.used_runtime));
			}
		}

		/*
		 * sched_getaffinity() seems to also return (int)4 on 2.6.8+
		 * on x86 when successful; this goes against the documentation.
		 */
		if(sched_getaffinity(pid, sizeof(aff_mask), &aff_mask) == -1) {
			/* error or -ENOSYS: simply ignore and reset errno! */
			errno = 0;
		} else {
			printf(", AFFINITY 0x%s", cpuset_to_str(&aff_mask, aff_mask_hex));
		}
		printf("\n");
	}
}
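The errno handling around getpriority() above deserves a note: -1 is a perfectly legal return value (nice levels run from -20 to 19), so the only reliable failure test is to clear errno before the call and check it afterwards. A stripped-down sketch of the pattern:

#include <errno.h>
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
	/* getpriority() may legitimately return a negative value, so
	 * errno, not the return value, signals failure. */
	errno = 0;
	int nice_val = getpriority(PRIO_PROCESS, getpid());
	if (nice_val == -1 && errno != 0)
		perror("getpriority");
	else
		printf("nice value: %d\n", nice_val);
	return 0;
}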
Example #11
int engine(struct engine_s *e)
{
	int ret=0;
	int i;

#ifdef DEBUG
	do {
		CPUSET_HEXSTRING(tmpaff);
		printf("Dumping mode: 0x%x\n", e->mode);
		printf("Dumping affinity: 0x%s\n", cpuset_to_str(&(e->aff_mask), tmpaff));
		printf("We have %d args to do\n", e->n);
		for(i=0;i < e->n; i++) {
			printf("Dump arg %d: %s\n", i, e->args[i]);
		}
	} while(0);
#endif

	/*
	 handle normal query/set operation:
	 set/query all given PIDs
	 */
	for(i=0; i < e->n; i++) {

		int pid, tmpret=0;
		cpu_set_t affi;

		CPU_ZERO(&affi);
		CPU_SET(0, &affi);

		/* if in MODE_EXEC skip check for PIDs */
		if(mode_set(e->mode, MODE_EXEC)) {
			pid=getpid();
			goto exec_mode_special;
		}

		if(! (isdigit( *(e->args[i])) ) ) {
			decode_error("Ignoring arg %s: is not a PID", e->args[i]);
			continue;
		}

		pid=atoi(e->args[i]);

	exec_mode_special:
		if(mode_set(e->mode, MODE_SETPOLICY)) {
			struct sched_param_ex p;

			p.sched_priority= e->prio;
			p.sched_runtime=us_to_tspec(e->rtime);
			p.sched_deadline=us_to_tspec(e->dline);
			p.sched_period=us_to_tspec(e->priod);
			p.sched_flags=e->flags;

			/*
			 * Accumulate possible errors; the return value of
			 * main() will indicate how many set-calls went wrong.
			 * set_process() returns -1 upon failure.
			 */
			tmpret=set_process(pid,e->policy,&p);
			ret += tmpret;

			/* don't proceed as something went wrong already */
			if(tmpret) {
				continue;
			}

		}

		if(mode_set(e->mode, MODE_NICE)) {
			tmpret=set_niceness(pid, e->nice);
			ret += tmpret;

			if(tmpret) {
				continue;
			}

		}

		if(mode_set(e->mode, MODE_AFFINITY)) {
			tmpret=set_affinity(pid, &(e->aff_mask));
			ret += tmpret;

			if(tmpret) {
				continue;
			}

		}

		/* and print process info when set, too */
		if(mode_set(e->mode, MODE_PRINT)) {
			print_process(pid);
		}


		/* EXECUTE: at the end */
		if(mode_set(e->mode, MODE_EXEC)) {

			char **new_argv=e->args;

			ret=execvp(*new_argv, new_argv);

			/* only reached on error */
			decode_error("schedtool: Could not exec %s", *new_argv);
			return(ret);
		}
	}
	/*
	 indicate how many errors we got; as ret is accumulated negative,
	 convert to positive
	 */
	return(abs(ret));
}