Esempio n. 1
0
/*
 * _task_cgroup_cpuset_dist_block - add to cpuset the hardware objects that a
 * block distribution assigns to the calling task.
 *
 * IN topology     - hwloc topology the objects are looked up in
 * IN hwtype       - hwloc object type used as binding granularity
 * IN req_hwtype   - forwarded unchanged to _add_hwloc_cpuset()
 * IN nobj         - exclusive upper bound on the object index used by the
 *                   plain block path
 * IN job          - step record; envtp->localid, cpus_per_task, task_dist and
 *                   job_core_spec are read
 * IN bind_verbose - non-zero to log the distribution being applied
 * IN/OUT cpuset   - bitmap that receives the selected objects
 *
 * RET XCGROUP_SUCCESS, or XCGROUP_ERROR when the per-core thread walk gives
 *     up without having served every task up to and including this one
 */
static int _task_cgroup_cpuset_dist_block(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, uint32_t nobj,
	stepd_step_rec_t *job, int bind_verbose, hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t core_loop, ntskip, npdist;
	uint32_t i, j, pfirst, plast;
	uint32_t taskid = job->envtp->localid;
	int hwdepth;
	uint32_t npus, ncores, nsockets;
	int spec_thread_cnt = 0;
	bitstr_t *spec_threads = NULL;
	uint32_t *thread_idx;
	uint32_t core_idx;
	bool core_fcyclic, core_block;

	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);

	core_block = ((job->task_dist & SLURM_DIST_COREMASK) ==
		      SLURM_DIST_COREBLOCK);
	core_fcyclic = ((job->task_dist & SLURM_DIST_COREMASK) ==
			SLURM_DIST_CORECFULL);

	if (bind_verbose) {
		info("task/cgroup: task[%u] using block distribution, "
		     "task_dist 0x%x", taskid, job->task_dist);
	}

	if ((hwloc_compare_types(hwtype, HWLOC_OBJ_PU) == 0) && !core_block) {
		/*
		 * Thread granularity with a non-block core distribution:
		 * walk the cores, handing out PUs one at a time.
		 *
		 * thread_idx[] is allocated here, on the only path that uses
		 * it, and released before either return below. Its entries
		 * are read before being written, so this relies on xmalloc()
		 * returning zeroed memory (assumed from Slurm's xmalloc
		 * contract; not visible in this file).
		 */
		thread_idx = xmalloc(ncores * sizeof(uint32_t));
		ntskip = taskid;
		npdist = job->cpus_per_task;

		i = 0;
		j = 0;
		core_idx = 0;
		core_loop = 0;
		/*
		 * i counts tasks already served, j the PUs handed to the
		 * current task. core_loop counts consecutive fruitless passes
		 * over the cores and bounds the walk.
		 */
		while (i < ntskip + 1 && core_loop < npdist + 1) {
			while ((core_idx < ncores) && (j < npdist)) {
				obj = hwloc_get_obj_below_by_type(
					topology, HWLOC_OBJ_CORE, core_idx,
					hwtype, thread_idx[core_idx]);
				if (obj != NULL) {
					thread_idx[core_idx]++;
					j++;
					/* only the last task served is ours */
					if (i == ntskip)
						_add_hwloc_cpuset(hwtype,
							req_hwtype, obj, taskid,
							bind_verbose, cpuset);
					if ((j < npdist) && core_fcyclic)
						core_idx++;
				} else {
					/* core exhausted, try the next one */
					core_idx++;
				}
			}
			if (j == npdist) {
				/* task fully served: move on to the next */
				i++;
				j = 0;
				core_idx++; /* bounds-checked by the while */
				core_loop = 0;
			} else {
				/* ran off the end: restart from core 0 */
				core_loop++;
				core_idx = 0;
			}
		}
		xfree(thread_idx);

		/* should never happen in normal scenario */
		if (core_loop > npdist) {
			error("task/cgroup: task[%u] infinite loop broken while "
			      "trying to provision compute elements using %s",
			      taskid, format_task_dist_states(job->task_dist));
			return XCGROUP_ERROR;
		}
		return XCGROUP_SUCCESS;
	}

	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		pfirst = taskid * job->cpus_per_task;
		plast = pfirst + job->cpus_per_task - 1;
	} else {
		/* sockets or ldoms granularity */
		pfirst = taskid;
		plast = pfirst;
	}

	hwdepth = hwloc_get_type_depth(topology, hwtype);
	if ((job->job_core_spec != (uint16_t) NO_VAL) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)  &&
	    (nsockets != 0)) {
		/* Skip specialized threads as needed */
		int bit, t, c, s;
		int cores = MAX(1, (ncores / nsockets));
		/*
		 * NOTE(review): this is PUs divided by cores-per-socket, not
		 * by the total core count, so on multi-socket nodes the bit
		 * index computed below can exceed npus. Confirm the intended
		 * stride against the code that reserves specialized threads.
		 */
		int threads = npus / cores;

		spec_thread_cnt = job->job_core_spec & (~CORE_SPEC_THREAD);
		spec_threads = bit_alloc(npus);
		for (t = threads - 1;
		     ((t >= 0) && (spec_thread_cnt > 0)); t--) {
			for (c = cores - 1;
			     ((c >= 0) && (spec_thread_cnt > 0)); c--) {
				for (s = nsockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
					bit = s * cores + c;
					bit = (bit * threads) + t;
					/* never index past the bitmap */
					if ((uint32_t) bit < npus)
						bit_set(spec_threads, bit);
					spec_thread_cnt--;
				}
			}
		}
		if (hwtype == HWLOC_OBJ_PU) {
			/*
			 * Push the first PU of this task past every
			 * specialized thread found at or before it.
			 * NOTE(review): plast is not shifted along with
			 * pfirst, so the range shrinks; confirm intended.
			 */
			for (bit = 0; bit <= pfirst && bit < npus; bit++) {
				if (bit_test(spec_threads, bit))
					pfirst++;
			}
		}
	}

	for (i = pfirst; i <= plast && i < nobj; i++) {
		obj = hwloc_get_obj_by_depth(topology, hwdepth, (int)i);
		_add_hwloc_cpuset(hwtype, req_hwtype, obj, taskid,
				  bind_verbose, cpuset);
	}

	if (spec_threads) {
		/* make sure no specialized thread ends up in the cpuset */
		for (i = 0; i < npus; i++) {
			if (bit_test(spec_threads, i))
				hwloc_bitmap_clr(cpuset, i);
		}
		FREE_NULL_BITMAP(spec_threads);
	}

	return XCGROUP_SUCCESS;
}
/*
 * _task_cgroup_cpuset_dist_cyclic - add to cpuset the hardware objects that a
 * cyclic (socket round-robin) distribution assigns to the calling task.
 *
 * IN topology     - hwloc topology the objects are looked up in
 * IN hwtype       - hwloc object type used as binding granularity
 * IN req_hwtype   - forwarded unchanged to _add_hwloc_cpuset()
 * IN job          - step record; envtp->localid, cpus_per_task and task_dist
 *                   are read
 * IN bind_verbose - non-zero to log the distribution being applied
 * IN/OUT cpuset   - bitmap that receives the selected objects
 *
 * RET XCGROUP_SUCCESS, or XCGROUP_ERROR when the socket walk gives up
 */
static int _task_cgroup_cpuset_dist_cyclic(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, stepd_step_rec_t *job, int bind_verbose,
	hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t *obj_idx;	/* next object index to try, per socket */
	uint32_t i, j, sock_idx, sock_loop, ntskip, npdist, nsockets;
	uint32_t taskid = job->envtp->localid;
	int sock_fcyclic;

	if (bind_verbose)
		info("task/cgroup: task[%u] using %s distribution "
		     "(task_dist=%u)", taskid,
		     format_task_dist_states(job->task_dist), job->task_dist);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	/*
	 * Entries are read before being written; this relies on xmalloc()
	 * returning zeroed memory (assumed from Slurm's xmalloc contract).
	 */
	obj_idx = xmalloc(nsockets * sizeof(uint32_t));

	/* every lower-numbered task must be served before this one */
	ntskip = taskid;
	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		npdist = job->cpus_per_task;
	} else {
		/* sockets or ldoms granularity */
		npdist = 1;
	}

	/* loop invariant: do consecutive objects of a task change socket? */
	sock_fcyclic = (((job->task_dist & SLURM_DIST_STATE_BASE) ==
			 SLURM_DIST_CYCLIC_CFULL) ||
			((job->task_dist & SLURM_DIST_STATE_BASE) ==
			 SLURM_DIST_BLOCK_CFULL));

	/*
	 * Skip objects for lower taskids, then add the ones belonging to the
	 * current task to cpuset. To prevent an infinite loop, never go more
	 * than npdist times around the available sockets without progress,
	 * which is the worst scenario we should afford here.
	 */
	i = 0;
	j = 0;
	sock_idx = 0;
	sock_loop = 0;
	while (i < ntskip + 1 && sock_loop < npdist + 1) {
		/*
		 * Fill one or multiple sockets using block mode, unless
		 * otherwise stated in the job->task_dist field.
		 */
		while ((sock_idx < nsockets) && (j < npdist)) {
			obj = hwloc_get_obj_below_by_type(
				topology, HWLOC_OBJ_SOCKET, sock_idx,
				hwtype, obj_idx[sock_idx]);
			if (obj != NULL) {
				obj_idx[sock_idx]++;
				j++;
				/* only the last task served is ours */
				if (i == ntskip)
					_add_hwloc_cpuset(hwtype, req_hwtype,
							  obj, taskid,
							  bind_verbose, cpuset);
				if ((j < npdist) && sock_fcyclic)
					sock_idx++;
			} else {
				/* socket exhausted, try the next one */
				sock_idx++;
			}
		}
		/*
		 * On success switch to the next task, starting with the next
		 * available socket; otherwise loop back from the first socket
		 * trying to find available slots.
		 */
		if (j == npdist) {
			i++;
			j = 0;
			sock_idx++; /* bounds-checked by the while */
			sock_loop = 0;
		} else {
			sock_loop++;
			sock_idx = 0;
		}
	}

	xfree(obj_idx);

	/* should never happen in normal scenario */
	if (sock_loop > npdist) {
		error("task/cgroup: task[%u] infinite loop broken while trying "
		      "to provision compute elements using %s", taskid,
		      format_task_dist_states(job->task_dist));
		return XCGROUP_ERROR;
	}
	return XCGROUP_SUCCESS;
}
Esempio n. 3
0
/*
 * _task_cgroup_cpuset_dist_cyclic - add to cpuset the hardware objects that a
 * cyclic distribution assigns to the calling task, honouring the socket and
 * core sub-distributions encoded in job->task_dist.
 *
 * IN topology     - hwloc topology the objects are looked up in
 * IN hwtype       - hwloc object type used as binding granularity
 * IN req_hwtype   - forwarded unchanged to _add_hwloc_cpuset()
 * IN job          - step record; envtp->localid, cpus_per_task, task_dist and
 *                   job_core_spec are read
 * IN bind_verbose - non-zero to log the distribution being applied
 * IN/OUT cpuset   - bitmap that receives the selected objects
 *
 * RET XCGROUP_SUCCESS, or XCGROUP_ERROR when the topology reports no socket
 *     or no core, or when the socket walk gives up
 */
static int _task_cgroup_cpuset_dist_cyclic(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, stepd_step_rec_t *job, int bind_verbose,
	hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t  s_ix;		/* socket index */
	uint32_t *c_ixc;	/* core index by socket (current taskid) */
	uint32_t *c_ixn;	/* core index by socket (next taskid) */
	uint32_t *t_ix;		/* thread index by core by socket */
	uint32_t npus, ncores, nsockets;
	uint32_t taskid = job->envtp->localid;
	int spec_thread_cnt = 0;
	bitstr_t *spec_threads = NULL;
	uint32_t obj_idxs[3], nthreads, cps, tpc, core_slot;
	uint32_t i, j, sock_loop, ntskip, npdist;
	bool core_cyclic, core_fcyclic, sock_fcyclic;

	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	nthreads = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_PU);
	/* same quantity under the name used by the core-spec code below */
	npus = nthreads;

	/* both counts are used as divisors below */
	if ((nsockets == 0) || (ncores == 0)) {
		error("task/cgroup: task[%u] no socket or core found in "
		      "topology", taskid);
		return XCGROUP_ERROR;
	}
	cps = ncores / nsockets;	/* cores per socket */
	tpc = nthreads / ncores;	/* threads per core */
	if (tpc == 0)
		tpc = 1;		/* divisor of the loop guard */

	sock_fcyclic = ((job->task_dist & SLURM_DIST_SOCKMASK) ==
			SLURM_DIST_SOCKCFULL);
	core_cyclic = ((job->task_dist & SLURM_DIST_COREMASK) ==
		       SLURM_DIST_CORECYCLIC);
	core_fcyclic = ((job->task_dist & SLURM_DIST_COREMASK) ==
			SLURM_DIST_CORECFULL);

	if (bind_verbose) {
		info("task/cgroup: task[%u] using %s distribution "
		     "(task_dist=0x%x)", taskid,
		     format_task_dist_states(job->task_dist), job->task_dist);
	}

	/*
	 * Entries of these arrays are read before being written; this relies
	 * on xmalloc() returning zeroed memory (assumed from Slurm's xmalloc
	 * contract).
	 */
	t_ix = xmalloc(ncores * sizeof(uint32_t));
	c_ixc = xmalloc(nsockets * sizeof(uint32_t));
	c_ixn = xmalloc(nsockets * sizeof(uint32_t));

	/* every lower-numbered task must be served before this one */
	ntskip = taskid;
	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		npdist = job->cpus_per_task;
	} else {
		/* sockets or ldoms granularity */
		npdist = 1;
	}

	if ((job->job_core_spec != (uint16_t) NO_VAL) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)) {
		/* Skip specialized threads as needed */
		int bit, t, c, s;
		int cores = ncores / nsockets;
		int threads;

		if (cores < 1)
			cores = 1;	/* divisor; fewer cores than sockets */
		/*
		 * NOTE(review): this is PUs divided by cores-per-socket, not
		 * by the total core count, so on multi-socket nodes the bit
		 * index computed below can exceed npus. Confirm the intended
		 * stride against the code that reserves specialized threads.
		 */
		threads = npus / cores;

		spec_thread_cnt = job->job_core_spec & (~CORE_SPEC_THREAD);
		spec_threads = bit_alloc(npus);
		for (t = threads - 1;
		     ((t >= 0) && (spec_thread_cnt > 0)); t--) {
			for (c = cores - 1;
			     ((c >= 0) && (spec_thread_cnt > 0)); c--) {
				for (s = nsockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
					bit = s * cores + c;
					bit = (bit * threads) + t;
					/* never index past the bitmap */
					if ((uint32_t) bit < npus)
						bit_set(spec_threads, bit);
					spec_thread_cnt--;
				}
			}
		}
		if (hwtype == HWLOC_OBJ_PU) {
			/*
			 * Count one extra slot to skip for every specialized
			 * thread found at or before the current skip count.
			 */
			for (bit = 0; bit <= ntskip && bit < npus; bit++) {
				if (bit_test(spec_threads, bit))
					ntskip++;
			}
		}
	}

	/*
	 * Skip objects for lower taskids, then add the ones belonging to the
	 * current task to cpuset. To prevent an infinite loop, bound the
	 * number of consecutive fruitless passes around the sockets.
	 */
	i = j = s_ix = sock_loop = 0;
	while ((i < ntskip + 1) && ((sock_loop / tpc) < npdist + 1)) {
		/*
		 * Fill one or multiple sockets using block mode, unless
		 * otherwise stated in the job->task_dist field.
		 */
		while ((s_ix < nsockets) && (j < npdist)) {
			obj = hwloc_get_obj_below_by_type(
				topology, HWLOC_OBJ_SOCKET, s_ix,
				hwtype, c_ixc[s_ix]);
			if (obj == NULL) {
				/* socket exhausted, try the next one */
				s_ix++;
				continue;
			}

			if (hwloc_compare_types(hwtype, HWLOC_OBJ_PU) < 0) {
				/* granularity is core or larger */
				c_ixc[s_ix]++;
				j++;
				if (i == ntskip)
					_add_hwloc_cpuset(hwtype, req_hwtype,
							  obj, taskid,
							  bind_verbose, cpuset);
				if ((j < npdist) && sock_fcyclic)
					s_ix++;
				continue;
			}

			/*
			 * Granularity is thread: resolve socket -> core ->
			 * thread explicitly. t_ix[] only has ncores entries,
			 * and c_ixc[s_ix] is not capped at cps on every path,
			 * so a slot outside the array is treated as "no such
			 * object" instead of being read or written.
			 */
			core_slot = (s_ix * cps) + c_ixc[s_ix];
			if (core_slot < ncores) {
				obj_idxs[0] = s_ix;
				obj_idxs[1] = c_ixc[s_ix];
				obj_idxs[2] = t_ix[core_slot];
				obj = hwloc_get_obj_below_array_by_type(
					topology, 3, obj_types, obj_idxs);
			} else {
				obj = NULL;
			}

			if (obj == NULL) {
				/* core exhausted: next core or next socket */
				c_ixc[s_ix]++;
				if (c_ixc[s_ix] == cps)
					s_ix++;
				continue;
			}

			t_ix[core_slot]++;
			j++;
			if (i == ntskip)
				_add_hwloc_cpuset(hwtype, req_hwtype, obj,
						  taskid, bind_verbose, cpuset);
			if (j < npdist) {
				if (core_cyclic) {
					c_ixn[s_ix] = c_ixc[s_ix] + 1;
				} else if (core_fcyclic) {
					c_ixc[s_ix]++;
					c_ixn[s_ix] = c_ixc[s_ix];
				}
				if (sock_fcyclic)
					s_ix++;
			}
		}
		/*
		 * On success switch to the next task, starting with the next
		 * available socket; otherwise loop back from the first socket
		 * trying to find available slots.
		 */
		if (j == npdist) {
			i++;
			j = 0;
			s_ix++; /* bounds-checked by the while */
			sock_loop = 0;
		} else {
			sock_loop++;
			s_ix = 0;
		}
	}
	xfree(t_ix);
	xfree(c_ixc);
	xfree(c_ixn);

	if (spec_threads) {
		/* make sure no specialized thread ends up in the cpuset */
		for (i = 0; i < npus; i++) {
			if (bit_test(spec_threads, i))
				hwloc_bitmap_clr(cpuset, i);
		}
		FREE_NULL_BITMAP(spec_threads);
	}

	/* should never happen in normal scenario */
	if (sock_loop > npdist) {
		error("task/cgroup: task[%u] infinite loop broken while trying "
		      "to provision compute elements using %s", taskid,
		      format_task_dist_states(job->task_dist));
		return XCGROUP_ERROR;
	}
	return XCGROUP_SUCCESS;
}
Esempio n. 4
0
/*
 * _task_cgroup_cpuset_dist_cyclic - add to cpuset the hardware objects that a
 * cyclic distribution assigns to the calling task, honouring the socket and
 * core sub-distributions encoded in job->task_dist. Objects whose
 * allowed_cpuset is empty are skipped.
 *
 * IN topology     - hwloc topology the objects are looked up in
 * IN hwtype       - hwloc object type used as binding granularity
 * IN req_hwtype   - forwarded unchanged to _add_hwloc_cpuset()
 * IN job          - step record; envtp->localid, cpus_per_task, task_dist and
 *                   job_core_spec are read
 * IN bind_verbose - non-zero to log the distribution being applied
 * IN/OUT cpuset   - bitmap that receives the selected objects
 *
 * RET XCGROUP_SUCCESS, or XCGROUP_ERROR when no socket or core is reported,
 *     or when the socket walk gives up
 */
static int _task_cgroup_cpuset_dist_cyclic(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, stepd_step_rec_t *job, int bind_verbose,
	hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t  s_ix;		/* socket index */
	uint32_t *c_ixc;	/* core index by socket (current taskid) */
	uint32_t *c_ixn;	/* core index by socket (next taskid) */
	uint32_t *t_ix;		/* thread index by core by socket */
	uint16_t npus = 0, nboards = 0, nthreads = 0, ncores = 0, nsockets = 0;
	uint32_t taskid = job->envtp->localid;
	int spec_thread_cnt = 0;
	bitstr_t *spec_threads = NULL;
	uint32_t obj_idxs[3], cps, tpc, core_slot;
	uint32_t i, j, sock_loop, ntskip, npdist;
	bool core_cyclic, core_fcyclic, sock_fcyclic;
	bool hwloc_success = true;

	/*
	 * We can't trust the slurmd_conf_t *conf here as we need actual
	 * hardware instead of whatever is possibly configured.  So we need to
	 * look it up again.
	 */
	if (get_cpuinfo(&npus, &nboards, &nsockets, &ncores, &nthreads,
			NULL, NULL, NULL) != SLURM_SUCCESS) {
		/*
		 * Fall back to use allocated resources, but this may result
		 * in incorrect layout due to a uneven task distribution
		 * (e.g. 4 cores on socket 0 and 3 cores on socket 1)
		 */
		nsockets = (uint16_t) hwloc_get_nbobjs_by_type(topology,
							HWLOC_OBJ_SOCKET);
		ncores = (uint16_t) hwloc_get_nbobjs_by_type(topology,
							HWLOC_OBJ_CORE);
		nthreads = (uint16_t) hwloc_get_nbobjs_by_type(topology,
							HWLOC_OBJ_PU);
		npus = (uint16_t) hwloc_get_nbobjs_by_type(topology,
							   HWLOC_OBJ_PU);
	} else {
		/*
		 * Translate cores-per-socket to total core count, etc.
		 * NOTE(review): the products are stored back into uint16_t
		 * and would wrap above 65535; presumably never reached.
		 */
		nsockets *= nboards;
		ncores *= nsockets;
		nthreads *= ncores;
	}

	/* both counts are used as divisors below */
	if ((nsockets == 0) || (ncores == 0))
		return XCGROUP_ERROR;
	cps = (ncores + nsockets - 1) / nsockets;  /* cores/socket, rounded up */
	tpc = (nthreads + ncores - 1) / ncores;	   /* threads/core, rounded up */
	if (tpc == 0)
		tpc = 1;	/* divisor of the loop guard (nthreads == 0) */

	sock_fcyclic = ((job->task_dist & SLURM_DIST_SOCKMASK) ==
			SLURM_DIST_SOCKCFULL);
	core_cyclic = ((job->task_dist & SLURM_DIST_COREMASK) ==
		       SLURM_DIST_CORECYCLIC);
	core_fcyclic = ((job->task_dist & SLURM_DIST_COREMASK) ==
			SLURM_DIST_CORECFULL);

	if (bind_verbose) {
		info("task/cgroup: task[%u] using %s distribution "
		     "(task_dist=0x%x)", taskid,
		     format_task_dist_states(job->task_dist), job->task_dist);
	}

	/*
	 * Entries of these arrays are read before being written; this relies
	 * on xmalloc() returning zeroed memory (assumed from Slurm's xmalloc
	 * contract).
	 */
	t_ix = xmalloc(ncores * sizeof(uint32_t));
	c_ixc = xmalloc(nsockets * sizeof(uint32_t));
	c_ixn = xmalloc(nsockets * sizeof(uint32_t));

	/* every lower-numbered task must be served before this one */
	ntskip = taskid;
	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		npdist = job->cpus_per_task;
	} else {
		/* sockets or ldoms granularity */
		npdist = 1;
	}

	if ((job->job_core_spec != NO_VAL16) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)) {
		/* Skip specialized threads as needed */
		int bit, t, c, s;
		int cores = (ncores + nsockets - 1) / nsockets;
		/*
		 * NOTE(review): this is PUs divided by cores-per-socket, not
		 * by the total core count, so on multi-socket nodes the bit
		 * index computed below can exceed npus. Confirm the intended
		 * stride against the code that reserves specialized threads.
		 */
		int threads = (npus + cores - 1) / cores;

		spec_thread_cnt = job->job_core_spec & (~CORE_SPEC_THREAD);
		spec_threads = bit_alloc(npus);
		for (t = threads - 1;
		     ((t >= 0) && (spec_thread_cnt > 0)); t--) {
			for (c = cores - 1;
			     ((c >= 0) && (spec_thread_cnt > 0)); c--) {
				for (s = nsockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
					bit = s * cores + c;
					bit = (bit * threads) + t;
					/* never index past the bitmap */
					if (bit < npus)
						bit_set(spec_threads, bit);
					spec_thread_cnt--;
				}
			}
		}
		if (hwtype == HWLOC_OBJ_PU) {
			/*
			 * Count one extra slot to skip for every specialized
			 * thread found at or before the current skip count.
			 */
			for (bit = 0; bit <= ntskip && bit < npus; bit++) {
				if (bit_test(spec_threads, bit))
					ntskip++;
			}
		}
	}

	/*
	 * Skip objects for lower taskids, then add the ones belonging to the
	 * current task to cpuset. To prevent an infinite loop, bound the
	 * number of consecutive fruitless passes around the sockets.
	 */
	i = j = s_ix = sock_loop = 0;
	while ((i < ntskip + 1) && ((sock_loop / tpc) < npdist + 1)) {
		/*
		 * Fill one or multiple sockets using block mode, unless
		 * otherwise stated in the job->task_dist field.
		 */
		while ((s_ix < nsockets) && (j < npdist)) {
			obj = hwloc_get_obj_below_by_type(
				topology, HWLOC_OBJ_SOCKET, s_ix,
				hwtype, c_ixc[s_ix]);
			/* the very first lookup failing is remembered */
			if ((obj == NULL) && (s_ix == 0) && (c_ixc[s_ix] == 0))
				hwloc_success = false;	/* Complete failure */
			if ((obj == NULL) ||
			    (hwloc_bitmap_first(obj->allowed_cpuset) == -1)) {
				/* nothing usable here, try the next socket */
				s_ix++;
				continue;
			}

			if (hwloc_compare_types(hwtype, HWLOC_OBJ_PU) < 0) {
				/* granularity is core or larger */
				c_ixc[s_ix]++;
				j++;
				if (i == ntskip)
					_add_hwloc_cpuset(hwtype, req_hwtype,
							  obj, taskid,
							  bind_verbose, cpuset);
				if ((j < npdist) && sock_fcyclic)
					s_ix++;
				continue;
			}

			/*
			 * Granularity is thread: resolve socket -> core ->
			 * thread explicitly. t_ix[] only has ncores entries,
			 * and c_ixc[s_ix] is not capped at cps on every path,
			 * so a slot outside the array is treated as "no such
			 * object" instead of being read or written.
			 */
			core_slot = (s_ix * cps) + c_ixc[s_ix];
			if (core_slot < ncores) {
				obj_idxs[0] = s_ix;
				obj_idxs[1] = c_ixc[s_ix];
				obj_idxs[2] = t_ix[core_slot];
				obj = hwloc_get_obj_below_array_by_type(
					topology, 3, obj_types, obj_idxs);
			} else {
				obj = NULL;
			}

			if ((obj == NULL) ||
			    (hwloc_bitmap_first(obj->allowed_cpuset) == -1)) {
				/* core exhausted: next core or next socket */
				c_ixc[s_ix]++;
				if (c_ixc[s_ix] == cps)
					s_ix++;
				continue;
			}

			t_ix[core_slot]++;
			j++;
			if (i == ntskip)
				_add_hwloc_cpuset(hwtype, req_hwtype, obj,
						  taskid, bind_verbose, cpuset);
			if (j < npdist) {
				if (core_cyclic) {
					c_ixn[s_ix] = c_ixc[s_ix] + 1;
				} else if (core_fcyclic) {
					c_ixc[s_ix]++;
					c_ixn[s_ix] = c_ixc[s_ix];
				}
				if (sock_fcyclic)
					s_ix++;
			}
		}
		/*
		 * On success switch to the next task, starting with the next
		 * available socket; otherwise loop back from the first socket
		 * trying to find available slots.
		 */
		if (j == npdist) {
			i++;
			j = 0;
			s_ix++; /* bounds-checked by the while */
			sock_loop = 0;
		} else {
			sock_loop++;
			s_ix = 0;
		}
	}
	xfree(t_ix);
	xfree(c_ixc);
	xfree(c_ixn);

	if (spec_threads) {
		/* make sure no specialized thread ends up in the cpuset */
		for (i = 0; i < npus; i++) {
			if (bit_test(spec_threads, i))
				hwloc_bitmap_clr(cpuset, i);
		}
		FREE_NULL_BITMAP(spec_threads);
	}

	/* should never happen in normal scenario */
	if ((sock_loop > npdist) && !hwloc_success) {
		/* hwloc_get_obj_below_by_type() fails if no CPU set
		 * configured, see hwloc documentation for details */
		error("task/cgroup: hwloc_get_obj_below_by_type() failing, "
		      "task/affinity plugin may be required to address bug "
		      "fixed in HWLOC version 1.11.5");
		return XCGROUP_ERROR;
	} else if (sock_loop > npdist) {
		char buf[128] = "";
		hwloc_bitmap_snprintf(buf, sizeof(buf), cpuset);
		error("task/cgroup: task[%u] infinite loop broken while trying "
		      "to provision compute elements using %s (bitmap:%s)",
		      taskid, format_task_dist_states(job->task_dist), buf);
		return XCGROUP_ERROR;
	}
	return XCGROUP_SUCCESS;
}