/****** binding_support/get_processor_ids() ******************************
*  NAME
*     get_processor_ids() -- Get internal processor ids for a specific core.
*
*  SYNOPSIS
*     bool get_processor_ids(int socket_number, int core_number, int**
*     proc_ids, int* amount)
*
*  FUNCTION
*     Get the internal processor ids for a given core (specified by a socket,
*     core pair).
*
*  INPUTS
*     int socket_number - Logical socket number (starting at 0 without holes)
*     int core_number   - Logical core number on the socket (starting at 0 without holes)
*
*  OUTPUTS
*     int** proc_ids    - Array of internal processor ids.
*     int* amount       - Size of the proc_ids array.
*
*  RESULT
*     bool - Returns true when processor ids were found, otherwise false.
*
*  NOTES
*     MT-NOTE: get_processor_ids() is MT safe
*
*******************************************************************************/
bool get_processor_ids(int socket_number, int core_number, int** proc_ids, int* amount)
{
#if HAVE_HWLOC
   int i, count;
   hwloc_obj_t pu, parent;
   struct hwloc_obj **children;
   hwloc_obj_t core =
      hwloc_get_obj_below_by_type(sge_hwloc_topology, HWLOC_OBJ_SOCKET,
                                  socket_number, HWLOC_OBJ_CORE, core_number);
   if (core)
      pu = hwloc_get_obj_below_by_type(sge_hwloc_topology, HWLOC_OBJ_CORE,
                                       core->logical_index, HWLOC_OBJ_PU, 0);
   else
      return false;
   if (!pu)   /* no processing unit found below this core */
      return false;
   parent = pu->parent;
   count = parent->arity;
   if (count <= 0) return false;
   children = parent->children;
   (*amount) = count;
   (*proc_ids) = (int *) sge_malloc(count * sizeof(int));
   for (i = 0; i < count; i++)
      (*proc_ids)[i] = children[i]->os_index;
   return true;
#else
   return false;
#endif
}
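/* Usage sketch (hypothetical, not part of the original source):
   get_processor_ids() hands back a heap array that the caller owns.
   printf() assumes <stdio.h> is included at the top of this file, and
   sge_malloc() is assumed to be a plain malloc() wrapper, so free()
   releases the array. */
static void example_print_core_pus(int socket_number, int core_number)
{
   int *proc_ids = NULL;
   int amount = 0;
   int i;

   if (!get_processor_ids(socket_number, core_number, &proc_ids, &amount))
      return;   /* core not present in the topology */
   for (i = 0; i < amount; i++)
      printf("socket %d core %d -> PU os_index %d\n",
             socket_number, core_number, proc_ids[i]);
   free(proc_ids);
}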
/****** binding_support/get_number_of_threads() **************************************
*  NAME
*     get_number_of_threads() -- Get number of threads a specific core supports.
*
*  SYNOPSIS
*     int get_number_of_threads(int socket_number, int core_number)
*
*  FUNCTION
*     Returns the number of threads a specific core supports.
*
*  INPUTS
*     int socket_number - Logical socket number starting at 0.
*     int core_number   - Logical core number on socket starting at 0.
*
*  RESULT
*     int - Number of threads the specified core supports.
*
*  NOTES
*     MT-NOTE: get_number_of_threads() is MT safe
*
*******************************************************************************/
int get_number_of_threads(int socket_number, int core_number) {
#if HAVE_HWLOC
   hwloc_obj_t core =
     hwloc_get_obj_below_by_type(sge_hwloc_topology, HWLOC_OBJ_SOCKET,
                                 socket_number, HWLOC_OBJ_CORE, core_number);
   return count_type_under(core, HWLOC_OBJ_PU);
#else
   return 0;
#endif
}
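/* Sketch (hypothetical, not in the original source): walk every socket and
   core with the helpers from this file and report the per-core thread count.
   get_number_of_sockets() and get_number_of_cores() are the sibling helpers
   used elsewhere in this file; printf() assumes <stdio.h>. */
static void example_dump_thread_counts(void)
{
   int socket, core;

   for (socket = 0; socket < get_number_of_sockets(); socket++)
      for (core = 0; core < get_number_of_cores(socket); core++)
         printf("socket %d core %d: %d thread(s)\n",
                socket, core, get_number_of_threads(socket, core));
}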
/*
 * Distribute cpus to the task using cyclic distribution across sockets
 */
static int _task_cgroup_cpuset_dist_cyclic(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, slurmd_job_t *job, int bind_verbose,
	hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t *obj_idx;
	uint32_t i, sock_idx, npskip, npdist, nsockets;
	uint32_t taskid = job->envtp->localid;

	if (bind_verbose)
		info("task/cgroup: task[%u] using cyclic distribution, "
		     "task_dist %u", taskid, job->task_dist);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	obj_idx = xmalloc(nsockets * sizeof(uint32_t));

	if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		npskip = taskid * job->cpus_per_task;
		npdist = job->cpus_per_task;
	} else {
		/* sockets or ldoms granularity */
		npskip = taskid;
		npdist = 1;
	}

	/* skip objs for lower taskids */
	i = 0;
	sock_idx = 0;
	while (i < npskip) {
		while ((sock_idx < nsockets) && (i < npskip)) {
			obj = hwloc_get_obj_below_by_type(
				topology, HWLOC_OBJ_SOCKET, sock_idx,
				hwtype, obj_idx[sock_idx]);
			if (obj != NULL) {
				obj_idx[sock_idx]++;
				i++;
			}
			sock_idx++;
		}
		if (i < npskip)
			sock_idx = 0;
	}

	/* distribute objs cyclically across sockets */
	i = npdist;
	while (i > 0) {
		while ((sock_idx < nsockets) && (i > 0)) {
			obj = hwloc_get_obj_below_by_type(
				topology, HWLOC_OBJ_SOCKET, sock_idx,
				hwtype, obj_idx[sock_idx]);
			if (obj != NULL) {
				obj_idx[sock_idx]++;
				_add_cpuset(hwtype, req_hwtype, obj, taskid,
					    bind_verbose, cpuset);
				i--;
			}
			sock_idx++;
		}
		sock_idx = 0;
	}
	xfree(obj_idx);
	return XCGROUP_SUCCESS;
}
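/* Standalone sketch (hypothetical values, no hwloc needed): on a uniform
   topology the cyclic skip/distribute walk above reduces to a closed form.
   Counting objects across all tasks, object number k lands on socket
   k % nsockets at per-socket index k / nsockets, and task t starts at
   k = t * cpus_per_task.  printf() assumes <stdio.h>. */
static void cyclic_demo(unsigned taskid, unsigned cpus_per_task,
			unsigned nsockets)
{
	unsigned npskip = taskid * cpus_per_task; /* objs used by lower tasks */
	unsigned k;

	for (k = npskip; k < npskip + cpus_per_task; k++)
		printf("task %u -> socket %u, core index %u\n",
		       taskid, k % nsockets, k / nsockets);
}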
static int _task_cgroup_cpuset_dist_block(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, uint32_t nobj,
	stepd_step_rec_t *job, int bind_verbose, hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t core_loop, ntskip, npdist;
	uint32_t i, j, pfirst, plast;
	uint32_t taskid = job->envtp->localid;
	int hwdepth;
	uint32_t npus, ncores, nsockets;
	int spec_thread_cnt = 0;
	bitstr_t *spec_threads = NULL;

	uint32_t *thread_idx;
	uint32_t core_idx;
	bool core_fcyclic, core_block;

	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);

	core_block = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_COREBLOCK ? true : false;
	core_fcyclic = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_CORECFULL ? true : false;

	if (bind_verbose) {
		info("task/cgroup: task[%u] using block distribution, "
		     "task_dist 0x%x", taskid, job->task_dist);
	}

	if ((hwloc_compare_types(hwtype, HWLOC_OBJ_PU) == 0) && !core_block) {
		thread_idx = xmalloc(ncores * sizeof(uint32_t));
		ntskip = taskid;
		npdist = job->cpus_per_task;

		i = 0; j = 0;
		core_idx = 0;
		core_loop = 0;
		while (i < ntskip + 1 && core_loop < npdist + 1) {
			while ((core_idx < ncores) && (j < npdist)) {
				obj = hwloc_get_obj_below_by_type(
					topology, HWLOC_OBJ_CORE, core_idx,
					hwtype, thread_idx[core_idx]);
				if (obj != NULL) {
					thread_idx[core_idx]++;
					j++;
					if (i == ntskip)
						_add_hwloc_cpuset(hwtype,
							req_hwtype, obj, taskid,
							bind_verbose, cpuset);
					if ((j < npdist) && core_fcyclic)
						core_idx++;
				} else {
					core_idx++;
				}
			}
			if (j == npdist) {
				i++; j = 0;
				core_idx++; // no validity check, handled by the while
				core_loop = 0;
			} else {
				core_loop++;
				core_idx = 0;
			}
		}
		xfree(thread_idx);

		/* should never happen in normal scenario */
		if (core_loop > npdist) {
			error("task/cgroup: task[%u] infinite loop broken while "
			      "trying to provision compute elements using %s",
			      taskid, format_task_dist_states(job->task_dist));
			return XCGROUP_ERROR;
		} else
			return XCGROUP_SUCCESS;
	}

	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		pfirst = taskid * job->cpus_per_task ;
		plast = pfirst + job->cpus_per_task - 1;
	} else {
		/* sockets or ldoms granularity */
		pfirst = taskid;
		plast = pfirst;
	}

	hwdepth = hwloc_get_type_depth(topology, hwtype);
	if ((job->job_core_spec != (uint16_t) NO_VAL) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)  &&
	    (nsockets != 0)) {
		/* Skip specialized threads as needed */
		int i, t, c, s;
		int cores = MAX(1, (ncores / nsockets));
		int threads = npus / cores;
		spec_thread_cnt = job->job_core_spec & (~CORE_SPEC_THREAD);
		spec_threads = bit_alloc(npus);
		for (t = threads - 1;
		     ((t >= 0) && (spec_thread_cnt > 0)); t--) {
			for (c = cores - 1;
			     ((c >= 0) && (spec_thread_cnt > 0)); c--) {
				for (s = nsockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
					i = s * cores + c;
					i = (i * threads) + t;
					bit_set(spec_threads, i);
					spec_thread_cnt--;
				}
			}
		}
		if (hwtype == HWLOC_OBJ_PU) {
			for (i = 0; i <= pfirst && i < npus; i++) {
				if (bit_test(spec_threads, i))
					pfirst++;
			};
		}
	}

	for (i = pfirst; i <= plast && i < nobj ; i++) {
		obj = hwloc_get_obj_by_depth(topology, hwdepth, (int)i);
		_add_hwloc_cpuset(hwtype, req_hwtype, obj, taskid,
			    bind_verbose, cpuset);
	}

	if (spec_threads) {
		for (i = 0; i < npus; i++) {
			if (bit_test(spec_threads, i)) {
				hwloc_bitmap_clr(cpuset, i);
			}
		};
		FREE_NULL_BITMAP(spec_threads);
	}

	return XCGROUP_SUCCESS;
}
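/* Sketch (hypothetical): with core or thread granularity the block
   distribution above is just a contiguous slice -- task t owns objects
   [t * cpus_per_task, (t + 1) * cpus_per_task - 1], clipped to nobj, exactly
   as the pfirst/plast loop does.  printf() assumes <stdio.h>. */
static void block_demo(unsigned taskid, unsigned cpus_per_task, unsigned nobj)
{
	unsigned pfirst = taskid * cpus_per_task;
	unsigned plast = pfirst + cpus_per_task - 1;
	unsigned i;

	for (i = pfirst; i <= plast && i < nobj; i++)
		printf("task %u -> object %u\n", taskid, i);
}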
static int _task_cgroup_cpuset_dist_cyclic(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, stepd_step_rec_t *job, int bind_verbose,
	hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t  s_ix;		/* socket index */
	uint32_t *c_ixc;	/* core index by socket (current taskid) */
	uint32_t *c_ixn;	/* core index by socket (next taskid) */
	uint32_t *t_ix;		/* thread index by core by socket */
	uint32_t npus, ncores, nsockets;
	uint32_t taskid = job->envtp->localid;
	int spec_thread_cnt = 0;
	bitstr_t *spec_threads = NULL;

	uint32_t obj_idxs[3], nthreads, cps,
		 tpc, i, j, sock_loop, ntskip, npdist;
	bool core_cyclic, core_fcyclic, sock_fcyclic;

	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_CORE);
	nthreads = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_PU);
	cps = ncores/nsockets;
	tpc = nthreads/ncores;

	sock_fcyclic = (job->task_dist & SLURM_DIST_SOCKMASK) ==
		SLURM_DIST_SOCKCFULL ? true : false;
	core_cyclic = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_CORECYCLIC ? true : false;
	core_fcyclic = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_CORECFULL ? true : false;

	if (bind_verbose) {
		info("task/cgroup: task[%u] using %s distribution "
		     "(task_dist=0x%x)", taskid,
		     format_task_dist_states(job->task_dist), job->task_dist);
	}

	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);

	t_ix = xmalloc(ncores * sizeof(uint32_t));
	c_ixc = xmalloc(nsockets * sizeof(uint32_t));
	c_ixn = xmalloc(nsockets * sizeof(uint32_t));

	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		ntskip = taskid;
		npdist = job->cpus_per_task;
	} else {
		/* sockets or ldoms granularity */
		ntskip = taskid;
		npdist = 1;
	}
	if ((job->job_core_spec != (uint16_t) NO_VAL) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)){
		/* Skip specialized threads as needed */
		int i, t, c, s;
		int cores = ncores / nsockets;
		int threads = npus / cores;
		spec_thread_cnt = job->job_core_spec & (~CORE_SPEC_THREAD);
		spec_threads = bit_alloc(npus);
		for (t = threads - 1;
		     ((t >= 0) && (spec_thread_cnt > 0)); t--) {
			for (c = cores - 1;
			     ((c >= 0) && (spec_thread_cnt > 0)); c--) {
				for (s = nsockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
					i = s * cores + c;
					i = (i * threads) + t;
					bit_set(spec_threads, i);
					spec_thread_cnt--;
				}
			}
		}
		if (hwtype == HWLOC_OBJ_PU) {
			for (i = 0; i <= ntskip && i < npus; i++) {
				if (bit_test(spec_threads, i))
					ntskip++;
			};
		}
	}

	/* skip objs for lower taskids, then add them to the
	   current task cpuset. To prevent infinite loop, check
	   that we do not loop more than npdist times around the available
	   sockets, which is the worst scenario we should afford here. */
	i = j = s_ix = sock_loop = 0;
	while (i < ntskip + 1 && (sock_loop/tpc) < npdist + 1) {
		/* fill one or multiple sockets using block mode, unless
		   otherwise stated in the job->task_dist field */
		while ((s_ix < nsockets) && (j < npdist)) {
			obj = hwloc_get_obj_below_by_type(
				topology, HWLOC_OBJ_SOCKET, s_ix,
				hwtype, c_ixc[s_ix]);
			if (obj != NULL) {
				if (hwloc_compare_types(hwtype, HWLOC_OBJ_PU)
									>= 0) {
					/* granularity is thread */
					obj_idxs[0]=s_ix;
					obj_idxs[1]=c_ixc[s_ix];
					obj_idxs[2]=t_ix[(s_ix*cps)+c_ixc[s_ix]];
					obj = hwloc_get_obj_below_array_by_type(
						topology, 3, obj_types, obj_idxs);
					if (obj != NULL) {
						t_ix[(s_ix*cps)+c_ixc[s_ix]]++;
						j++;
						if (i == ntskip)
							_add_hwloc_cpuset(hwtype,
							req_hwtype, obj, taskid,
							bind_verbose, cpuset);
						if (j < npdist) {
							if (core_cyclic) {
								c_ixn[s_ix] =
								c_ixc[s_ix] + 1;
							} else if (core_fcyclic){
								c_ixc[s_ix]++;
								c_ixn[s_ix] =
								c_ixc[s_ix];
							}
							if (sock_fcyclic)
								s_ix++;
						}
					} else {
						c_ixc[s_ix]++;
						if (c_ixc[s_ix] == cps)
							s_ix++;
					}
				} else {
					/* granularity is core or larger */
					c_ixc[s_ix]++;
					j++;
					if (i == ntskip)
						_add_hwloc_cpuset(hwtype,
							req_hwtype, obj, taskid,
						  	bind_verbose, cpuset);
					if ((j < npdist) && (sock_fcyclic))
						s_ix++;
				}
			} else
				s_ix++;
		}
		/* if it succeeds, switch to the next task, starting
		 * with the next available socket, otherwise, loop back
		 * from the first socket trying to find available slots. */
		if (j == npdist) {
			i++;
			j = 0;
			s_ix++; // no validity check, handled by the while
			sock_loop = 0;
		} else {
			sock_loop++;
			s_ix = 0;
		}
	}
	xfree(t_ix);
	xfree(c_ixc);
	xfree(c_ixn);

	if (spec_threads) {
		for (i = 0; i < npus; i++) {
			if (bit_test(spec_threads, i)) {
				hwloc_bitmap_clr(cpuset, i);
			}
		};
		FREE_NULL_BITMAP(spec_threads);
	}

	/* should never happen in normal scenario */
	if (sock_loop > npdist) {
		error("task/cgroup: task[%u] infinite loop broken while trying "
		      "to provision compute elements using %s", taskid,
		      format_task_dist_states(job->task_dist));
		return XCGROUP_ERROR;
	} else
		return XCGROUP_SUCCESS;
}
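/* Sketch (hypothetical): the thread cursor t_ix above keeps one counter per
   core in a flat array, indexed exactly as in the loop:
   socket * cores_per_socket + core.  E.g. with cps = 4, socket 1 core 2
   maps to slot 6. */
static unsigned flat_core_index(unsigned s_ix, unsigned c_ix, unsigned cps)
{
	return (s_ix * cps) + c_ix; /* matches t_ix[(s_ix*cps)+c_ixc[s_ix]] */
}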
static int _task_cgroup_cpuset_dist_cyclic(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, stepd_step_rec_t *job, int bind_verbose,
	hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t *obj_idx;
	uint32_t i, j, sock_idx, sock_loop, ntskip, npdist, nsockets;
	uint32_t taskid = job->envtp->localid;

	if (bind_verbose)
		info("task/cgroup: task[%u] using %s distribution "
		     "(task_dist=%u)", taskid,
		     format_task_dist_states(job->task_dist), job->task_dist);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	obj_idx = xmalloc(nsockets * sizeof(uint32_t));

	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		ntskip = taskid;
		npdist = job->cpus_per_task;
	} else {
		/* sockets or ldoms granularity */
		ntskip = taskid;
		npdist = 1;
	}

	/* skip objs for lower taskids, then add them to the
	   current task cpuset. To prevent infinite loop, check
	   that we do not loop more than npdist times around the available
	   sockets, which is the worst scenario we should afford here. */
	i = 0; j = 0;
	sock_idx = 0;
	sock_loop = 0;
	while (i < ntskip + 1 && sock_loop < npdist + 1) {
		/* fill one or multiple sockets using block mode, unless
		   otherwise stated in the job->task_dist field */
		while ((sock_idx < nsockets) && (j < npdist)) {
			obj = hwloc_get_obj_below_by_type(
				topology, HWLOC_OBJ_SOCKET, sock_idx,
				hwtype, obj_idx[sock_idx]);
			if (obj != NULL) {
				obj_idx[sock_idx]++;
				j++;
				if (i == ntskip)
					_add_hwloc_cpuset(hwtype, req_hwtype,
							  obj, taskid,
							  bind_verbose, cpuset);
				if ((j < npdist) &&
				    (((job->task_dist & SLURM_DIST_STATE_BASE) ==
				      SLURM_DIST_CYCLIC_CFULL) ||
				     ((job->task_dist & SLURM_DIST_STATE_BASE) ==
				      SLURM_DIST_BLOCK_CFULL)))
					sock_idx++;
			} else {
				sock_idx++;
			}
		}
		/* if it succeeds, switch to the next task, starting
		   with the next available socket, otherwise, loop back
		   from the first socket trying to find available slots. */
		if (j == npdist) {
			i++; j = 0;
			sock_idx++; // no validity check, handled by the while
			sock_loop = 0;
		} else {
			sock_loop++;
			sock_idx = 0;
		}
	}

	xfree(obj_idx);

	/* should never happen in normal scenario */
	if (sock_loop > npdist) {
		error("task/cgroup: task[%u] infinite loop broken while trying "
		      "to provision compute elements using %s", taskid,
		      format_task_dist_states(job->task_dist));
		return XCGROUP_ERROR;
	} else
		return XCGROUP_SUCCESS;
}
static int _task_cgroup_cpuset_dist_cyclic(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, stepd_step_rec_t *job, int bind_verbose,
	hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t  s_ix;		/* socket index */
	uint32_t *c_ixc;	/* core index by socket (current taskid) */
	uint32_t *c_ixn;	/* core index by socket (next taskid) */
	uint32_t *t_ix;		/* thread index by core by socket */
	uint16_t npus = 0, nboards = 0, nthreads = 0, ncores = 0, nsockets = 0;
	uint32_t taskid = job->envtp->localid;
	int spec_thread_cnt = 0;
	bitstr_t *spec_threads = NULL;
	uint32_t obj_idxs[3], cps, tpc, i, j, sock_loop, ntskip, npdist;
	bool core_cyclic, core_fcyclic, sock_fcyclic;
	bool hwloc_success = true;

	/*
	 * We can't trust the slurmd_conf_t *conf here as we need actual
	 * hardware instead of whatever is possibly configured.  So we need to
	 * look it up again.
	 */
	if (get_cpuinfo(&npus, &nboards, &nsockets, &ncores, &nthreads,
			NULL, NULL, NULL) != SLURM_SUCCESS) {
		/*
		 * Fall back to using the allocated resources, but this may
		 * result in an incorrect layout due to an uneven task
		 * distribution (e.g. 4 cores on socket 0 and 3 cores on
		 * socket 1)
		 */
		nsockets = (uint16_t) hwloc_get_nbobjs_by_type(topology,
							HWLOC_OBJ_SOCKET);
		ncores = (uint16_t) hwloc_get_nbobjs_by_type(topology,
							HWLOC_OBJ_CORE);
		nthreads = (uint16_t) hwloc_get_nbobjs_by_type(topology,
							HWLOC_OBJ_PU);
		npus = (uint16_t) hwloc_get_nbobjs_by_type(topology,
							   HWLOC_OBJ_PU);
	} else {
		/* Translate cores-per-socket to total core count, etc. */
		nsockets *= nboards;
		ncores *= nsockets;
		nthreads *= ncores;
	}

	if ((nsockets == 0) || (ncores == 0))
		return XCGROUP_ERROR;
	cps = (ncores + nsockets - 1) / nsockets;
	tpc = (nthreads + ncores - 1) / ncores;

	sock_fcyclic = (job->task_dist & SLURM_DIST_SOCKMASK) ==
		SLURM_DIST_SOCKCFULL ? true : false;
	core_cyclic = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_CORECYCLIC ? true : false;
	core_fcyclic = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_CORECFULL ? true : false;

	if (bind_verbose) {
		info("task/cgroup: task[%u] using %s distribution "
		     "(task_dist=0x%x)", taskid,
		     format_task_dist_states(job->task_dist), job->task_dist);
	}

	t_ix = xmalloc(ncores * sizeof(uint32_t));
	c_ixc = xmalloc(nsockets * sizeof(uint32_t));
	c_ixn = xmalloc(nsockets * sizeof(uint32_t));

	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		ntskip = taskid;
		npdist = job->cpus_per_task;
	} else {
		/* sockets or ldoms granularity */
		ntskip = taskid;
		npdist = 1;
	}
	if ((job->job_core_spec != NO_VAL16) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)) {
		/* Skip specialized threads as needed */
		int i, t, c, s;
		int cores = (ncores + nsockets - 1) / nsockets;
		int threads = (npus + cores - 1) / cores;
		spec_thread_cnt = job->job_core_spec & (~CORE_SPEC_THREAD);
		spec_threads = bit_alloc(npus);
		for (t = threads - 1;
		     ((t >= 0) && (spec_thread_cnt > 0)); t--) {
			for (c = cores - 1;
			     ((c >= 0) && (spec_thread_cnt > 0)); c--) {
				for (s = nsockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
					i = s * cores + c;
					i = (i * threads) + t;
					bit_set(spec_threads, i);
					spec_thread_cnt--;
				}
			}
		}
		if (hwtype == HWLOC_OBJ_PU) {
			for (i = 0; i <= ntskip && i < npus; i++) {
				if (bit_test(spec_threads, i))
					ntskip++;
			};
		}
	}

	/* skip objs for lower taskids, then add them to the
	   current task cpuset. To prevent infinite loop, check
	   that we do not loop more than npdist times around the available
	   sockets, which is the worst scenario we should afford here. */
	i = j = s_ix = sock_loop = 0;
	while (i < ntskip + 1 && (sock_loop/tpc) < npdist + 1) {
		/* fill one or multiple sockets using block mode, unless
		   otherwise stated in the job->task_dist field */
		while ((s_ix < nsockets) && (j < npdist)) {
			obj = hwloc_get_obj_below_by_type(
				topology, HWLOC_OBJ_SOCKET, s_ix,
				hwtype, c_ixc[s_ix]);
			if ((obj == NULL) && (s_ix == 0) && (c_ixc[s_ix] == 0))
				hwloc_success = false;	/* Complete failure */
			if ((obj != NULL) &&
			    (hwloc_bitmap_first(obj->allowed_cpuset) != -1)) {
				if (hwloc_compare_types(hwtype, HWLOC_OBJ_PU)
									>= 0) {
					/* granularity is thread */
					obj_idxs[0]=s_ix;
					obj_idxs[1]=c_ixc[s_ix];
					obj_idxs[2]=t_ix[(s_ix*cps)+c_ixc[s_ix]];
					obj = hwloc_get_obj_below_array_by_type(
						topology, 3, obj_types, obj_idxs);
					if ((obj != NULL) &&
					    (hwloc_bitmap_first(
					     obj->allowed_cpuset) != -1)) {
						t_ix[(s_ix*cps)+c_ixc[s_ix]]++;
						j++;
						if (i == ntskip)
							_add_hwloc_cpuset(hwtype,
							req_hwtype, obj, taskid,
							bind_verbose, cpuset);
						if (j < npdist) {
							if (core_cyclic) {
								c_ixn[s_ix] =
								c_ixc[s_ix] + 1;
							} else if (core_fcyclic){
								c_ixc[s_ix]++;
								c_ixn[s_ix] =
								c_ixc[s_ix];
							}
							if (sock_fcyclic)
								s_ix++;
						}
					} else {
						c_ixc[s_ix]++;
						if (c_ixc[s_ix] == cps)
							s_ix++;
					}
				} else {
					/* granularity is core or larger */
					c_ixc[s_ix]++;
					j++;
					if (i == ntskip)
						_add_hwloc_cpuset(hwtype,
							req_hwtype, obj, taskid,
						  	bind_verbose, cpuset);
					if ((j < npdist) && (sock_fcyclic))
						s_ix++;
				}
			} else
				s_ix++;
		}
		/* if it succeeds, switch to the next task, starting
		 * with the next available socket, otherwise, loop back
		 * from the first socket trying to find available slots. */
		if (j == npdist) {
			i++;
			j = 0;
			s_ix++; // no validity check, handled by the while
			sock_loop = 0;
		} else {
			sock_loop++;
			s_ix = 0;
		}
	}
	xfree(t_ix);
	xfree(c_ixc);
	xfree(c_ixn);

	if (spec_threads) {
		for (i = 0; i < npus; i++) {
			if (bit_test(spec_threads, i)) {
				hwloc_bitmap_clr(cpuset, i);
			}
		};
		FREE_NULL_BITMAP(spec_threads);
	}

	/* should never happen in normal scenario */
	if ((sock_loop > npdist) && !hwloc_success) {
		/* hwloc_get_obj_below_by_type() fails if no CPU set
		 * configured, see hwloc documentation for details */
		error("task/cgroup: hwloc_get_obj_below_by_type() failing, "
		      "task/affinity plugin may be required to address bug "
		      "fixed in HWLOC version 1.11.5");
		return XCGROUP_ERROR;
	} else if (sock_loop > npdist) {
		char buf[128] = "";
		hwloc_bitmap_snprintf(buf, sizeof(buf), cpuset);
		error("task/cgroup: task[%u] infinite loop broken while trying "
		      "to provision compute elements using %s (bitmap:%s)",
		      taskid, format_task_dist_states(job->task_dist), buf);
		return XCGROUP_ERROR;
	} else
		return XCGROUP_SUCCESS;
}
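/* Standalone sketch (hypothetical): the specialized-thread loops above
   reserve the highest-numbered threads first.  The bit index of
   (socket s, core c, thread t) is (s * cores + c) * threads + t, i.e. PUs
   are numbered block-wise per core.  printf() assumes <stdio.h>. */
static void spec_thread_demo(int nsockets, int cores, int threads,
			     int spec_thread_cnt)
{
	int t, c, s;

	for (t = threads - 1; (t >= 0) && (spec_thread_cnt > 0); t--)
		for (c = cores - 1; (c >= 0) && (spec_thread_cnt > 0); c--)
			for (s = nsockets - 1;
			     (s >= 0) && (spec_thread_cnt > 0); s--) {
				printf("reserve PU %d\n",
				       (s * cores + c) * threads + t);
				spec_thread_cnt--;
			}
}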
/****** shepherd_binding/binding_explicit() *****************************************
*  NAME
*     binding_explicit() -- Binds current process to specified CPU cores. 
*
*  SYNOPSIS
*     bool binding_explicit(const int* list_of_sockets, const int samount,
*     const int* list_of_cores, const int camount, const binding_type_t type)
*
*  FUNCTION
*     Binds the current process to the cores specified by <socket>,<core>
*     tuples. The tuples are given as a list of sockets and a list of cores;
*     elements at the same position in the two lists form one tuple, so
*     both lists must have the same length.
*
*  INPUTS
*     int* list_of_sockets - List of sockets in the same order as list of cores. 
*     int samount          - Length of the list of sockets. 
*     int* list_of_cores   - List of cores in the same order as list of sockets. 
*     int camount          - Length of the list of cores. 
*     binding_type_t type  - Type of binding ( set | env | pe ).
*
*  RESULT
*     bool - true when the current process was bound as specified by the
*            input parameters, otherwise false
*
*  NOTES
*     MT-NOTE: binding_explicit() is not MT safe 
*
*******************************************************************************/
static bool binding_explicit(const int* list_of_sockets, const int samount, 
   const int* list_of_cores, const int camount, const binding_type_t type)
{
   /* return value: successfully bound or not */
   bool bound = false;

#if HAVE_HWLOC
   /* the socket list and the core list must have the same length */
   if (camount != samount) {
      shepherd_trace("binding_explicit: bug: number of sockets != number of cores");
      return false;
   }

   if (list_of_sockets == NULL || list_of_cores == NULL) {
      shepherd_trace("binding_explicit: wrong input values");
      return false;
   }
   
   if (has_core_binding() == true) {
      
      if (has_topology_information()) {
         /* bitmask for processors to turn on and off */
         hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();

         /* index over the <socket,core> tuples */
         int pr_id_ctr;

         /* Resolve each <socket,core> tuple to its core object and merge
            the core's cpuset into the mask. If any tuple cannot be
            resolved, clean up and return false. */
         for (pr_id_ctr = 0; pr_id_ctr < camount; pr_id_ctr++) {
            hwloc_obj_t core =
              hwloc_get_obj_below_by_type(sge_hwloc_topology,
                                          HWLOC_OBJ_SOCKET,
                                          list_of_sockets[pr_id_ctr],
                                          HWLOC_OBJ_CORE,
                                          list_of_cores[pr_id_ctr]);
            if (!core) {
               hwloc_bitmap_free(cpuset);
               return false;
            }
            hwloc_bitmap_or(cpuset, cpuset, core->cpuset);
         }

         if (type == BINDING_TYPE_PE) {
            
            /* rankfile is created */

         } else if (type == BINDING_TYPE_ENV) {
            /* set the environment variable */
            if (create_binding_env(cpuset) == true) {
               shepherd_trace("binding_explicit: SGE_BINDING env var created");
            } else {
               shepherd_trace("binding_explicit: problems while creating SGE_BINDING env");
            }
         } else {
            /* do the core binding for the current process with the mask */
            if (bind_process_to_mask(cpuset) == true) {
               /* binding was successful */
               bound = true;
            } else {
               /* binding failed - bound stays false */
               shepherd_trace("binding_explicit: bind_process_to_mask was not successful");
            }   
         }

         hwloc_bitmap_free(cpuset);
         /* Fixme:  Maybe free topology at this stage, but it probably
            doesn't use a significant amount of space.  */
      } else {
         /* has no topology information */
         shepherd_trace("binding_explicit: no topology information");
      }  

   } else {
      /* has no core binding ability */
      shepherd_trace("binding_explicit: host does not support core binding");
   }   
#endif  /* HAVE_HWLOC */
   return bound;
}
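/* Usage sketch (hypothetical, not in the original source): bind the shepherd
   to socket 0 / core 1 and socket 1 / core 0.  The lists are parallel:
   element k of both arrays forms one <socket,core> tuple.  BINDING_TYPE_SET
   is assumed here to be the plain "bind now" enumerator of binding_type_t. */
static bool example_bind_two_cores(void)
{
   const int sockets[] = {0, 1};
   const int cores[]   = {1, 0};

   return binding_explicit(sockets, 2, cores, 2, BINDING_TYPE_SET);
}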
/****** shepherd_binding/binding_set_striding() *************************************
*  NAME
*     binding_set_striding() -- Binds current process to cores.
*
*  SYNOPSIS
*     bool binding_set_striding(int first_socket, int first_core,
*     int number_of_cores, int offset, int stepsize, const binding_type_t type)
*
*  FUNCTION
*     Performs a core binding for the calling process according to the
*     'striding' strategy. The first core used is specified by first_socket
*     (beginning with 0) and first_core (beginning with 0). If first_core is
*     greater than the number of cores available on first_socket, the next
*     socket is examined and first_core is reduced by the skipped cores. If
*     first_core cannot be found on the system (because it is too high), no
*     binding is done.
*
*     Once the first core is chosen, each following core is 'stepsize' cores
*     further on. If a socket does not provide that core (because the
*     previous one was the last core of the socket, for example), the next
*     socket is examined.
*
*     If the system runs out of cores while more remain to be selected
*     (according to the number_of_cores parameter), no core binding is
*     performed.
*    
*  INPUTS
*     int first_socket    - first socket to begin with  
*     int first_core      - first core to start with  
*     int number_of_cores - total number of cores to be used
*     int offset          - core offset for first core (increments first core used) 
*     int stepsize        - step size
*     binding_type_t type - type of binding (set or env or pe)
*
*  RESULT
*     bool - Returns true if the binding was performed, otherwise false.
*
*  NOTES
*     MT-NOTE: binding_set_striding() is MT safe 
*
*******************************************************************************/
static bool
binding_set_striding(int first_socket, int first_core, int number_of_cores,
                          int offset, int stepsize, const binding_type_t type)
{
   /* n := take every n-th core */ 
   bool bound = false;

#if HAVE_HWLOC
   dstring error = DSTRING_INIT;

   if (has_core_binding() == true) {

      sge_dstring_free(&error);

         /* bitmask for processors to turn on and off */
         hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();

         /* when the library offers topology information: 
            - get virtual processor ids in the following manner:
              * on socket "first_socket" choose core number "first_core + offset"
              * then add n: if core is not available go to next socket
              * ...
         */
         if (has_topology_information()) {
            /* number of cores set in processor binding mask */
            int cores_set = 0;
            /* next socket to use */
            int next_socket = first_socket;
            /* next core to use */
            int next_core = first_core + offset;
            /* maximal number of sockets on this system */
            int max_number_of_sockets = get_number_of_sockets();
            hwloc_obj_t core;
            
            /* check if we are already out of range */
            if (next_socket >= max_number_of_sockets) {
               shepherd_trace("binding_set_striding: already out of sockets");
               hwloc_bitmap_free(cpuset);
               return false;
            }   

            while (get_number_of_cores(next_socket) <= next_core) {
               /* move on to next socket - could be that we have to deal only with cores 
                  instead of <socket><core> tuples */
               next_core -= get_number_of_cores(next_socket);
               next_socket++;
               if (next_socket >= max_number_of_sockets) {
                  /* we are out of sockets - we do nothing */
                  shepherd_trace("binding_set_striding: first core: out of sockets");
                  hwloc_bitmap_free(cpuset);
                  return false;
               }
            }  
            core = hwloc_get_obj_below_by_type(sge_hwloc_topology,
                                               HWLOC_OBJ_SOCKET, next_socket,
                                               HWLOC_OBJ_CORE, next_core);
            hwloc_bitmap_or(cpuset, cpuset, core->cpuset);
            
            /* collect the rest of the processor ids */ 
            for (cores_set = 1; cores_set < number_of_cores; cores_set++) {
               /* calculate next_core number */ 
               next_core += stepsize;
               
               /* check if we are already out of range */
               if (next_socket >= max_number_of_sockets) {
                  shepherd_trace("binding_set_striding: out of sockets");
                  hwloc_bitmap_free(cpuset);
                  return false;
               }   

               while (get_number_of_cores(next_socket) <= next_core) {
                  /* move on to next socket - could be that we have to deal only with cores 
                     instead of <socket><core> tuples */
                  next_core -= get_number_of_cores(next_socket);
                  next_socket++;
                  if (next_socket >= max_number_of_sockets) {
                     /* we are out of sockets - we do nothing */
                     shepherd_trace("binding_set_striding: out of sockets!");
                     hwloc_bitmap_free(cpuset);
                     return false;
                  }
               }
               /* add the selected core's processors to the mask */
               core = hwloc_get_obj_below_by_type(sge_hwloc_topology,
                                                  HWLOC_OBJ_SOCKET,
                                                  next_socket,
                                                  HWLOC_OBJ_CORE,
                                                  next_core);
               hwloc_bitmap_or(cpuset, cpuset, core->cpuset);
                
            } /* collecting processor ids */
           
            if (type == BINDING_TYPE_PE) {
            
               /* rankfile is created: do nothing */

            } else if (type == BINDING_TYPE_ENV) {

               /* set the environment variable */
               if (create_binding_env(cpuset) == true) {
                  shepherd_trace("binding_set_striding: SGE_BINDING env var created");
               } else {
                  shepherd_trace("binding_set_striding: problems while creating SGE_BINDING env");
               }

            } else {
               
               /* bind process to mask */ 
               if (bind_process_to_mask(cpuset) == true) {
                  /* binding was successful */
                  bound = true;
               }
            }
         
            hwloc_bitmap_free(cpuset);
            
         } else {
            /* without topology information a correct bitmask cannot
               be constructed */
            shepherd_trace("binding_set_striding: bitmask without topology information");
            return false;
         }

   } else {
      /* has no core binding feature */
      sge_dstring_free(&error);
      
      return false;
   }
   
#endif  /* HAVE_HWLOC */
   return bound;
}
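/* Standalone sketch (hypothetical, not in the original source): how the
   striding walk above maps a step size over a flat core numbering onto
   <socket,core> tuples, assuming a uniform topology of cores_per_socket
   cores per socket.  printf() assumes <stdio.h>. */
static void striding_demo(int first_socket, int first_core, int offset,
                          int stepsize, int number_of_cores,
                          int nsockets, int cores_per_socket)
{
   int flat = first_socket * cores_per_socket + first_core + offset;
   int n;

   for (n = 0; n < number_of_cores; n++, flat += stepsize) {
      if (flat >= nsockets * cores_per_socket) {
         printf("out of cores - no binding done\n");
         return;
      }
      printf("bind socket %d core %d\n",
             flat / cores_per_socket, flat % cores_per_socket);
   }
}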
/****** shepherd_binding/binding_set_linear() ***************************************
*  NAME
*     binding_set_linear() -- Bind current process linearly to a chunk of cores.
*
*  SYNOPSIS
*     bool binding_set_linear(int first_socket, int first_core,
*     int number_of_cores, int offset, const binding_type_t type)
*
*  FUNCTION
*     Binds current process (shepherd) to a set of cores. All processes 
*     started by the current process inherit the core binding.
*     
*     The core binding is done in a linear manner: the process is bound
*     to 'number_of_cores' cores, one core after another, starting at
*     socket 'first_socket' (usually 0) and core 'first_core' (usually 0)
*     + 'offset'. If the core number is higher than the number of cores
*     provided by socket 'first_socket', the next socket is taken (the
*     core number defines how many cores are skipped).
*
*  INPUTS
*     int first_socket    - The first socket (starting at 0) to bind to. 
*     int first_core      - The first core to bind. 
*     int number_of_cores - The number of cores to bind to.
*     int offset          - The user specified core number offset. 
*     binding_type_t type - The type of binding ONLY FOR EXECD ( set | env | pe )
*                           
*  RESULT
*     bool - true if binding for current process was done, false if not
*
*  NOTES
*     MT-NOTE: binding_set_linear() is not MT safe 
*
*******************************************************************************/
static bool binding_set_linear(int first_socket, int first_core,
               int number_of_cores, int offset, const binding_type_t type)
{

#if HAVE_HWLOC
   /* sets bitmask in a linear manner        */ 
   /* first core is on exclusive host 0      */ 
   /* first core could be set from scheduler */ 
   /* offset is the first core to start with (makes sense only with
      exclusive host) */
   dstring error = DSTRING_INIT;

   if (has_core_binding() == true) {
      /* bitmask for processors to turn on and off */
      hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
         
      if (has_topology_information()) {
         /* number of cores set in processor binding mask */
         int cores_set;
         /* next socket to use */
         int next_socket = first_socket;
         /* the number of cores of the next socket */
         int socket_number_of_cores;
         /* next core to use */
         int next_core = first_core + offset;
         /* maximal number of sockets on this system */
         int max_number_of_sockets = get_number_of_sockets();
         hwloc_obj_t this_core;

         /* strategy: go to the first_socket and the first_core + offset and 
            fill up socket and go to the next one. */ 
               
         /* TODO maybe better to search for using a core exclusively? */
            
         while (get_number_of_cores(next_socket) <= next_core) {
            /* TODO which kind of warning when first socket does not
               offer this? */
            /* move on to next socket - could be that we have to deal
               only with cores instead of <socket><core> tuples */
            next_core -= get_number_of_cores(next_socket);
            next_socket++;
            if (next_socket >= max_number_of_sockets) {
               /* we are out of sockets - we do nothing */
               hwloc_bitmap_free(cpuset);
               return false;
            }
         }  
         this_core =
            hwloc_get_obj_below_by_type(sge_hwloc_topology,
                                        HWLOC_OBJ_SOCKET, next_socket,
                                        HWLOC_OBJ_CORE, next_core);
         hwloc_bitmap_or(cpuset, cpuset, this_core->cpuset);

         /* collect the other processor ids with the strategy */
         for (cores_set = 1; cores_set < number_of_cores; cores_set++) {
            next_core++;
            /* jump to next socket when it is needed */
            /* maybe the next socket could offer 0 cores (I can't see when, 
               but just to be sure) */
            while ((socket_number_of_cores = get_number_of_cores(next_socket))
                        <= next_core) {
               next_socket++;
               next_core = next_core - socket_number_of_cores;
               if (next_socket >= max_number_of_sockets) {
                  /* we are out of sockets - we do nothing */
                  hwloc_bitmap_free(cpuset);
                  return false;
               }
            }
            this_core =
               hwloc_get_obj_below_by_type(sge_hwloc_topology,
                                           HWLOC_OBJ_SOCKET, next_socket,
                                           HWLOC_OBJ_CORE, next_core);
            hwloc_bitmap_or(cpuset, cpuset, this_core->cpuset);
         }

         /* check what to do with the processor ids (set, env or pe) */
         if (type == BINDING_TYPE_PE) {
               
            /* is done outside */

         } else if (type == BINDING_TYPE_ENV) {
               
            /* set the environment variable                    */
            /* this does not show up in "environment" file !!! */
            if (create_binding_env(cpuset) == true) {
               shepherd_trace("binding_set_linear: SGE_BINDING env var created");
            } else {
               shepherd_trace("binding_set_linear: problems while creating SGE_BINDING env");
            }
             
         } else {

             /* bind SET process to mask */ 
            if (bind_process_to_mask(cpuset) == false) {
               /* there was an error while binding */ 
               hwloc_bitmap_free(cpuset);
               return false;
            }
         }

         hwloc_bitmap_free(cpuset);

      } else {
            
         /* TODO DG strategy without topology information but with 
            working library? */
         shepherd_trace("binding_set_linear: no information about topology");
         return false;
      }
         

   } else {

      shepherd_trace("binding_set_linear: binding not supported: %s",
                     sge_dstring_get_string(&error));

      sge_dstring_free(&error);
   }
#endif  /* HAVE_HWLOC */
   return true;
}
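/* Sketch (hypothetical): the linear strategy is the striding walk with a
   step size of one -- fill cores consecutively from
   <first_socket, first_core + offset> and roll over to the next socket when
   the current one is exhausted.  Uniform topology assumed; printf() assumes
   <stdio.h>. */
static void linear_demo(int first_socket, int first_core, int offset,
                        int number_of_cores, int nsockets,
                        int cores_per_socket)
{
   int flat = first_socket * cores_per_socket + first_core + offset;
   int n;

   for (n = 0; n < number_of_cores && flat < nsockets * cores_per_socket;
        n++, flat++)
      printf("bind socket %d core %d\n",
             flat / cores_per_socket, flat % cores_per_socket);
}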