Example #1
    std::size_t hwloc_topology::extract_node_count(
        hwloc_obj_t parent
      , hwloc_obj_type_t type
      , std::size_t count
        ) const
    { // {{{
        hwloc_obj_t obj;

        {
            scoped_lock lk(topo_mtx);
            obj = hwloc_get_next_child(topo, parent, NULL);
        }

        while (obj)
        {
            if (hwloc_compare_types(type, obj->type) == 0)
            {
                do {
                    ++count;
                    {
                        scoped_lock lk(topo_mtx);
                        obj = hwloc_get_next_child(topo, parent, obj);
                    }
                } while (obj != NULL && hwloc_compare_types(type, obj->type) == 0);
                return count;
            }

            count = extract_node_count(obj, type, count);

            scoped_lock lk(topo_mtx);
            obj = hwloc_get_next_child(topo, parent, obj);
        }

        return count;
    } // }}}
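A note on the API this example relies on: hwloc_compare_types() orders two hwloc_obj_type_t values by the depth at which they usually appear in the topology tree and returns 0 only when the types are equal, which is why it serves as a type-equality test in this and the following examples. For reference, a minimal standalone sketch of the same counting idea, without the HPX wrapper class and its topo_mtx locking, could look like this (hypothetical helper name, public hwloc API only; unlike Example #1 it simply counts every matching descendant instead of stopping at the first matching level):

#include <hwloc.h>
#include <stdio.h>

/* Recursively count descendants of `parent` whose type matches `type`. */
static size_t count_type_below(hwloc_topology_t topo, hwloc_obj_t parent,
                               hwloc_obj_type_t type)
{
    size_t count = 0;
    hwloc_obj_t child = hwloc_get_next_child(topo, parent, NULL);
    while (child) {
        if (hwloc_compare_types(type, child->type) == 0)
            ++count;                                   /* exact type match */
        count += count_type_below(topo, child, type);  /* descend */
        child = hwloc_get_next_child(topo, parent, child);
    }
    return count;
}

int main(void)
{
    hwloc_topology_t topo;
    hwloc_topology_init(&topo);
    hwloc_topology_load(topo);
    printf("cores: %zu\n",
           count_type_below(topo, hwloc_get_root_obj(topo), HWLOC_OBJ_CORE));
    hwloc_topology_destroy(topo);
    return 0;
}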
Example #2
    void hwloc_topology::extract_node_mask(
        hwloc_obj_t parent
      , mask_type& mask
        ) const
    { // {{{
        hwloc_obj_t obj;

        {
            scoped_lock lk(topo_mtx);
            obj = hwloc_get_next_child(topo, parent, NULL);
        }

        while (obj)
        {
            if (hwloc_compare_types(HWLOC_OBJ_PU, obj->type) == 0)
            {
                do {
                    mask |= (static_cast<mask_type>(1) << obj->os_index);
                    {
                        scoped_lock lk(topo_mtx);
                        obj = hwloc_get_next_child(topo, parent, obj);
                    }
                } while (obj != NULL &&
                         hwloc_compare_types(HWLOC_OBJ_PU, obj->type) == 0);
                return;
            }

            extract_node_mask(obj, mask);

            scoped_lock lk(topo_mtx);
            obj = hwloc_get_next_child(topo, parent, obj);
        }
    } // }}}
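One caveat in Example #2: shifting into a fixed-width mask_type silently drops any PU whose os_index is not smaller than the bit width of the mask. hwloc's dynamically sized bitmaps avoid that limit; a hedged equivalent using the public bitmap API (the helper name is illustrative):

#include <hwloc.h>

/* Set one bit per PU found below `parent`, using a growable hwloc bitmap. */
static void pu_mask_below(hwloc_topology_t topo, hwloc_obj_t parent,
                          hwloc_bitmap_t mask)
{
    hwloc_obj_t child = hwloc_get_next_child(topo, parent, NULL);
    while (child) {
        if (hwloc_compare_types(HWLOC_OBJ_PU, child->type) == 0)
            hwloc_bitmap_set(mask, child->os_index);   /* no width limit */
        pu_mask_below(topo, child, mask);              /* descend */
        child = hwloc_get_next_child(topo, parent, child);
    }
}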
Example #3
#include <hwloc.h>
#include <stdio.h>
#include <assert.h>

int main(void)
{
  hwloc_topology_t topology;
  hwloc_obj_t obj;

  hwloc_topology_init(&topology);
  hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL);
  hwloc_topology_load(topology);

  printf("Found %d bridges\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_BRIDGE));
  obj = NULL;
  while ((obj = hwloc_get_next_bridge(topology, obj)) != NULL) {
    assert(obj->type == HWLOC_OBJ_BRIDGE);
    /* only host->pci and pci->pci bridge supported so far */
    if (obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST) {
      assert(obj->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI);
      printf(" Found host->PCI bridge for domain %04x bus %02x-%02x\n",
	     obj->attr->bridge.downstream.pci.domain,
	     obj->attr->bridge.downstream.pci.secondary_bus,
	     obj->attr->bridge.downstream.pci.subordinate_bus);
    } else {
      assert(obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
      assert(obj->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI);
      printf(" Found PCI->PCI bridge [%04x:%04x] for domain %04x bus %02x-%02x\n",
	     obj->attr->bridge.upstream.pci.vendor_id,
	     obj->attr->bridge.upstream.pci.device_id,
	     obj->attr->bridge.downstream.pci.domain,
	     obj->attr->bridge.downstream.pci.secondary_bus,
	     obj->attr->bridge.downstream.pci.subordinate_bus);
    }
  }

  printf("Found %d PCI devices\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PCI_DEVICE));
  obj = NULL;
  while ((obj = hwloc_get_next_pcidev(topology, obj)) != NULL) {
    assert(obj->type == HWLOC_OBJ_PCI_DEVICE);
    printf(" Found PCI device class %04x vendor %04x model %04x\n",
	   obj->attr->pcidev.class_id, obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id);
  }

  printf("Found %d OS devices\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_OS_DEVICE));
  obj = NULL;
  while ((obj = hwloc_get_next_osdev(topology, obj)) != NULL) {
    assert(obj->type == HWLOC_OBJ_OS_DEVICE);
    printf(" Found OS device %s subtype %d\n", obj->name, obj->attr->osdev.type);
  }

  assert(HWLOC_TYPE_DEPTH_BRIDGE == hwloc_get_type_depth(topology, HWLOC_OBJ_BRIDGE));
  assert(HWLOC_TYPE_DEPTH_PCI_DEVICE == hwloc_get_type_depth(topology, HWLOC_OBJ_PCI_DEVICE));
  assert(HWLOC_TYPE_DEPTH_OS_DEVICE == hwloc_get_type_depth(topology, HWLOC_OBJ_OS_DEVICE));
  assert(hwloc_compare_types(HWLOC_OBJ_BRIDGE, HWLOC_OBJ_PCI_DEVICE) < 0);
  assert(hwloc_compare_types(HWLOC_OBJ_BRIDGE, HWLOC_OBJ_OS_DEVICE) < 0);
  assert(hwloc_compare_types(HWLOC_OBJ_PCI_DEVICE, HWLOC_OBJ_OS_DEVICE) < 0);

  hwloc_topology_destroy(topology);

  return 0;
}
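Example #3 above targets the hwloc 2.x API, where I/O discovery is enabled through type filters (hwloc_topology_set_io_types_filter()); Example #15 below is the hwloc 1.x counterpart using the HWLOC_TOPOLOGY_FLAG_WHOLE_IO topology flag. Code that must build against both generations can branch on HWLOC_API_VERSION, e.g.:

#if HWLOC_API_VERSION >= 0x00020000
  /* hwloc 2.x: I/O discovery is controlled through type filters */
  hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL);
#else
  /* hwloc 1.x: I/O discovery is controlled through a topology flag */
  hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_WHOLE_IO);
#endif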
Example #4
    std::size_t hwloc_topology::init_node_number(
        std::size_t num_thread, hwloc_obj_type_t type
        )
    { // {{{
        if (std::size_t(-1) == num_thread)
            return std::size_t(-1);

        std::size_t num_pu = num_thread % num_of_pus_;

        {
            hwloc_obj_t obj;

            {
                scoped_lock lk(topo_mtx);
                obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU,
                    static_cast<unsigned>(num_pu));
            }

            while (obj)
            {
                if (hwloc_compare_types(obj->type, type) == 0)
                {
                    // logical_index can be invalid (-1) on some platforms
                    // (e.g. Windows); fall back to os_index in that case
                    if (obj->logical_index == ~0x0u)
                        return static_cast<std::size_t>(obj->os_index);

                    return static_cast<std::size_t>(obj->logical_index);
                }
                obj = obj->parent;
            }
        }
        return 0;
    } // }}}
Example #5
/*
 * Distribute cpus to the task using block distribution
 */
static int _task_cgroup_cpuset_dist_block(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, uint32_t nobj,
	slurmd_job_t *job, int bind_verbose, hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t i, pfirst, plast;
	uint32_t taskid = job->envtp->localid;
	int hwdepth;

	if (bind_verbose)
		info("task/cgroup: task[%u] using block distribution, "
		     "task_dist %u", taskid, job->task_dist);
	if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		pfirst = taskid * job->cpus_per_task;
		plast = pfirst + job->cpus_per_task - 1;
	} else {
		/* sockets or ldoms granularity */
		pfirst = taskid;
		plast = pfirst;
	}
	hwdepth = hwloc_get_type_depth(topology,hwtype);
	for (i = pfirst; i <= plast && i < nobj ; i++) {
		obj = hwloc_get_obj_by_depth(topology, hwdepth, (int)i);
		_add_cpuset(hwtype, req_hwtype, obj, taskid, bind_verbose,
			    cpuset);
	}
	return XCGROUP_SUCCESS;
}
Example #6
static void _add_hwloc_cpuset(
	hwloc_obj_type_t hwtype, hwloc_obj_type_t req_hwtype,
	hwloc_obj_t obj, uint32_t taskid,  int bind_verbose,
	hwloc_bitmap_t cpuset)
{
	struct hwloc_obj *pobj;

	/* if requested binding overlaps the granularity */
	/* use the ancestor cpuset instead of the object one */
	if (hwloc_compare_types(hwtype, req_hwtype) > 0) {

		/* Get the parent object of req_hwtype or the */
		/* one just above if not found (meaning of >0)*/
		/* (useful for ldoms binding with !NUMA nodes)*/
		pobj = obj->parent;
		while (pobj != NULL &&
			hwloc_compare_types(pobj->type, req_hwtype) > 0)
			pobj = pobj->parent;

		if (pobj != NULL) {
			if (bind_verbose)
				info("task/cgroup: task[%u] higher level %s "
				     "found", taskid,
				     hwloc_obj_type_string(pobj->type));
			hwloc_bitmap_or(cpuset, cpuset, pobj->allowed_cpuset);
		} else {
			/* should not be executed */
			if (bind_verbose)
				info("task/cgroup: task[%u] no higher level "
				     "found", taskid);
			hwloc_bitmap_or(cpuset, cpuset, obj->allowed_cpuset);
		}

	} else {
		hwloc_bitmap_or(cpuset, cpuset, obj->allowed_cpuset);
	}
}
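For the exact-type case, hwloc already ships a helper performing the same upward walk; a one-line sketch (not the Slurm code, and note it needs the topology handle, which _add_hwloc_cpuset() does not receive):

/* Closest ancestor of `obj` whose type is exactly `req_hwtype`, or NULL. */
hwloc_obj_t pobj = hwloc_get_ancestor_obj_by_type(topology, req_hwtype, obj);

The manual loop is still needed here because its `> 0` comparison also accepts the nearest ancestor just above req_hwtype when no object of that exact type exists (the ldom binding on non-NUMA nodes mentioned in the comments).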
Example #7
static int rmaps_lama_hwloc_compare_subtrees(hwloc_obj_t left, hwloc_obj_t right)
{
    int i, ret;

    /*
     * Check Types
     */
    if( 0 != (ret = hwloc_compare_types(left->type, right->type)) ) {
        return ret;
    }

    /*
     * Check 'arity' at this level
     */
    if( left->arity > right->arity ) {
        return -1;
    }
    else if( left->arity < right->arity ) {
        return 1;
    }

    /*
     * Check all subtrees
     */
    for(i = 0; i < (int)left->arity; ++i ) {
        if( 0 != (ret = rmaps_lama_hwloc_compare_subtrees(left->children[i],
                                                          right->children[i])) ) {
            return ret;
        }
    }

    /*
     * Subtree is the same if we get here
     */
    return 0;
}
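Because it returns a qsort()-style negative/zero/positive result, this subtree comparator composes directly with the C standard library sort; a hypothetical adapter (not part of Open MPI) might look like:

#include <stdlib.h>

/* qsort() adapter: the array elements are hwloc_obj_t pointers. */
static int subtree_cmp(const void *a, const void *b)
{
    hwloc_obj_t left  = *(const hwloc_obj_t *) a;
    hwloc_obj_t right = *(const hwloc_obj_t *) b;
    return rmaps_lama_hwloc_compare_subtrees(left, right);
}

/* Sort a *copy* of a parent's children array by subtree shape; hwloc's
 * internal children array itself should not be reordered in place: */
/* qsort(copy, parent->arity, sizeof(hwloc_obj_t), subtree_cmp); */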
Example #8
/* affinity should be set using sched_setaffinity to not force */
/* user to have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(slurmd_job_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	uint32_t i;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;

	uint32_t pfirst,plast;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus = jntasks * job->cpus_per_task;
	pid_t    pid = job->envtp->task_pid;

	cpu_bind_type_t bind_type;
	int verbose = 0;

	hwloc_topology_t topology;
#if HWLOC_API_VERSION <= 0x00010000
	hwloc_cpuset_t cpuset,ct;
#else
	hwloc_bitmap_t cpuset,ct;
#endif
	hwloc_obj_t obj;
	struct hwloc_obj *pobj;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;
	int hwdepth;

	size_t tssize;
	cpu_set_t ts;

	bind_type = job->cpu_bind_type;
	if (conf->task_plugin_param & CPU_BIND_VERBOSE ||
	    bind_type & CPU_BIND_VERBOSE)
		verbose = 1;

	if (bind_type & CPU_BIND_NONE) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding",taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding",taskid);
		req_hwtype = HWLOC_OBJ_SOCKET;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding",taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);
#if HWLOC_API_VERSION <= 0x00010000
	cpuset = hwloc_cpuset_alloc();
#else
	cpuset = hwloc_bitmap_alloc();
#endif

	/*
	 * Perform the topology detection. It will only get allowed PUs.
	 * At the same time, detect the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available: with hyperthreading support, the
	 * ntasks-per-core parameter can give each task access to more
	 * threads per core.
	 * Revert to machine granularity if no finer-grained granularity
	 * matching the request is found. This will result in no affinity
	 * being applied.
	 * The detected granularity will be used to find where to best place
	 * the task, then the cpu_bind option will be used to relax the
	 * affinity constraint and use more PUs (i.e. use a core granularity
	 * to dispatch the tasks across the sockets and then give each task
	 * access to the cores of its socket).
	 */
	hwloc_topology_load(topology);
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_NODE);
	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks &&
	     bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = HWLOC_OBJ_SOCKET;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources). So there is no guarantee that each ldom will be populated
	 * with usable sockets. So add a simple check that at least ensure that
	 * we have as many sockets as ldoms before moving to ldoms granularity
	 */
	if (nldoms >= jntasks &&
	     nsockets >= nldoms &&
	     bind_type & CPU_BIND_TO_LDOMS) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * Perform a block binding on the detected object respecting the
	 * granularity.
	 * If not enough objects to do the job, revert to no affinity mode
	 */
	if (hwloc_compare_types(hwtype,HWLOC_OBJ_MACHINE) == 0) {

		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity",taskid,hwloc_obj_type_string(hwtype));

	} else if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0 &&
		    jnpus > nobj) {

		info("task/cgroup: task[%u] not enough %s objects, disabling "
		     "affinity",taskid,hwloc_obj_type_string(hwtype));

	} else {

		if (verbose) {
			info("task/cgroup: task[%u] using %s granularity",
			     taskid,hwloc_obj_type_string(hwtype));
		}
		if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) {
			/* cores or threads granularity */
			pfirst = taskid * job->cpus_per_task;
			plast = pfirst + job->cpus_per_task - 1;
		} else {
			/* sockets or ldoms granularity */
			pfirst = taskid;
			plast = pfirst;
		}

		hwdepth = hwloc_get_type_depth(topology,hwtype);
		for (i = pfirst; i <= plast && i < nobj ; i++) {
			obj = hwloc_get_obj_by_depth(topology,hwdepth,(int)i);

			/* if requested binding overlap the granularity */
			/* use the ancestor cpuset instead of the object one */
			if (hwloc_compare_types(hwtype,req_hwtype) > 0) {

				/* Get the parent object of req_hwtype or the */
				/* one just above if not found (meaning of >0)*/
				/* (useful for ldoms binding with !NUMA nodes)*/
				pobj = obj->parent;
				while (pobj != NULL &&
					hwloc_compare_types(pobj->type,
							    req_hwtype) > 0)
					pobj = pobj->parent;

				if (pobj != NULL) {
					if (verbose)
						info("task/cgroup: task[%u] "
						     "higher level %s found",
						     taskid,
						     hwloc_obj_type_string(
							     pobj->type));
#if HWLOC_API_VERSION <= 0x00010000
					ct = hwloc_cpuset_dup(pobj->
							      allowed_cpuset);
					hwloc_cpuset_or(cpuset,cpuset,ct);
					hwloc_cpuset_free(ct);
#else
					ct = hwloc_bitmap_dup(pobj->
							      allowed_cpuset);
					hwloc_bitmap_or(cpuset,cpuset,ct);
					hwloc_bitmap_free(ct);
#endif
				} else {
					/* should not be executed */
					if (verbose)
						info("task/cgroup: task[%u] "
						     "no higher level found",
						     taskid);
#if HWLOC_API_VERSION <= 0x00010000
					ct = hwloc_cpuset_dup(obj->
							      allowed_cpuset);
					hwloc_cpuset_or(cpuset,cpuset,ct);
					hwloc_cpuset_free(ct);
#else
					ct = hwloc_bitmap_dup(obj->
							      allowed_cpuset);
					hwloc_bitmap_or(cpuset,cpuset,ct);
					hwloc_bitmap_free(ct);
#endif
				}

			} else {
#if HWLOC_API_VERSION <= 0x00010000
				ct = hwloc_cpuset_dup(obj->allowed_cpuset);
				hwloc_cpuset_or(cpuset,cpuset,ct);
				hwloc_cpuset_free(ct);
#else
				ct = hwloc_bitmap_dup(obj->allowed_cpuset);
				hwloc_bitmap_or(cpuset,cpuset,ct);
				hwloc_bitmap_free(ct);
#endif
			}
		}

		char *str;
#if HWLOC_API_VERSION <= 0x00010000
		hwloc_cpuset_asprintf(&str,cpuset);
#else
		hwloc_bitmap_asprintf(&str,cpuset);
#endif
		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology,cpuset,
							  &ts,tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if (sched_setaffinity(pid,tssize,&ts)) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'",taskid,str);
				fstatus = SLURM_ERROR;
			} else if (verbose) {
				info("task/cgroup: task[%u] taskset '%s' is set"
				     ,taskid,str);
			}
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'",taskid,str);
			fstatus = SLURM_ERROR;
		}
		free(str);

	}

	/* Destroy hwloc objects */
#if HWLOC_API_VERSION <= 0x00010000
	hwloc_cpuset_free(cpuset);
#else
	hwloc_bitmap_free(cpuset);
#endif
	hwloc_topology_destroy(topology);

	return fstatus;
#endif

}
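The #if HWLOC_API_VERSION <= 0x00010000 blocks in Example #8 exist because hwloc 1.1 renamed the hwloc_cpuset_* bitmap functions to hwloc_bitmap_*. A small compatibility shim (a sketch with invented compat_* names, not Slurm code) would keep such call sites uniform:

#include <hwloc.h>

#if HWLOC_API_VERSION <= 0x00010000
/* pre-1.1 spelling of the bitmap API */
typedef hwloc_cpuset_t compat_bitmap_t;
#  define compat_bitmap_alloc hwloc_cpuset_alloc
#  define compat_bitmap_or    hwloc_cpuset_or
#  define compat_bitmap_free  hwloc_cpuset_free
#else
typedef hwloc_bitmap_t compat_bitmap_t;
#  define compat_bitmap_alloc hwloc_bitmap_alloc
#  define compat_bitmap_or    hwloc_bitmap_or
#  define compat_bitmap_free  hwloc_bitmap_free
#endif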
Example #9
/* affinity should be set using sched_setaffinity to not force */
/* user to have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(slurmd_job_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	hwloc_obj_type_t socket_or_node;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus = jntasks * job->cpus_per_task;
	pid_t    pid = job->envtp->task_pid;

	cpu_bind_type_t bind_type;
	int bind_verbose = 0;

	hwloc_topology_t topology;
	hwloc_bitmap_t cpuset;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;

	size_t tssize;
	cpu_set_t ts;

	bind_type = job->cpu_bind_type;
	if (conf->task_plugin_param & CPU_BIND_VERBOSE ||
	    bind_type & CPU_BIND_VERBOSE)
		bind_verbose = 1;

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);

	cpuset = hwloc_bitmap_alloc();

	hwloc_topology_load(topology);
	if ( hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
	     hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET) ) {
		/* One socket contains multiple NUMA-nodes
		 * like AMD Opteron 6000 series etc.
		 * In such case, use NUMA-node instead of socket. */
		socket_or_node = HWLOC_OBJ_NODE;
	} else {
		socket_or_node = HWLOC_OBJ_SOCKET;
	}

	if (bind_type & CPU_BIND_NONE) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		/* free the hwloc objects allocated above before returning */
		hwloc_bitmap_free(cpuset);
		hwloc_topology_destroy(topology);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding",taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding",taskid);
		req_hwtype = socket_or_node;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding",taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (bind_verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/*
	 * Perform the topology detection. It will only get allowed PUs.
	 * At the same time, detect the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available: with hyperthreading support, the
	 * ntasks-per-core parameter can give each task access to more
	 * threads per core.
	 * Revert to machine granularity if no finer-grained granularity
	 * matching the request is found. This will result in no affinity
	 * being applied.
	 * The detected granularity will be used to find where to best place
	 * the task, then the cpu_bind option will be used to relax the
	 * affinity constraint and use more PUs (i.e. use a core granularity
	 * to dispatch the tasks across the sockets and then give each task
	 * access to the cores of its socket).
	 */
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       socket_or_node);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_NODE);

	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if (npus >= jnpus || bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks &&
	    bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = socket_or_node;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources). So there is no guarantee that each ldom will be populated
	 * with usable sockets. So add a simple check that at least ensure that
	 * we have as many sockets as ldoms before moving to ldoms granularity
	 */
	if (nldoms >= jntasks &&
	    nsockets >= nldoms &&
	    bind_type & CPU_BIND_TO_LDOMS) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * Bind the detected object to the taskid, respecting the
	 * granularity, using the designated or default distribution
	 * method (block or cyclic).
	 * If not enough objects to do the job, revert to no affinity mode
	 */
	if (hwloc_compare_types(hwtype,HWLOC_OBJ_MACHINE) == 0) {

		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity",taskid,hwloc_obj_type_string(hwtype));

	} else if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0 &&
		   jnpus > nobj) {

		info("task/cgroup: task[%u] not enough %s objects, disabling "
		     "affinity",taskid,hwloc_obj_type_string(hwtype));

	} else {
		char *str;

		if (bind_verbose) {
			info("task/cgroup: task[%u] using %s granularity",
			     taskid,hwloc_obj_type_string(hwtype));
		}

		/* There are two "distributions,"  controlled by the
		 * -m option of srun and friends. The first is the
		 * distribution of tasks to nodes.  The second is the
		 * distribution of allocated cpus to tasks for
		 * binding.  This code is handling the second
		 * distribution.  Here's how the values get set, based
		 * on the value of -m
		 *
		 * SLURM_DIST_CYCLIC = srun -m cyclic
		 * SLURM_DIST_BLOCK = srun -m block
		 * SLURM_DIST_CYCLIC_CYCLIC = srun -m cyclic:cyclic
		 * SLURM_DIST_BLOCK_CYCLIC = srun -m block:cyclic
		 *
		 * In the first two cases, the user only specified the
		 * first distribution.  The second distribution
		 * defaults to cyclic.  In the second two cases, the
		 * user explicitly requested a second distribution of
		 * cyclic.  So all these four cases correspond to a
		 * second distribution of cyclic.   So we want to call
		 * _task_cgroup_cpuset_dist_cyclic.
		 *
		 * If the user explicitly specifies a second
		 * distribution of block, or if
		 * CR_CORE_DEFAULT_DIST_BLOCK is configured and the
		 * user does not explicitly specify a second
		 * distribution of cyclic, the second distribution is
		 * block, and we need to call
		 * _task_cgroup_cpuset_dist_block. In these cases,
		 * task_dist would be set to SLURM_DIST_CYCLIC_BLOCK
		 * or SLURM_DIST_BLOCK_BLOCK.
		 *
		 * You can see the equivalent code for the
		 * task/affinity plugin in
		 * src/plugins/task/affinity/dist_tasks.c, around line 384.
		 */
		switch (job->task_dist) {
		case SLURM_DIST_CYCLIC:
		case SLURM_DIST_BLOCK:
		case SLURM_DIST_CYCLIC_CYCLIC:
		case SLURM_DIST_BLOCK_CYCLIC:
			_task_cgroup_cpuset_dist_cyclic(
				topology, hwtype, req_hwtype,
				job, bind_verbose, cpuset);
			break;
		default:
			_task_cgroup_cpuset_dist_block(
				topology, hwtype, req_hwtype,
				nobj, job, bind_verbose, cpuset);
		}

		hwloc_bitmap_asprintf(&str, cpuset);

		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology,cpuset,
							 &ts,tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if (sched_setaffinity(pid,tssize,&ts)) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'",taskid,str);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] taskset '%s' is set"
				     ,taskid,str);
			}
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'",taskid,str);
			fstatus = SLURM_ERROR;
		}
		free(str);
	}

	/* Destroy hwloc objects */
	hwloc_bitmap_free(cpuset);

	hwloc_topology_destroy(topology);

	return fstatus;
#endif

}
Example #10
/*
 * Distribute cpus to the task using cyclic distribution across sockets
 */
static int _task_cgroup_cpuset_dist_cyclic(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, slurmd_job_t *job, int bind_verbose,
	hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t *obj_idx;
	uint32_t i, sock_idx, npskip, npdist, nsockets;
	uint32_t taskid = job->envtp->localid;

	if (bind_verbose)
		info("task/cgroup: task[%u] using cyclic distribution, "
		     "task_dist %u", taskid, job->task_dist);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	obj_idx = xmalloc(nsockets * sizeof(uint32_t));

	if (hwloc_compare_types(hwtype,HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		npskip = taskid * job->cpus_per_task;
		npdist = job->cpus_per_task;
	} else {
		/* sockets or ldoms granularity */
		npskip = taskid;
		npdist = 1;
	}

	/* skip objs for lower taskids */
	i = 0;
	sock_idx = 0;
	while (i < npskip) {
		while ((sock_idx < nsockets) && (i < npskip)) {
			obj = hwloc_get_obj_below_by_type(
				topology, HWLOC_OBJ_SOCKET, sock_idx,
				hwtype, obj_idx[sock_idx]);
			if (obj != NULL) {
				obj_idx[sock_idx]++;
				i++;
			}
			sock_idx++;
		}
		if (i < npskip)
			sock_idx = 0;
	}

	/* distribute objs cyclically across sockets */
	i = npdist;
	while (i > 0) {
		while ((sock_idx < nsockets) && (i > 0)) {
			obj = hwloc_get_obj_below_by_type(
				topology, HWLOC_OBJ_SOCKET, sock_idx,
				hwtype, obj_idx[sock_idx]);
			if (obj != NULL) {
				obj_idx[sock_idx]++;
				_add_cpuset(hwtype, req_hwtype, obj, taskid,
					    bind_verbose, cpuset);
				i--;
			}
			sock_idx++;
		}
		sock_idx = 0;
	}
	xfree(obj_idx);
	return XCGROUP_SUCCESS;
}
Example #11
static int _task_cgroup_cpuset_dist_block(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, uint32_t nobj,
	stepd_step_rec_t *job, int bind_verbose, hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t core_loop, ntskip, npdist;
	uint32_t i, j, pfirst, plast;
	uint32_t taskid = job->envtp->localid;
	int hwdepth;
	uint32_t npus, ncores, nsockets;
	int spec_thread_cnt = 0;
	bitstr_t *spec_threads = NULL;

	uint32_t *thread_idx;
	uint32_t core_idx;
	bool core_fcyclic, core_block;

	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);

	core_block = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_COREBLOCK ? true : false;
	core_fcyclic = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_CORECFULL ? true : false;

	if (bind_verbose) {
		info("task/cgroup: task[%u] using block distribution, "
		     "task_dist 0x%x", taskid, job->task_dist);
	}

	if ((hwloc_compare_types(hwtype, HWLOC_OBJ_PU) == 0) && !core_block) {
		thread_idx = xmalloc(ncores * sizeof(uint32_t));
		ntskip = taskid;
		npdist = job->cpus_per_task;

		i = 0; j = 0;
		core_idx = 0;
		core_loop = 0;
		while (i < ntskip + 1 && core_loop < npdist + 1) {
			while ((core_idx < ncores) && (j < npdist)) {
				obj = hwloc_get_obj_below_by_type(
					topology, HWLOC_OBJ_CORE, core_idx,
					hwtype, thread_idx[core_idx]);
				if (obj != NULL) {
					thread_idx[core_idx]++;
					j++;
					if (i == ntskip)
						_add_hwloc_cpuset(hwtype,
							req_hwtype, obj, taskid,
							bind_verbose, cpuset);
					if ((j < npdist) && core_fcyclic)
						core_idx++;
				} else {
					core_idx++;
				}
			}
			if (j == npdist) {
				i++; j = 0;
				core_idx++; // no validity check, handled by the while
				core_loop = 0;
			} else {
				core_loop++;
				core_idx = 0;
			}
		}
		xfree(thread_idx);

		/* should never happen in normal scenario */
		if (core_loop > npdist) {
			error("task/cgroup: task[%u] infinite loop broken while "
			      "trying to provision compute elements using %s",
			      taskid, format_task_dist_states(job->task_dist));
			return XCGROUP_ERROR;
		} else
			return XCGROUP_SUCCESS;
	}

	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		pfirst = taskid * job->cpus_per_task;
		plast = pfirst + job->cpus_per_task - 1;
	} else {
		/* sockets or ldoms granularity */
		pfirst = taskid;
		plast = pfirst;
	}

	hwdepth = hwloc_get_type_depth(topology, hwtype);
	if ((job->job_core_spec != (uint16_t) NO_VAL) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)  &&
	    (nsockets != 0)) {
		/* Skip specialized threads as needed */
		int i, t, c, s;
		int cores = MAX(1, (ncores / nsockets));
		int threads = npus / cores;
		spec_thread_cnt = job->job_core_spec & (~CORE_SPEC_THREAD);
		spec_threads = bit_alloc(npus);
		for (t = threads - 1;
		     ((t >= 0) && (spec_thread_cnt > 0)); t--) {
			for (c = cores - 1;
			     ((c >= 0) && (spec_thread_cnt > 0)); c--) {
				for (s = nsockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
					i = s * cores + c;
					i = (i * threads) + t;
					bit_set(spec_threads, i);
					spec_thread_cnt--;
				}
			}
		}
		if (hwtype == HWLOC_OBJ_PU) {
			for (i = 0; i <= pfirst && i < npus; i++) {
				if (bit_test(spec_threads, i))
					pfirst++;
			};
		}
	}

	for (i = pfirst; i <= plast && i < nobj ; i++) {
		obj = hwloc_get_obj_by_depth(topology, hwdepth, (int)i);
		_add_hwloc_cpuset(hwtype, req_hwtype, obj, taskid,
			    bind_verbose, cpuset);
	}

	if (spec_threads) {
		for (i = 0; i < npus; i++) {
			if (bit_test(spec_threads, i)) {
				hwloc_bitmap_clr(cpuset, i);
			}
		};
		FREE_NULL_BITMAP(spec_threads);
	}

	return XCGROUP_SUCCESS;
}
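The specialized-thread loop in Example #11 reserves the highest-numbered hyperthread of the highest-numbered cores first, walking sockets from the top down. A tiny standalone check of that index arithmetic (hypothetical hardware counts, for illustration only):

#include <stdio.h>

/* 2 sockets x 2 cores x 2 threads (8 PUs), reserving 2 specialized threads. */
int main(void)
{
    int nsockets = 2, cores = 2, threads = 2, spec = 2;
    for (int t = threads - 1; t >= 0 && spec > 0; t--)
        for (int c = cores - 1; c >= 0 && spec > 0; c--)
            for (int s = nsockets - 1; s >= 0 && spec > 0; s--) {
                int i = (s * cores + c) * threads + t;
                printf("reserving PU %d\n", i);  /* prints 7, then 3 */
                spec--;
            }
    return 0;
}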
Example #12
static int _task_cgroup_cpuset_dist_cyclic(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, stepd_step_rec_t *job, int bind_verbose,
	hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t  s_ix;		/* socket index */
	uint32_t *c_ixc;	/* core index by socket (current taskid) */
	uint32_t *c_ixn;	/* core index by socket (next taskid) */
	uint32_t *t_ix;		/* thread index by core by socket */
	uint32_t npus, ncores, nsockets;
	uint32_t taskid = job->envtp->localid;
	int spec_thread_cnt = 0;
	bitstr_t *spec_threads = NULL;

	uint32_t obj_idxs[3], nthreads, cps,
		 tpc, i, j, sock_loop, ntskip, npdist;
	bool core_cyclic, core_fcyclic, sock_fcyclic;

	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_CORE);
	nthreads = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_PU);
	cps = ncores/nsockets;
	tpc = nthreads/ncores;

	sock_fcyclic = (job->task_dist & SLURM_DIST_SOCKMASK) ==
		SLURM_DIST_SOCKCFULL ? true : false;
	core_cyclic = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_CORECYCLIC ? true : false;
	core_fcyclic = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_CORECFULL ? true : false;

	if (bind_verbose) {
		info("task/cgroup: task[%u] using %s distribution "
		     "(task_dist=0x%x)", taskid,
		     format_task_dist_states(job->task_dist), job->task_dist);
	}

	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);

	t_ix = xmalloc(ncores * sizeof(uint32_t));
	c_ixc = xmalloc(nsockets * sizeof(uint32_t));
	c_ixn = xmalloc(nsockets * sizeof(uint32_t));

	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		ntskip = taskid;
		npdist = job->cpus_per_task;
	} else {
		/* sockets or ldoms granularity */
		ntskip = taskid;
		npdist = 1;
	}
	if ((job->job_core_spec != (uint16_t) NO_VAL) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)){
		/* Skip specialized threads as needed */
		int i, t, c, s;
		int cores = ncores / nsockets;
		int threads = npus / cores;
		spec_thread_cnt = job->job_core_spec & (~CORE_SPEC_THREAD);
		spec_threads = bit_alloc(npus);
		for (t = threads - 1;
		     ((t >= 0) && (spec_thread_cnt > 0)); t--) {
			for (c = cores - 1;
			     ((c >= 0) && (spec_thread_cnt > 0)); c--) {
				for (s = nsockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
					i = s * cores + c;
					i = (i * threads) + t;
					bit_set(spec_threads, i);
					spec_thread_cnt--;
				}
			}
		}
		if (hwtype == HWLOC_OBJ_PU) {
			for (i = 0; i <= ntskip && i < npus; i++) {
				if (bit_test(spec_threads, i))
					ntskip++;
			};
		}
	}

	/* skip objs for lower taskids, then add them to the
	   current task cpuset. To prevent infinite loop, check
	   that we do not loop more than npdist times around the available
	   sockets, which is the worst scenario we should afford here. */
	i = j = s_ix = sock_loop = 0;
	while (i < ntskip + 1 && (sock_loop/tpc) < npdist + 1) {
		/* fill one or multiple sockets using block mode, unless
		   otherwise stated in the job->task_dist field */
		while ((s_ix < nsockets) && (j < npdist)) {
			obj = hwloc_get_obj_below_by_type(
				topology, HWLOC_OBJ_SOCKET, s_ix,
				hwtype, c_ixc[s_ix]);
			if (obj != NULL) {
				if (hwloc_compare_types(hwtype, HWLOC_OBJ_PU)
									>= 0) {
					/* granularity is thread */
					obj_idxs[0]=s_ix;
					obj_idxs[1]=c_ixc[s_ix];
					obj_idxs[2]=t_ix[(s_ix*cps)+c_ixc[s_ix]];
					obj = hwloc_get_obj_below_array_by_type(
						topology, 3, obj_types, obj_idxs);
					if (obj != NULL) {
						t_ix[(s_ix*cps)+c_ixc[s_ix]]++;
						j++;
						if (i == ntskip)
							_add_hwloc_cpuset(hwtype,
							req_hwtype, obj, taskid,
							bind_verbose, cpuset);
						if (j < npdist) {
							if (core_cyclic) {
								c_ixn[s_ix] =
								c_ixc[s_ix] + 1;
							} else if (core_fcyclic){
								c_ixc[s_ix]++;
								c_ixn[s_ix] =
								c_ixc[s_ix];
							}
							if (sock_fcyclic)
								s_ix++;
						}
					} else {
						c_ixc[s_ix]++;
						if (c_ixc[s_ix] == cps)
							s_ix++;
					}
				} else {
					/* granularity is core or larger */
					c_ixc[s_ix]++;
					j++;
					if (i == ntskip)
						_add_hwloc_cpuset(hwtype,
							req_hwtype, obj, taskid,
						  	bind_verbose, cpuset);
					if ((j < npdist) && (sock_fcyclic))
						s_ix++;
				}
			} else
				s_ix++;
		}
		/* if it succeeds, switch to the next task, starting
		 * with the next available socket, otherwise, loop back
		 * from the first socket trying to find available slots. */
		if (j == npdist) {
			i++;
			j = 0;
			s_ix++; // no validity check, handled by the while
			sock_loop = 0;
		} else {
			sock_loop++;
			s_ix = 0;
		}
	}
	xfree(t_ix);
	xfree(c_ixc);
	xfree(c_ixn);

	if (spec_threads) {
		for (i = 0; i < npus; i++) {
			if (bit_test(spec_threads, i)) {
				hwloc_bitmap_clr(cpuset, i);
			}
		};
		FREE_NULL_BITMAP(spec_threads);
	}

	/* should never happen in normal scenario */
	if (sock_loop > npdist) {
		error("task/cgroup: task[%u] infinite loop broken while trying "
		      "to provision compute elements using %s", taskid,
		      format_task_dist_states(job->task_dist));
		return XCGROUP_ERROR;
	} else
		return XCGROUP_SUCCESS;
}
Example #13
/* affinity should be set using sched_setaffinity to not force */
/* user to have to play with the cgroup hierarchy to modify it */
extern int task_cgroup_cpuset_set_task_affinity(stepd_step_rec_t *job)
{
	int fstatus = SLURM_ERROR;

#ifndef HAVE_HWLOC

	error("task/cgroup: plugin not compiled with hwloc support, "
	      "skipping affinity.");
	return fstatus;

#else
	char mstr[1 + CPU_SETSIZE / 4];
	cpu_bind_type_t bind_type;
	cpu_set_t ts;
	hwloc_obj_t obj;
	hwloc_obj_type_t socket_or_node;
	hwloc_topology_t topology;
	hwloc_bitmap_t cpuset;
	hwloc_obj_type_t hwtype;
	hwloc_obj_type_t req_hwtype;
	int bind_verbose = 0;
	int rc = SLURM_SUCCESS, match;
	pid_t    pid = job->envtp->task_pid;
	size_t tssize;
	uint32_t nldoms;
	uint32_t nsockets;
	uint32_t ncores;
	uint32_t npus;
	uint32_t nobj;
	uint32_t taskid = job->envtp->localid;
	uint32_t jntasks = job->node_tasks;
	uint32_t jnpus;

	/* Allocate and initialize hwloc objects */
	hwloc_topology_init(&topology);
	hwloc_topology_load(topology);
	cpuset = hwloc_bitmap_alloc();

	int spec_threads = 0;

	if (job->batch) {
		jnpus = job->cpus;
		job->cpus_per_task = job->cpus;
	} else
		jnpus = jntasks * job->cpus_per_task;

	bind_type = job->cpu_bind_type;
	if ((conf->task_plugin_param & CPU_BIND_VERBOSE) ||
	    (bind_type & CPU_BIND_VERBOSE))
		bind_verbose = 1;

	if ( hwloc_get_type_depth(topology, HWLOC_OBJ_NODE) >
	     hwloc_get_type_depth(topology, HWLOC_OBJ_SOCKET) ) {
		/* One socket contains multiple NUMA-nodes
		 * like AMD Opteron 6000 series etc.
		 * In such case, use NUMA-node instead of socket. */
		socket_or_node = HWLOC_OBJ_NODE;
	} else {
		socket_or_node = HWLOC_OBJ_SOCKET;
	}

	if (bind_type & CPU_BIND_NONE) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting no affinity",
			     taskid);
		/* free the hwloc objects allocated above before returning */
		hwloc_bitmap_free(cpuset);
		hwloc_topology_destroy(topology);
		return 0;
	} else if (bind_type & CPU_BIND_TO_THREADS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "thread level binding",taskid);
		req_hwtype = HWLOC_OBJ_PU;
	} else if (bind_type & CPU_BIND_TO_CORES) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "core level binding",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	} else if (bind_type & CPU_BIND_TO_SOCKETS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "socket level binding",taskid);
		req_hwtype = socket_or_node;
	} else if (bind_type & CPU_BIND_TO_LDOMS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "ldom level binding",taskid);
		req_hwtype = HWLOC_OBJ_NODE;
	} else if (bind_type & CPU_BIND_TO_BOARDS) {
		if (bind_verbose)
			info("task/cgroup: task[%u] is requesting "
			     "board level binding",taskid);
		req_hwtype = HWLOC_OBJ_GROUP;
	} else if (bind_type & bind_mode_ldom) {
		req_hwtype = HWLOC_OBJ_NODE;
	} else {
		if (bind_verbose)
			info("task/cgroup: task[%u] using core level binding"
			     " by default",taskid);
		req_hwtype = HWLOC_OBJ_CORE;
	}

	/*
	 * Perform the topology detection. It will only get allowed PUs.
	 * At the same time, detect the granularity to use for binding.
	 * The granularity can be relaxed from threads to cores if enough
	 * cores are available: with hyperthreading support, the
	 * ntasks-per-core parameter can give each task access to more
	 * threads per core.
	 * Revert to machine granularity if no finer-grained granularity
	 * matching the request is found. This will result in no affinity
	 * being applied.
	 * The detected granularity will be used to find where to best place
	 * the task, then the cpu_bind option will be used to relax the
	 * affinity constraint and use more PUs (i.e. use a core granularity
	 * to dispatch the tasks across the sockets and then give each task
	 * access to the cores of its socket).
	 */
	npus = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						   HWLOC_OBJ_PU);
	ncores = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_CORE);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       socket_or_node);
	nldoms = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						     HWLOC_OBJ_NODE);
	//info("PU:%d CORE:%d SOCK:%d LDOM:%d", npus, ncores, nsockets, nldoms);

	hwtype = HWLOC_OBJ_MACHINE;
	nobj = 1;
	if ((job->job_core_spec != (uint16_t) NO_VAL) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)) {
		spec_threads = job->job_core_spec & (~CORE_SPEC_THREAD);
	}
	if (npus >= (jnpus + spec_threads) || bind_type & CPU_BIND_TO_THREADS) {
		hwtype = HWLOC_OBJ_PU;
		nobj = npus;
	}
	if (ncores >= jnpus || bind_type & CPU_BIND_TO_CORES) {
		hwtype = HWLOC_OBJ_CORE;
		nobj = ncores;
	}
	if (nsockets >= jntasks &&
	    bind_type & CPU_BIND_TO_SOCKETS) {
		hwtype = socket_or_node;
		nobj = nsockets;
	}
	/*
	 * HWLOC returns all the NUMA nodes available regardless of the
	 * number of underlying sockets available (regardless of the allowed
	 * resources). So there is no guarantee that each ldom will be populated
	 * with usable sockets. So add a simple check that at least ensure that
	 * we have as many sockets as ldoms before moving to ldoms granularity
	 */
	if (nldoms >= jntasks &&
	    nsockets >= nldoms &&
	    bind_type & (CPU_BIND_TO_LDOMS | bind_mode_ldom)) {
		hwtype = HWLOC_OBJ_NODE;
		nobj = nldoms;
	}

	/*
	 * If not enough objects to do the job, revert to no affinity mode
	 */
	if (hwloc_compare_types(hwtype, HWLOC_OBJ_MACHINE) == 0) {
		info("task/cgroup: task[%u] disabling affinity because of %s "
		     "granularity",taskid, hwloc_obj_type_string(hwtype));

	} else if ((hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) &&
		   (nobj < jnpus)) {
		info("task/cgroup: task[%u] not enough %s objects (%d < %d), "
		     "disabling affinity",
		     taskid, hwloc_obj_type_string(hwtype), nobj, jnpus);

	} else if (bind_type & bind_mode) {
		/* Explicit binding mode specified by the user
		 * Bind the taskid in accordance with the specified mode
		 */
		obj = hwloc_get_obj_by_type(topology, HWLOC_OBJ_MACHINE, 0);
		match = hwloc_bitmap_isequal(obj->complete_cpuset,
					     obj->allowed_cpuset);
		if ((job->job_core_spec == (uint16_t) NO_VAL) && !match) {
			info("task/cgroup: entire node must be allocated, "
			     "disabling affinity, task[%u]", taskid);
			fprintf(stderr, "Requested cpu_bind option requires "
				"entire node to be allocated; disabling "
				"affinity\n");
		} else {
			if (bind_verbose) {
				info("task/cgroup: task[%u] is requesting "
				     "explicit binding mode", taskid);
			}
			_get_sched_cpuset(topology, hwtype, req_hwtype, &ts,
					  job);
			tssize = sizeof(cpu_set_t);
			fstatus = SLURM_SUCCESS;
			if (job->job_core_spec != (uint16_t) NO_VAL)
				_validate_mask(taskid, obj, &ts);
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "mask 0x%s", taskid,
				      cpuset_to_str(&ts, mstr));
				error("sched_setaffinity rc = %d", rc);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] mask 0x%s",
				     taskid, cpuset_to_str(&ts, mstr));
			}
			_slurm_chkaffinity(&ts, job, rc);
		}
	} else {
		/* Bind the detected object to the taskid, respecting the
		 * granularity, using the designated or default distribution
		 * method (block or cyclic). */
		char *str;

		if (bind_verbose) {
			info("task/cgroup: task[%u] using %s granularity dist %u",
			     taskid, hwloc_obj_type_string(hwtype),
			     job->task_dist);
		}

		/* See srun man page for detailed information on --distribution
		 * option.
		 *
		 * You can see the equivalent code for the
		 * task/affinity plugin in
		 * src/plugins/task/affinity/dist_tasks.c, around line 368
		 */
		switch (job->task_dist & SLURM_DIST_NODESOCKMASK) {
		case SLURM_DIST_BLOCK_BLOCK:
		case SLURM_DIST_CYCLIC_BLOCK:
		case SLURM_DIST_PLANE:
			/* tasks are distributed in blocks within a plane */
			_task_cgroup_cpuset_dist_block(topology,
				hwtype, req_hwtype,
				nobj, job, bind_verbose, cpuset);
			break;
		case SLURM_DIST_ARBITRARY:
		case SLURM_DIST_BLOCK:
		case SLURM_DIST_CYCLIC:
		case SLURM_DIST_UNKNOWN:
			if (slurm_get_select_type_param()
			    & CR_CORE_DEFAULT_DIST_BLOCK) {
				_task_cgroup_cpuset_dist_block(topology,
					hwtype, req_hwtype,
					nobj, job, bind_verbose, cpuset);
				break;
			}
			/* We want to fall through here if we aren't doing a
			   default dist block.
			*/
		default:
			_task_cgroup_cpuset_dist_cyclic(topology,
				hwtype, req_hwtype,
				job, bind_verbose, cpuset);
			break;
		}

		hwloc_bitmap_asprintf(&str, cpuset);

		tssize = sizeof(cpu_set_t);
		if (hwloc_cpuset_to_glibc_sched_affinity(topology, cpuset,
							 &ts, tssize) == 0) {
			fstatus = SLURM_SUCCESS;
			if ((rc = sched_setaffinity(pid, tssize, &ts))) {
				error("task/cgroup: task[%u] unable to set "
				      "taskset '%s'", taskid, str);
				fstatus = SLURM_ERROR;
			} else if (bind_verbose) {
				info("task/cgroup: task[%u] set taskset '%s'",
				     taskid, str);
			}
			_slurm_chkaffinity(&ts, job, rc);
		} else {
			error("task/cgroup: task[%u] unable to build "
			      "taskset '%s'",taskid,str);
			fstatus = SLURM_ERROR;
		}
		free(str);
	}

	/* Destroy hwloc objects */
	hwloc_bitmap_free(cpuset);
	hwloc_topology_destroy(topology);

	return fstatus;
#endif

}
Example #14
static int _task_cgroup_cpuset_dist_cyclic(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, stepd_step_rec_t *job, int bind_verbose,
	hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t *obj_idx;
	uint32_t i, j, sock_idx, sock_loop, ntskip, npdist, nsockets;
	uint32_t taskid = job->envtp->localid;

	if (bind_verbose)
		info("task/cgroup: task[%u] using %s distribution "
		     "(task_dist=%u)", taskid,
		     format_task_dist_states(job->task_dist), job->task_dist);
	nsockets = (uint32_t) hwloc_get_nbobjs_by_type(topology,
						       HWLOC_OBJ_SOCKET);
	obj_idx = xmalloc(nsockets * sizeof(uint32_t));

	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		ntskip = taskid;
		npdist = job->cpus_per_task;
	} else {
		/* sockets or ldoms granularity */
		ntskip = taskid;
		npdist = 1;
	}

	/* skip objs for lower taskids, then add them to the
	   current task cpuset. To prevent infinite loop, check
	   that we do not loop more than npdist times around the available
	   sockets, which is the worst scenario we should afford here. */
	i = 0; j = 0;
	sock_idx = 0;
	sock_loop = 0;
	while (i < ntskip + 1 && sock_loop < npdist + 1) {
		/* fill one or multiple sockets using block mode, unless
		   otherwise stated in the job->task_dist field */
		while ((sock_idx < nsockets) && (j < npdist)) {
			obj = hwloc_get_obj_below_by_type(
				topology, HWLOC_OBJ_SOCKET, sock_idx,
				hwtype, obj_idx[sock_idx]);
			if (obj != NULL) {
				obj_idx[sock_idx]++;
				j++;
				if (i == ntskip)
					_add_hwloc_cpuset(hwtype, req_hwtype,
							  obj, taskid,
							  bind_verbose, cpuset);
				if ((j < npdist) &&
				    (((job->task_dist & SLURM_DIST_STATE_BASE) ==
				      SLURM_DIST_CYCLIC_CFULL) ||
				     ((job->task_dist & SLURM_DIST_STATE_BASE) ==
				      SLURM_DIST_BLOCK_CFULL)))
					sock_idx++;
			} else {
				sock_idx++;
			}
		}
		/* if it succeeds, switch to the next task, starting
		   with the next available socket, otherwise, loop back
		   from the first socket trying to find available slots. */
		if (j == npdist) {
			i++; j = 0;
			sock_idx++; // no validity check, handled by the while
			sock_loop = 0;
		} else {
			sock_loop++;
			sock_idx = 0;
		}
	}

	xfree(obj_idx);

	/* should never happen in normal scenario */
	if (sock_loop > npdist) {
		error("task/cgroup: task[%u] infinite loop broken while trying"
		      "to provision compute elements using %s", taskid,
		      format_task_dist_states(job->task_dist));
		return XCGROUP_ERROR;
	} else
		return XCGROUP_SUCCESS;
}
Example #15
#include <hwloc.h>
#include <stdio.h>
#include <assert.h>

int main(void)
{
  hwloc_topology_t topology;
  hwloc_obj_t obj;

  hwloc_topology_init(&topology);
  hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_WHOLE_IO);
  assert(-1 == hwloc_topology_ignore_type(topology, HWLOC_OBJ_PCI_DEVICE));
  assert(-1 == hwloc_topology_ignore_type_keep_structure(topology, HWLOC_OBJ_BRIDGE));
  assert(-1 == hwloc_topology_ignore_type(topology, HWLOC_OBJ_OS_DEVICE));
  hwloc_topology_load(topology);

  printf("Found %d bridges\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_BRIDGE));
  obj = NULL;
  while ((obj = hwloc_get_next_bridge(topology, obj)) != NULL) {
    assert(obj->type == HWLOC_OBJ_BRIDGE);
    /* only host->pci and pci->pci bridge supported so far */
    if (obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST) {
      assert(obj->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI);
      printf(" Found host->PCI bridge for domain %04x bus %02x-%02x\n",
	     obj->attr->bridge.downstream.pci.domain,
	     obj->attr->bridge.downstream.pci.secondary_bus,
	     obj->attr->bridge.downstream.pci.subordinate_bus);
    } else {
      assert(obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
      assert(obj->attr->bridge.downstream_type == HWLOC_OBJ_BRIDGE_PCI);
      printf(" Found PCI->PCI bridge [%04x:%04x] for domain %04x bus %02x-%02x\n",
	     obj->attr->bridge.upstream.pci.vendor_id,
	     obj->attr->bridge.upstream.pci.device_id,
	     obj->attr->bridge.downstream.pci.domain,
	     obj->attr->bridge.downstream.pci.secondary_bus,
	     obj->attr->bridge.downstream.pci.subordinate_bus);
    }
  }

  printf("Found %d PCI devices\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PCI_DEVICE));
  obj = NULL;
  while ((obj = hwloc_get_next_pcidev(topology, obj)) != NULL) {
    assert(obj->type == HWLOC_OBJ_PCI_DEVICE);
    printf(" Found PCI device class %04x vendor %04x model %04x\n",
	   obj->attr->pcidev.class_id, obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id);
  }

  printf("Found %d OS devices\n", hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_OS_DEVICE));
  obj = NULL;
  while ((obj = hwloc_get_next_osdev(topology, obj)) != NULL) {
    assert(obj->type == HWLOC_OBJ_OS_DEVICE);
    printf(" Found OS device %s subtype %d\n", obj->name, obj->attr->osdev.type);
  }

  assert(HWLOC_TYPE_DEPTH_BRIDGE == hwloc_get_type_depth(topology, HWLOC_OBJ_BRIDGE));
  assert(HWLOC_TYPE_DEPTH_PCI_DEVICE == hwloc_get_type_depth(topology, HWLOC_OBJ_PCI_DEVICE));
  assert(HWLOC_TYPE_DEPTH_OS_DEVICE == hwloc_get_type_depth(topology, HWLOC_OBJ_OS_DEVICE));
  assert(hwloc_compare_types(HWLOC_OBJ_BRIDGE, HWLOC_OBJ_PCI_DEVICE) < 0);
  assert(hwloc_compare_types(HWLOC_OBJ_BRIDGE, HWLOC_OBJ_OS_DEVICE) < 0);
  assert(hwloc_compare_types(HWLOC_OBJ_PCI_DEVICE, HWLOC_OBJ_OS_DEVICE) < 0);

  /* check that hwloc_get_hostbridge_by_pcibus() and hwloc_get_non_io_ancestor_obj work fine */
  obj = NULL;
  while ((obj = hwloc_get_next_pcidev(topology, obj)) != NULL) {
    assert(hwloc_get_hostbridge_by_pcibus(topology,
					  obj->attr->pcidev.domain,
					  obj->attr->pcidev.bus)->parent
	   == hwloc_get_non_io_ancestor_obj(topology, obj));
  }

  hwloc_topology_destroy(topology);

  return 0;
}
Example #16
static int _task_cgroup_cpuset_dist_cyclic(
	hwloc_topology_t topology, hwloc_obj_type_t hwtype,
	hwloc_obj_type_t req_hwtype, stepd_step_rec_t *job, int bind_verbose,
	hwloc_bitmap_t cpuset)
{
	hwloc_obj_t obj;
	uint32_t  s_ix;		/* socket index */
	uint32_t *c_ixc;	/* core index by socket (current taskid) */
	uint32_t *c_ixn;	/* core index by socket (next taskid) */
	uint32_t *t_ix;		/* thread index by core by socket */
	uint16_t npus = 0, nboards = 0, nthreads = 0, ncores = 0, nsockets = 0;
	uint32_t taskid = job->envtp->localid;
	int spec_thread_cnt = 0;
	bitstr_t *spec_threads = NULL;
	uint32_t obj_idxs[3], cps, tpc, i, j, sock_loop, ntskip, npdist;
	bool core_cyclic, core_fcyclic, sock_fcyclic;
	bool hwloc_success = true;

	/*
	 * We can't trust the slurmd_conf_t *conf here as we need actual
	 * hardware instead of whatever is possibly configured.  So we need to
	 * look it up again.
	 */
	if (get_cpuinfo(&npus, &nboards, &nsockets, &ncores, &nthreads,
			NULL, NULL, NULL) != SLURM_SUCCESS) {
		/*
		 * Fall back to use allocated resources, but this may result
		 * in incorrect layout due to an uneven task distribution
		 * (e.g. 4 cores on socket 0 and 3 cores on socket 1)
		 */
		nsockets = (uint16_t) hwloc_get_nbobjs_by_type(topology,
							HWLOC_OBJ_SOCKET);
		ncores = (uint16_t) hwloc_get_nbobjs_by_type(topology,
							HWLOC_OBJ_CORE);
		nthreads = (uint16_t) hwloc_get_nbobjs_by_type(topology,
							HWLOC_OBJ_PU);
		npus = (uint16_t) hwloc_get_nbobjs_by_type(topology,
							   HWLOC_OBJ_PU);
	} else {
		/* Translate cores-per-socket to total core count, etc. */
		nsockets *= nboards;
		ncores *= nsockets;
		nthreads *= ncores;
	}

	if ((nsockets == 0) || (ncores == 0))
		return XCGROUP_ERROR;
	cps = (ncores + nsockets - 1) / nsockets;
	tpc = (nthreads + ncores - 1) / ncores;

	sock_fcyclic = (job->task_dist & SLURM_DIST_SOCKMASK) ==
		SLURM_DIST_SOCKCFULL ? true : false;
	core_cyclic = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_CORECYCLIC ? true : false;
	core_fcyclic = (job->task_dist & SLURM_DIST_COREMASK) ==
		SLURM_DIST_CORECFULL ? true : false;

	if (bind_verbose) {
		info("task/cgroup: task[%u] using %s distribution "
		     "(task_dist=0x%x)", taskid,
		     format_task_dist_states(job->task_dist), job->task_dist);
	}

	t_ix = xmalloc(ncores * sizeof(uint32_t));
	c_ixc = xmalloc(nsockets * sizeof(uint32_t));
	c_ixn = xmalloc(nsockets * sizeof(uint32_t));

	if (hwloc_compare_types(hwtype, HWLOC_OBJ_CORE) >= 0) {
		/* cores or threads granularity */
		ntskip = taskid;
		npdist = job->cpus_per_task;
	} else {
		/* sockets or ldoms granularity */
		ntskip = taskid;
		npdist = 1;
	}
	if ((job->job_core_spec != NO_VAL16) &&
	    (job->job_core_spec &  CORE_SPEC_THREAD)  &&
	    (job->job_core_spec != CORE_SPEC_THREAD)) {
		/* Skip specialized threads as needed */
		int i, t, c, s;
		int cores = (ncores + nsockets - 1) / nsockets;
		int threads = (npus + cores - 1) / cores;
		spec_thread_cnt = job->job_core_spec & (~CORE_SPEC_THREAD);
		spec_threads = bit_alloc(npus);
		for (t = threads - 1;
		     ((t >= 0) && (spec_thread_cnt > 0)); t--) {
			for (c = cores - 1;
			     ((c >= 0) && (spec_thread_cnt > 0)); c--) {
				for (s = nsockets - 1;
				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
					i = s * cores + c;
					i = (i * threads) + t;
					bit_set(spec_threads, i);
					spec_thread_cnt--;
				}
			}
		}
		if (hwtype == HWLOC_OBJ_PU) {
			for (i = 0; i <= ntskip && i < npus; i++) {
				if (bit_test(spec_threads, i))
					ntskip++;
			};
		}
	}

	/* skip objs for lower taskids, then add them to the
	   current task cpuset. To prevent infinite loop, check
	   that we do not loop more than npdist times around the available
	   sockets, which is the worst scenario we should afford here. */
	i = j = s_ix = sock_loop = 0;
	while (i < ntskip + 1 && (sock_loop/tpc) < npdist + 1) {
		/* fill one or multiple sockets using block mode, unless
		   otherwise stated in the job->task_dist field */
		while ((s_ix < nsockets) && (j < npdist)) {
			obj = hwloc_get_obj_below_by_type(
				topology, HWLOC_OBJ_SOCKET, s_ix,
				hwtype, c_ixc[s_ix]);
			if ((obj == NULL) && (s_ix == 0) && (c_ixc[s_ix] == 0))
				hwloc_success = false;	/* Complete failure */
			if ((obj != NULL) &&
			    (hwloc_bitmap_first(obj->allowed_cpuset) != -1)) {
				if (hwloc_compare_types(hwtype, HWLOC_OBJ_PU)
									>= 0) {
					/* granularity is thread */
					obj_idxs[0]=s_ix;
					obj_idxs[1]=c_ixc[s_ix];
					obj_idxs[2]=t_ix[(s_ix*cps)+c_ixc[s_ix]];
					obj = hwloc_get_obj_below_array_by_type(
						topology, 3, obj_types, obj_idxs);
					if ((obj != NULL) &&
					    (hwloc_bitmap_first(
					     obj->allowed_cpuset) != -1)) {
						t_ix[(s_ix*cps)+c_ixc[s_ix]]++;
						j++;
						if (i == ntskip)
							_add_hwloc_cpuset(hwtype,
							req_hwtype, obj, taskid,
							bind_verbose, cpuset);
						if (j < npdist) {
							if (core_cyclic) {
								c_ixn[s_ix] =
								c_ixc[s_ix] + 1;
							} else if (core_fcyclic){
								c_ixc[s_ix]++;
								c_ixn[s_ix] =
								c_ixc[s_ix];
							}
							if (sock_fcyclic)
								s_ix++;
						}
					} else {
						c_ixc[s_ix]++;
						if (c_ixc[s_ix] == cps)
							s_ix++;
					}
				} else {
					/* granularity is core or larger */
					c_ixc[s_ix]++;
					j++;
					if (i == ntskip)
						_add_hwloc_cpuset(hwtype,
							req_hwtype, obj, taskid,
						  	bind_verbose, cpuset);
					if ((j < npdist) && (sock_fcyclic))
						s_ix++;
				}
			} else
				s_ix++;
		}
		/* if it succeeds, switch to the next task, starting
		 * with the next available socket, otherwise, loop back
		 * from the first socket trying to find available slots. */
		if (j == npdist) {
			i++;
			j = 0;
			s_ix++; // no validity check, handled by the while
			sock_loop = 0;
		} else {
			sock_loop++;
			s_ix = 0;
		}
	}
	xfree(t_ix);
	xfree(c_ixc);
	xfree(c_ixn);

	if (spec_threads) {
		for (i = 0; i < npus; i++) {
			if (bit_test(spec_threads, i)) {
				hwloc_bitmap_clr(cpuset, i);
			}
		};
		FREE_NULL_BITMAP(spec_threads);
	}

	/* should never happen in normal scenario */
	if ((sock_loop > npdist) && !hwloc_success) {
		/* hwloc_get_obj_below_by_type() fails if no CPU set
		 * configured, see hwloc documentation for details */
		error("task/cgroup: hwloc_get_obj_below_by_type() failing, "
		      "task/affinity plugin may be required to address bug "
		      "fixed in HWLOC version 1.11.5");
		return XCGROUP_ERROR;
	} else if (sock_loop > npdist) {
		char buf[128] = "";
		hwloc_bitmap_snprintf(buf, sizeof(buf), cpuset);
		error("task/cgroup: task[%u] infinite loop broken while trying "
		      "to provision compute elements using %s (bitmap:%s)",
		      taskid, format_task_dist_states(job->task_dist), buf);
		return XCGROUP_ERROR;
	} else
		return XCGROUP_SUCCESS;
}