void affinity_init() { int numberOfDomains = 1; /* all systems have the node domain */ int currentDomain; int subCounter = 0; int offset = 0; int tmp; int numberOfSocketDomains = cpuid_topology.numSockets; int numberOfNumaDomains = numa_info.numberOfNodes; int numberOfProcessorsPerSocket = cpuid_topology.numCoresPerSocket * cpuid_topology.numThreadsPerCore; int numberOfCacheDomains; int numberOfCoresPerCache = cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].threads/ cpuid_topology.numThreadsPerCore; int numberOfProcessorsPerCache = cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].threads; int numberOfCoresPerNUMA = /* for the cache domain take only into account last level cache and assume * all sockets to be uniform. */ /* determine how many last level shared caches exist per socket */ numberOfCacheDomains = cpuid_topology.numSockets * (cpuid_topology.numCoresPerSocket/numberOfCoresPerCache); /* determine total number of domains */ numberOfDomains += numberOfSocketDomains + numberOfCacheDomains + numberOfNumaDomains; domains = (AffinityDomain*) malloc(numberOfDomains * sizeof(AffinityDomain)); if (!domains) { fprintf(stderr,"No more memory for %ld bytes for array of affinity domains\n",numberOfDomains * sizeof(AffinityDomain)); return; } /* Node domain */ domains[0].numberOfProcessors = cpuid_topology.activeHWThreads; domains[0].numberOfCores = cpuid_topology.numSockets * cpuid_topology.numCoresPerSocket; domains[0].tag = bformat("N"); domains[0].processorList = (int*) malloc(cpuid_topology.numHWThreads*sizeof(int)); if (!domains[0].processorList) { fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n", cpuid_topology.numHWThreads*sizeof(int), bdata(domains[0].tag)); return; } offset = 0; if (numberOfSocketDomains > 1) { for (int i=0; i<numberOfSocketDomains; i++) { tmp = treeFillNextEntries(cpuid_topology.topologyTree, domains[0].processorList + offset, i, 0, numberOfProcessorsPerSocket); offset += tmp; } } else { tmp = treeFillNextEntries(cpuid_topology.topologyTree, domains[0].processorList, 0, 0, domains[0].numberOfProcessors); domains[0].numberOfProcessors = tmp; } /* Socket domains */ currentDomain = 1; for (int i=0; i < numberOfSocketDomains; i++ ) { domains[currentDomain + i].numberOfProcessors = numberOfProcessorsPerSocket; domains[currentDomain + i].numberOfCores = cpuid_topology.numCoresPerSocket; domains[currentDomain + i].tag = bformat("S%d", i); domains[currentDomain + i].processorList = (int*) malloc( domains[currentDomain + i].numberOfProcessors * sizeof(int)); if (!domains[currentDomain + i].processorList) { fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n", domains[currentDomain + i].numberOfProcessors * sizeof(int), bdata(domains[currentDomain + i].tag)); return; } tmp = treeFillNextEntries(cpuid_topology.topologyTree, domains[currentDomain + i].processorList, i, 0, domains[currentDomain + i].numberOfProcessors); domains[currentDomain + i].numberOfProcessors = tmp; } /* Cache domains */ currentDomain += numberOfSocketDomains; subCounter = 0; for (int i=0; i < numberOfSocketDomains; i++ ) { offset = 0; for ( int j=0; j < (numberOfCacheDomains/numberOfSocketDomains); j++ ) { domains[currentDomain + subCounter].numberOfProcessors = numberOfProcessorsPerCache; domains[currentDomain + subCounter].numberOfCores = numberOfCoresPerCache; domains[currentDomain + subCounter].tag = bformat("C%d", subCounter); domains[currentDomain + subCounter].processorList = (int*) malloc(numberOfProcessorsPerCache*sizeof(int)); if (!domains[currentDomain + subCounter].processorList) { fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n", numberOfProcessorsPerCache*sizeof(int), bdata(domains[currentDomain + subCounter].tag)); return; } tmp = treeFillNextEntries(cpuid_topology.topologyTree, domains[currentDomain + subCounter].processorList, i, offset, domains[currentDomain + subCounter].numberOfProcessors); domains[currentDomain + subCounter].numberOfProcessors = tmp; offset += (tmp < numberOfCoresPerCache ? tmp : numberOfCoresPerCache); subCounter++; } } /* Memory domains */ currentDomain += numberOfCacheDomains; subCounter = 0; if ((numberOfNumaDomains >= numberOfSocketDomains) && (numberOfNumaDomains > 1)) { for (int i=0; i < numberOfSocketDomains; i++ ) { offset = 0; for ( int j=0; j < (int)ceil((double)(numberOfNumaDomains)/numberOfSocketDomains); j++ ) { domains[currentDomain + subCounter].numberOfProcessors = numa_info.nodes[subCounter].numberOfProcessors; domains[currentDomain + subCounter].numberOfCores = numa_info.nodes[subCounter].numberOfProcessors/cpuid_topology.numThreadsPerCore; domains[currentDomain + subCounter].tag = bformat("M%d", subCounter); domains[currentDomain + subCounter].processorList = (int*) malloc(numa_info.nodes[subCounter].numberOfProcessors*sizeof(int)); if (!domains[currentDomain + subCounter].processorList) { fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n", numa_info.nodes[subCounter].numberOfProcessors*sizeof(int), bdata(domains[currentDomain + subCounter].tag)); return; } tmp = treeFillNextEntries(cpuid_topology.topologyTree, domains[currentDomain + subCounter].processorList, i, offset, domains[currentDomain + subCounter].numberOfProcessors); domains[currentDomain + subCounter].numberOfProcessors = tmp; offset += domains[currentDomain + subCounter].numberOfCores; subCounter++; } } } else { offset = 0; int NUMAthreads = numberOfProcessorsPerSocket * numberOfSocketDomains; domains[currentDomain + subCounter].numberOfProcessors = NUMAthreads; domains[currentDomain + subCounter].numberOfCores = numberOfProcessorsPerSocket; domains[currentDomain + subCounter].tag = bformat("M%d", subCounter); domains[currentDomain + subCounter].processorList = (int*) malloc(NUMAthreads*sizeof(int)); if (!domains[currentDomain + subCounter].processorList) { fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n", NUMAthreads*sizeof(int), bdata(domains[currentDomain + subCounter].tag)); return; } tmp = 0; for (int i=0; i < numberOfSocketDomains; i++ ) { tmp += treeFillNextEntries( cpuid_topology.topologyTree, &(domains[currentDomain + subCounter].processorList[offset]), i, 0, numberOfProcessorsPerSocket); offset += numberOfProcessorsPerSocket; } domains[currentDomain + subCounter].numberOfProcessors = tmp; } /* This is redundant ;-). Create thread to node lookup */ for ( uint32_t i = 0; i < numa_info.numberOfNodes; i++ ) { for ( int j = 0; j < numa_info.nodes[i].numberOfProcessors; j++ ) { affinity_core2node_lookup[numa_info.nodes[i].processors[j]] = i; } } affinity_numberOfDomains = numberOfDomains; affinityDomains.numberOfAffinityDomains = numberOfDomains; affinityDomains.numberOfSocketDomains = numberOfSocketDomains; affinityDomains.numberOfNumaDomains = numberOfNumaDomains; affinityDomains.numberOfProcessorsPerSocket = numberOfProcessorsPerSocket; affinityDomains.numberOfCacheDomains = numberOfCacheDomains; affinityDomains.numberOfCoresPerCache = numberOfCoresPerCache; affinityDomains.numberOfProcessorsPerCache = numberOfProcessorsPerCache; affinityDomains.domains = domains; }
void affinity_init() { int numberOfDomains = 1; /* all systems have the node domain */ int currentDomain; int subCounter = 0; int offset = 0; int tmp; if (affinity_initialized == 1) { return; } topology_init(); int numberOfSocketDomains = cpuid_topology.numSockets; DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: Socket domains %d, numberOfSocketDomains); numa_init(); int numberOfNumaDomains = numa_info.numberOfNodes; DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: NUMA domains %d, numberOfNumaDomains); int numberOfProcessorsPerSocket = cpuid_topology.numCoresPerSocket * cpuid_topology.numThreadsPerCore; DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: CPUs per socket %d, numberOfProcessorsPerSocket); int numberOfCacheDomains; int numberOfCoresPerCache = cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].threads/ cpuid_topology.numThreadsPerCore; DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: CPU cores per LLC %d, numberOfCoresPerCache); int numberOfProcessorsPerCache = cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].threads; DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: CPUs per LLC %d, numberOfProcessorsPerCache); /* for the cache domain take only into account last level cache and assume * all sockets to be uniform. */ /* determine how many last level shared caches exist per socket */ numberOfCacheDomains = cpuid_topology.numSockets * (cpuid_topology.numCoresPerSocket/numberOfCoresPerCache); DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: Cache domains %d, numberOfCacheDomains); /* determine total number of domains */ numberOfDomains += numberOfSocketDomains + numberOfCacheDomains + numberOfNumaDomains; DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity: All domains %d, numberOfDomains); domains = (AffinityDomain*) malloc(numberOfDomains * sizeof(AffinityDomain)); if (!domains) { fprintf(stderr,"No more memory for %ld bytes for array of affinity domains\n",numberOfDomains * sizeof(AffinityDomain)); return; } /* Node domain */ domains[0].numberOfProcessors = cpuid_topology.activeHWThreads; domains[0].numberOfCores = cpuid_topology.numSockets * cpuid_topology.numCoresPerSocket; DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain N: %d HW threads on %d cores, domains[0].numberOfProcessors, domains[0].numberOfCores); domains[0].tag = bformat("N"); domains[0].processorList = (int*) malloc(cpuid_topology.numHWThreads*sizeof(int)); if (!domains[0].processorList) { fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n", cpuid_topology.numHWThreads*sizeof(int), bdata(domains[0].tag)); return; } offset = 0; if (numberOfSocketDomains > 1) { for (int i=0; i<numberOfSocketDomains; i++) { tmp = treeFillNextEntries(cpuid_topology.topologyTree, domains[0].processorList + offset, i, 0, numberOfProcessorsPerSocket); offset += tmp; } } else { tmp = treeFillNextEntries(cpuid_topology.topologyTree, domains[0].processorList, 0, 0, domains[0].numberOfProcessors); domains[0].numberOfProcessors = tmp; } /* Socket domains */ currentDomain = 1; for (int i=0; i < numberOfSocketDomains; i++ ) { domains[currentDomain + i].numberOfProcessors = numberOfProcessorsPerSocket; domains[currentDomain + i].numberOfCores = cpuid_topology.numCoresPerSocket; domains[currentDomain + i].tag = bformat("S%d", i); DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain S%d: %d HW threads on %d cores, i, domains[currentDomain + i].numberOfProcessors, domains[currentDomain + i].numberOfCores); domains[currentDomain + i].processorList = (int*) malloc( domains[currentDomain + i].numberOfProcessors * sizeof(int)); if (!domains[currentDomain + i].processorList) { fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n", domains[currentDomain + i].numberOfProcessors * sizeof(int), bdata(domains[currentDomain + i].tag)); return; } tmp = treeFillNextEntries(cpuid_topology.topologyTree, domains[currentDomain + i].processorList, i, 0, domains[currentDomain + i].numberOfProcessors); for ( int j = 0; j < tmp; j++ ) { affinity_core2node_lookup[domains[currentDomain + i].processorList[j]] = i; } domains[currentDomain + i].numberOfProcessors = tmp; } /* Cache domains */ currentDomain += numberOfSocketDomains; subCounter = 0; for (int i=0; i < numberOfSocketDomains; i++ ) { offset = 0; for ( int j=0; j < (numberOfCacheDomains/numberOfSocketDomains); j++ ) { domains[currentDomain + subCounter].numberOfProcessors = numberOfProcessorsPerCache; domains[currentDomain + subCounter].numberOfCores = numberOfCoresPerCache; domains[currentDomain + subCounter].tag = bformat("C%d", subCounter); DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain C%d: %d HW threads on %d cores, subCounter, domains[currentDomain + subCounter].numberOfProcessors, domains[currentDomain + subCounter].numberOfCores); domains[currentDomain + subCounter].processorList = (int*) malloc(numberOfProcessorsPerCache*sizeof(int)); if (!domains[currentDomain + subCounter].processorList) { fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n", numberOfProcessorsPerCache*sizeof(int), bdata(domains[currentDomain + subCounter].tag)); return; } tmp = treeFillNextEntries(cpuid_topology.topologyTree, domains[currentDomain + subCounter].processorList, i, offset, domains[currentDomain + subCounter].numberOfProcessors); domains[currentDomain + subCounter].numberOfProcessors = tmp; offset += (tmp < numberOfCoresPerCache ? tmp : numberOfCoresPerCache); subCounter++; } } /* Memory domains */ currentDomain += numberOfCacheDomains; subCounter = 0; if ((numberOfNumaDomains >= numberOfSocketDomains) && (numberOfNumaDomains > 1)) { for (int i=0; i < numberOfSocketDomains; i++ ) { offset = 0; for ( int j=0; j < (int)ceil((double)(numberOfNumaDomains)/numberOfSocketDomains); j++ ) { domains[currentDomain + subCounter].numberOfProcessors = numa_info.nodes[subCounter].numberOfProcessors; domains[currentDomain + subCounter].numberOfCores = numa_info.nodes[subCounter].numberOfProcessors/cpuid_topology.numThreadsPerCore; domains[currentDomain + subCounter].tag = bformat("M%d", subCounter); DEBUG_PRINT(DEBUGLEV_DEVELOP, Affinity domain M%d: %d HW threads on %d cores, subCounter, domains[currentDomain + subCounter].numberOfProcessors, domains[currentDomain + subCounter].numberOfCores); domains[currentDomain + subCounter].processorList = (int*) malloc(numa_info.nodes[subCounter].numberOfProcessors*sizeof(int)); if (!domains[currentDomain + subCounter].processorList) { fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n", numa_info.nodes[subCounter].numberOfProcessors*sizeof(int), bdata(domains[currentDomain + subCounter].tag)); return; } tmp = treeFillNextEntries(cpuid_topology.topologyTree, domains[currentDomain + subCounter].processorList, i, offset, domains[currentDomain + subCounter].numberOfProcessors); domains[currentDomain + subCounter].numberOfProcessors = tmp; offset += domains[currentDomain + subCounter].numberOfCores; subCounter++; } } } else {