static int lua_likwid_pinProcess(lua_State* L) { int cpuID = luaL_checknumber(L,-2); int silent = luaL_checknumber(L,-1); luaL_argcheck(L, cpuID >= 0, 1, "CPU ID must be greater or equal 0"); if (affinity_isInitialized == 0) { affinity_init(); affinity_isInitialized = 1; affinity = get_affinityDomains(); } affinity_pinProcess(cpuID); if (!silent) { #ifdef COLOR color_on(BRIGHT, COLOR); #endif printf("[likwid-pin] Main PID -> core %d - OK", cpuID); #ifdef COLOR color_reset(); #endif printf("\n"); } return 0; }
static int cpustr_to_cpulist_scatter(bstring bcpustr, int* cpulist, int length) { topology_init(); CpuTopology_t cpuid_topology = get_cpuTopology(); affinity_init(); AffinityDomains_t affinity = get_affinityDomains(); char* cpustring = bstr2cstr(bcpustr, '\0'); if (bstrchrp(bcpustr, ':', 0) != BSTR_ERR) { int insert = 0; int suitidx = 0; int* suitable = (int*)malloc(affinity->numberOfAffinityDomains*sizeof(int)); if (!suitable) { bcstrfree(cpustring); return -ENOMEM; } for (int i=0; i<affinity->numberOfAffinityDomains; i++) { if (bstrchrp(affinity->domains[i].tag, cpustring[0], 0) != BSTR_ERR) { suitable[suitidx] = i; suitidx++; } } int* sortedList = (int*) malloc(affinity->domains[suitable[0]].numberOfProcessors * sizeof(int)); if (!sortedList) { free(suitable); bcstrfree(cpustring); return -ENOMEM; } for (int off=0;off<affinity->domains[suitable[0]].numberOfProcessors;off++) { for(int i=0;i < suitidx; i++) { cpulist_sort(affinity->domains[suitable[i]].processorList, sortedList, affinity->domains[suitable[i]].numberOfProcessors); cpulist[insert] = sortedList[off]; insert++; if (insert == length) goto scatter_done; } } scatter_done: bcstrfree(cpustring); free(sortedList); free(suitable); return insert; } bcstrfree(cpustring); return 0; }
static int cpu_in_domain(int domainidx, int cpu) { affinity_init(); AffinityDomains_t affinity = get_affinityDomains(); for (int i=0;i<affinity->domains[domainidx].numberOfProcessors; i++) { if (cpu == affinity->domains[domainidx].processorList[i]) { return 1; } } return 0; }
int test_perfmoninit() { int cpu = 0; int i; topology_init(); affinity_init(); for(i=0;i<10;i++) { perfmon_init(1, &cpu); perfmon_finalize(); } affinity_finalize(); topology_finalize(); return 1; }
int test_affinityinit() { int i = 0; topology_init(); CpuTopology_t cputopo = get_cpuTopology(); numa_init(); affinity_init(); AffinityDomains_t doms = get_affinityDomains(); if (doms == NULL) goto fail; if (doms->numberOfSocketDomains != cputopo->numSockets) goto fail; if (doms->numberOfNumaDomains == 0) goto fail; if (doms->numberOfProcessorsPerSocket == 0) goto fail; if (doms->numberOfAffinityDomains == 0) goto fail; if (doms->numberOfCacheDomains == 0) goto fail; if (doms->numberOfCoresPerCache == 0) goto fail; if (doms->numberOfProcessorsPerCache == 0) goto fail; if (doms->numberOfProcessorsPerCache < doms->numberOfCoresPerCache) goto fail; if (doms->domains == NULL) goto fail; for (i = 0; i < doms->numberOfAffinityDomains; i++) { if (doms->domains[i].numberOfProcessors == 0) goto fail; if (doms->domains[i].numberOfCores == 0) goto fail; if (doms->domains[i].numberOfProcessors < doms->domains[i].numberOfCores) goto fail; if (doms->domains[i].processorList == NULL) goto fail; } affinity_finalize(); topology_finalize(); return 1; fail: affinity_finalize(); topology_finalize(); return 0; }
int test_perfmoninit_valid() { int cpu = 0; topology_init(); affinity_init(); int ret = perfmon_init(1, &cpu); if (ret != 0) goto fail; if (perfmon_getNumberOfGroups() != 0) goto fail; if (perfmon_getNumberOfThreads() != 1) goto fail; perfmon_finalize(); affinity_finalize(); topology_finalize(); return 1; fail: perfmon_finalize(); affinity_finalize(); topology_finalize(); return 0; }
static int cpuexpr_to_list(bstring bcpustr, bstring prefix, int* list, int length) { topology_init(); CpuTopology_t cpuid_topology = get_cpuTopology(); affinity_init(); AffinityDomains_t affinity = get_affinityDomains(); struct bstrList* strlist = bstrListCreate(); strlist = bsplit(bcpustr, ','); int oldinsert = 0; int insert = 0; for (int i=0;i < strlist->qty; i++) { bstring newstr = bstrcpy(prefix); bconcat(newstr, strlist->entry[i]); oldinsert = insert; for (int j = 0; j < affinity->numberOfAffinityDomains; j++) { if (bstrcmp(affinity->domains[j].tag, newstr) == 0) { list[insert] = atoi(bdata(strlist->entry[i])); insert++; if (insert == length) goto list_done; break; } } if (insert == oldinsert) { fprintf(stderr,"Domain %s cannot be found\n", bdata(newstr)); } bdestroy(newstr); } list_done: bstrListDestroy(strlist); return insert; }
void likwid_markerInit(void) { int i; int verbosity; bstring bThreadStr; bstring bEventStr; struct bstrList* threadTokens; struct bstrList* eventStrings; char* modeStr = getenv("LIKWID_MODE"); char* eventStr = getenv("LIKWID_EVENTS"); char* cThreadStr = getenv("LIKWID_THREADS"); char* filepath = getenv("LIKWID_FILEPATH"); /* Dirty hack to avoid nonnull warnings */ int (*ownatoi)(const char*); ownatoi = &atoi; if ((modeStr != NULL) && (filepath != NULL) && (eventStr != NULL) && (cThreadStr != NULL)) { likwid_init = 1; } else if (likwid_init == 0) { fprintf(stderr, "Cannot initalize LIKWID marker API, environment variables are not set\n"); fprintf(stderr, "You have to set the -m commandline switch for likwid-perfctr\n"); return; } else { return; } if (!lock_check()) { fprintf(stderr,"Access to performance counters is locked.\n"); exit(EXIT_FAILURE); } topology_init(); numa_init(); affinity_init(); hashTable_init(); for(int i=0; i<MAX_NUM_NODES; i++) socket_lock[i] = LOCK_INIT; HPMmode(atoi(modeStr)); if (getenv("LIKWID_DEBUG") != NULL) { perfmon_verbosity = atoi(getenv("LIKWID_DEBUG")); verbosity = perfmon_verbosity; } bThreadStr = bfromcstr(cThreadStr); threadTokens = bstrListCreate(); threadTokens = bsplit(bThreadStr,','); num_cpus = threadTokens->qty; for (i=0; i<num_cpus; i++) { threads2Cpu[i] = ownatoi(bdata(threadTokens->entry[i])); } bdestroy(bThreadStr); bstrListDestroy(threadTokens); if (getenv("LIKWID_PIN") != NULL) { likwid_pinThread(threads2Cpu[0]); if (getenv("OMP_NUM_THREADS") != NULL) { if (ownatoi(getenv("OMP_NUM_THREADS")) > num_cpus) { use_locks = 1; } } if (getenv("CILK_NWORKERS") != NULL) { if (ownatoi(getenv("CILK_NWORKERS")) > num_cpus) { use_locks = 1; } } } i = perfmon_init(num_cpus, threads2Cpu); if (i<0) { fprintf(stderr,"Failed to initialize LIKWID perfmon library.\n"); return; } bEventStr = bfromcstr(eventStr); eventStrings = bstrListCreate(); eventStrings = bsplit(bEventStr,'|'); numberOfGroups = eventStrings->qty; groups = malloc(numberOfGroups * sizeof(int)); if (!groups) { fprintf(stderr,"Cannot allocate space for group handling.\n"); bstrListDestroy(eventStrings); exit(EXIT_FAILURE); } for (i=0; i<eventStrings->qty; i++) { groups[i] = perfmon_addEventSet(bdata(eventStrings->entry[i])); } bstrListDestroy(eventStrings); bdestroy(bEventStr); for (i=0; i<num_cpus; i++) { hashTable_initThread(threads2Cpu[i]); for(int j=0; j<groupSet->groups[groups[0]].numberOfEvents;j++) { groupSet->groups[groups[0]].events[j].threadCounter[i].init = TRUE; } } groupSet->activeGroup = 0; }
static int lua_likwid_getAffinityInfo(lua_State* L) { int i,j; if (topology_isInitialized == 0) { topology_init(); topology_isInitialized = 1; cpuinfo = get_cpuInfo(); cputopo = get_cpuTopology(); } if ((topology_isInitialized) && (cpuinfo == NULL)) { cpuinfo = get_cpuInfo(); } if ((topology_isInitialized) && (cputopo == NULL)) { cputopo = get_cpuTopology(); } if (numa_isInitialized == 0) { if (numa_init() == 0) { numa_isInitialized = 1; numainfo = get_numaTopology(); } } if ((numa_isInitialized) && (numainfo == NULL)) { numainfo = get_numaTopology(); } if (affinity_isInitialized == 0) { affinity_init(); affinity_isInitialized = 1; affinity = get_affinityDomains(); } if ((affinity_isInitialized) && (affinity == NULL)) { affinity = get_affinityDomains(); } if (!affinity) { lua_pushstring(L,"Cannot initialize affinity groups"); lua_error(L); } lua_newtable(L); lua_pushstring(L,"numberOfAffinityDomains"); lua_pushunsigned(L,affinity->numberOfAffinityDomains); lua_settable(L,-3); lua_pushstring(L,"numberOfSocketDomains"); lua_pushunsigned(L,affinity->numberOfSocketDomains); lua_settable(L,-3); lua_pushstring(L,"numberOfNumaDomains"); lua_pushunsigned(L,affinity->numberOfNumaDomains); lua_settable(L,-3); lua_pushstring(L,"numberOfProcessorsPerSocket"); lua_pushunsigned(L,affinity->numberOfProcessorsPerSocket); lua_settable(L,-3); lua_pushstring(L,"numberOfCacheDomains"); lua_pushunsigned(L,affinity->numberOfCacheDomains); lua_settable(L,-3); lua_pushstring(L,"numberOfCoresPerCache"); lua_pushunsigned(L,affinity->numberOfCoresPerCache); lua_settable(L,-3); lua_pushstring(L,"numberOfProcessorsPerCache"); lua_pushunsigned(L,affinity->numberOfProcessorsPerCache); lua_settable(L,-3); lua_pushstring(L,"domains"); lua_newtable(L); for(i=0;i<affinity->numberOfAffinityDomains;i++) { lua_pushunsigned(L, i+1); lua_newtable(L); lua_pushstring(L,"tag"); lua_pushstring(L,bdata(affinity->domains[i].tag)); lua_settable(L,-3); lua_pushstring(L,"numberOfProcessors"); lua_pushunsigned(L,affinity->domains[i].numberOfProcessors); lua_settable(L,-3); lua_pushstring(L,"numberOfCores"); lua_pushunsigned(L,affinity->domains[i].numberOfCores); lua_settable(L,-3); lua_pushstring(L,"processorList"); lua_newtable(L); for(j=0;j<affinity->domains[i].numberOfProcessors;j++) { lua_pushunsigned(L,j+1); lua_pushunsigned(L,affinity->domains[i].processorList[j]); lua_settable(L,-3); } lua_settable(L,-3); lua_settable(L,-3); } lua_settable(L,-3); return 1; }
static int lua_likwid_getNumaInfo(lua_State* L) { uint32_t i,j; if (topology_isInitialized == 0) { topology_init(); topology_isInitialized = 1; cpuinfo = get_cpuInfo(); cputopo = get_cpuTopology(); } if ((topology_isInitialized) && (cpuinfo == NULL)) { cpuinfo = get_cpuInfo(); } if ((topology_isInitialized) && (cputopo == NULL)) { cputopo = get_cpuTopology(); } if (numa_isInitialized == 0) { if (numa_init() == 0) { numa_isInitialized = 1; numainfo = get_numaTopology(); } else { lua_newtable(L); lua_pushstring(L,"numberOfNodes"); lua_pushunsigned(L,0); lua_settable(L,-3); lua_pushstring(L,"nodes"); lua_newtable(L); lua_settable(L,-3); return 1; } } if ((numa_isInitialized) && (numainfo == NULL)) { numainfo = get_numaTopology(); } if (affinity_isInitialized == 0) { affinity_init(); affinity_isInitialized = 1; affinity = get_affinityDomains(); } if ((affinity_isInitialized) && (affinity == NULL)) { affinity = get_affinityDomains(); } lua_newtable(L); lua_pushstring(L,"numberOfNodes"); lua_pushunsigned(L,numainfo->numberOfNodes); lua_settable(L,-3); lua_pushstring(L,"nodes"); lua_newtable(L); for(i=0;i<numainfo->numberOfNodes;i++) { lua_pushinteger(L, i+1); lua_newtable(L); lua_pushstring(L,"id"); lua_pushunsigned(L,numainfo->nodes[i].id); lua_settable(L,-3); lua_pushstring(L,"totalMemory"); lua_pushunsigned(L,numainfo->nodes[i].totalMemory); lua_settable(L,-3); lua_pushstring(L,"freeMemory"); lua_pushunsigned(L,numainfo->nodes[i].freeMemory); lua_settable(L,-3); lua_pushstring(L,"numberOfProcessors"); lua_pushunsigned(L,numainfo->nodes[i].numberOfProcessors); lua_settable(L,-3); lua_pushstring(L,"numberOfDistances"); lua_pushunsigned(L,numainfo->nodes[i].numberOfDistances); lua_settable(L,-3); lua_pushstring(L,"processors"); lua_newtable(L); for(j=0;j<numainfo->nodes[i].numberOfProcessors;j++) { lua_pushunsigned(L,j+1); lua_pushunsigned(L,numainfo->nodes[i].processors[j]); lua_settable(L,-3); } lua_settable(L,-3); /*lua_pushstring(L,"processorsCompact"); lua_newtable(L); for(j=0;j<numa->nodes[i].numberOfProcessors;j++) { lua_pushunsigned(L,j); lua_pushunsigned(L,numa->nodes[i].processorsCompact[j]); lua_settable(L,-3); } lua_settable(L,-3);*/ lua_pushstring(L,"distances"); lua_newtable(L); for(j=0;j<numainfo->nodes[i].numberOfDistances;j++) { lua_pushinteger(L,j+1); lua_newtable(L); lua_pushinteger(L,j); lua_pushunsigned(L,numainfo->nodes[i].distances[j]); lua_settable(L,-3); lua_settable(L,-3); } lua_settable(L,-3); lua_settable(L,-3); } lua_settable(L,-3); return 1; }
int main (int argc, char** argv) { int socket_fd = -1; int optInfo = 0; int optClock = 0; int optStethoscope = 0; int optSockets = 0; double runtime; int hasDRAM = 0; int c; bstring argString; bstring eventString = bfromcstr("CLOCK"); int numSockets=1; int numThreads=0; int threadsSockets[MAX_NUM_NODES*2]; int threads[MAX_NUM_THREADS]; threadsSockets[0] = 0; if (argc == 1) { HELP_MSG; exit (EXIT_SUCCESS); } while ((c = getopt (argc, argv, "+c:hiM:ps:v")) != -1) { switch (c) { case 'c': CHECK_OPTION_STRING; numSockets = bstr_to_cpuset_physical((uint32_t*) threadsSockets, argString); bdestroy(argString); optSockets = 1; break; case 'h': HELP_MSG; exit (EXIT_SUCCESS); case 'i': optInfo = 1; break; case 'M': /* Set MSR Access mode */ CHECK_OPTION_STRING; accessClient_setaccessmode(str2int((char*) argString->data)); bdestroy(argString); break; case 'p': optClock = 1; break; case 's': CHECK_OPTION_STRING; optStethoscope = str2int((char*) argString->data); bdestroy(argString); break; case 'v': VERSION_MSG; exit (EXIT_SUCCESS); case '?': if (optopt == 's' || optopt == 'M' || optopt == 'c') { HELP_MSG; } else if (isprint (optopt)) { fprintf (stderr, "Unknown option `-%c'.\n", optopt); } else { fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); } exit( EXIT_FAILURE); default: HELP_MSG; exit (EXIT_SUCCESS); } } if (!lock_check()) { fprintf(stderr,"Access to performance counters is locked.\n"); exit(EXIT_FAILURE); } if (optClock && optind == argc) { fprintf(stderr,"Commandline option -p requires an executable.\n"); exit(EXIT_FAILURE); } if (optSockets && !optStethoscope && optind == argc) { fprintf(stderr,"Commandline option -c requires an executable if not used in combination with -s.\n"); exit(EXIT_FAILURE); } if (cpuid_init() == EXIT_FAILURE) { fprintf(stderr, "CPU not supported\n"); exit(EXIT_FAILURE); } if (numSockets > cpuid_topology.numSockets) { fprintf(stderr, "System has only %d sockets but %d are given on commandline\n", cpuid_topology.numSockets, numSockets); exit(EXIT_FAILURE); } numa_init(); /* consider NUMA node as power unit for the moment */ accessClient_init(&socket_fd); msr_init(socket_fd); timer_init(); /* check for supported processors */ if ((cpuid_info.model == SANDYBRIDGE_EP) || (cpuid_info.model == SANDYBRIDGE) || (cpuid_info.model == IVYBRIDGE) || (cpuid_info.model == IVYBRIDGE_EP) || (cpuid_info.model == HASWELL) || (cpuid_info.model == NEHALEM_BLOOMFIELD) || (cpuid_info.model == NEHALEM_LYNNFIELD) || (cpuid_info.model == NEHALEM_WESTMERE)) { power_init(numa_info.nodes[0].processors[0]); } else { fprintf (stderr, "Query Turbo Mode only supported on Intel Nehalem/Westmere/SandyBridge/IvyBridge/Haswell processors!\n"); exit(EXIT_FAILURE); } double clock = (double) timer_getCpuClock(); printf(HLINE); printf("CPU name:\t%s \n",cpuid_info.name); printf("CPU clock:\t%3.2f GHz \n", (float) clock * 1.E-09); printf(HLINE); if (optInfo) { if (power_info.turbo.numSteps != 0) { printf("Base clock:\t%.2f MHz \n", power_info.baseFrequency ); printf("Minimal clock:\t%.2f MHz \n", power_info.minFrequency ); printf("Turbo Boost Steps:\n"); for (int i=0; i < power_info.turbo.numSteps; i++ ) { printf("C%d %.2f MHz \n",i+1, power_info.turbo.steps[i] ); } } printf(HLINE); } if (cpuid_info.model == SANDYBRIDGE_EP) { hasDRAM = 1; } else if ((cpuid_info.model != SANDYBRIDGE) && (cpuid_info.model != SANDYBRIDGE_EP) && (cpuid_info.model != IVYBRIDGE) && (cpuid_info.model != IVYBRIDGE_EP) && (cpuid_info.model != HASWELL)) { fprintf (stderr, "RAPL not supported on this processor!\n"); exit(EXIT_FAILURE); } if (optInfo) { printf("Thermal Spec Power: %g Watts \n", power_info.tdp ); printf("Minimum Power: %g Watts \n", power_info.minPower); printf("Maximum Power: %g Watts \n", power_info.maxPower); printf("Maximum Time Window: %g micro sec \n", power_info.maxTimeWindow); printf(HLINE); exit(EXIT_SUCCESS); } if (optClock) { affinity_init(); argString = bformat("S%u:0-%u", threadsSockets[0], cpuid_topology.numCoresPerSocket-1); for (int i=1; i<numSockets; i++) { bstring tExpr = bformat("@S%u:0-%u", threadsSockets[i], cpuid_topology.numCoresPerSocket-1); bconcat(argString, tExpr); } numThreads = bstr_to_cpuset(threads, argString); bdestroy(argString); perfmon_init(numThreads, threads, stdout); perfmon_setupEventSet(eventString, NULL); } { PowerData pDataPkg[MAX_NUM_NODES*2]; PowerData pDataDram[MAX_NUM_NODES*2]; printf("Measure on sockets: %d", threadsSockets[0]); for (int i=1; i<numSockets; i++) { printf(", %d", threadsSockets[i]); } printf("\n"); if (optStethoscope) { if (optClock) { perfmon_startCounters(); } else { for (int i=0; i<numSockets; i++) { int cpuId = numa_info.nodes[threadsSockets[i]].processors[0]; if (hasDRAM) power_start(pDataDram+i, cpuId, DRAM); power_start(pDataPkg+i, cpuId, PKG); } } sleep(optStethoscope); if (optClock) { perfmon_stopCounters(); perfmon_printCounterResults(); perfmon_finalize(); } else { for (int i=0; i<numSockets; i++) { int cpuId = numa_info.nodes[threadsSockets[i]].processors[0]; power_stop(pDataPkg+i, cpuId, PKG); if (hasDRAM) power_stop(pDataDram+i, cpuId, DRAM); } } runtime = (double) optStethoscope; } else { TimerData time; argv += optind; bstring exeString = bfromcstr(argv[0]); for (int i=1; i<(argc-optind); i++) { bconchar(exeString, ' '); bcatcstr(exeString, argv[i]); } printf("%s\n",bdata(exeString)); if (optClock) { perfmon_startCounters(); } else { for (int i=0; i<numSockets; i++) { int cpuId = numa_info.nodes[threadsSockets[i]].processors[0]; if (hasDRAM) power_start(pDataDram+i, cpuId, DRAM); power_start(pDataPkg+i, cpuId, PKG); } timer_start(&time); } if (system(bdata(exeString)) == EOF) { fprintf(stderr, "Failed to execute %s!\n", bdata(exeString)); exit(EXIT_FAILURE); } if (optClock) { perfmon_stopCounters(); perfmon_printCounterResults(); perfmon_finalize(); } else { timer_stop(&time); for (int i=0; i<numSockets; i++) { int cpuId = numa_info.nodes[threadsSockets[i]].processors[0]; power_stop(pDataPkg+i, cpuId, PKG); if (hasDRAM) power_stop(pDataDram+i, cpuId, DRAM); } runtime = timer_print(&time); } } if (!optClock) { printf("Runtime: %g second \n",runtime); printf(HLINE); for (int i=0; i<numSockets; i++) { printf("Socket %d\n",threadsSockets[i]); printf("Domain: PKG \n"); printf("Energy consumed: %g Joules \n", power_printEnergy(pDataPkg+i)); printf("Power consumed: %g Watts \n", power_printEnergy(pDataPkg+i) / runtime ); if (hasDRAM) { printf("Domain: DRAM \n"); printf("Energy consumed: %g Joules \n", power_printEnergy(pDataDram+i)); printf("Power consumed: %g Watts \n", power_printEnergy(pDataDram+i) / runtime ); } printf("\n"); } } } #if 0 if ( cpuid_hasFeature(TM2) ) { thermal_init(0); printf("Current core temperatures:\n"); for (uint32_t i = 0; i < cpuid_topology.numCoresPerSocket; i++ ) { printf("Core %d: %u C\n", numa_info.nodes[socketId].processors[i], thermal_read(numa_info.nodes[socketId].processors[i])); } } #endif msr_finalize(); return EXIT_SUCCESS; }
void likwid_markerInit(void) { int cpuId = likwid_getProcessorId(); char* modeStr = getenv("LIKWID_MODE"); char* maskStr = getenv("LIKWID_MASK"); if ((modeStr != NULL) && (maskStr != NULL)) { likwid_init = 1; } else { return; } if (!lock_check()) { fprintf(stderr,"Access to performance counters is locked.\n"); exit(EXIT_FAILURE); } cpuid_init(); numa_init(); affinity_init(); timer_init(); hashTable_init(); for(int i=0; i<MAX_NUM_THREADS; i++) thread_socketFD[i] = -1; for(int i=0; i<MAX_NUM_NODES; i++) socket_lock[i] = LOCK_INIT; accessClient_mode = atoi(modeStr); str2BitMask(maskStr, &counterMask); if (accessClient_mode != DAEMON_AM_DIRECT) { accessClient_init(&thread_socketFD[cpuId]); } msr_init(thread_socketFD[cpuId]); thermal_init(cpuId); switch ( cpuid_info.family ) { case P6_FAMILY: switch ( cpuid_info.model ) { case PENTIUM_M_BANIAS: case PENTIUM_M_DOTHAN: perfmon_counter_map = pm_counter_map; perfmon_numCounters = NUM_COUNTERS_PM; perfmon_numCountersCore = NUM_COUNTERS_CORE_PM; break; case ATOM_45: case ATOM_32: case ATOM_22: case ATOM: perfmon_counter_map = core2_counter_map; perfmon_numCounters = NUM_COUNTERS_CORE2; perfmon_numCountersCore = NUM_COUNTERS_CORE_CORE2; break; case CORE_DUO: ERROR_PLAIN_PRINT(Unsupported Processor); break; case XEON_MP: case CORE2_65: case CORE2_45: perfmon_counter_map = core2_counter_map; perfmon_numCounters = NUM_COUNTERS_CORE2; perfmon_numCountersCore = NUM_COUNTERS_CORE_CORE2; break; case NEHALEM_EX: case WESTMERE_EX: perfmon_counter_map = westmereEX_counter_map; perfmon_numCounters = NUM_COUNTERS_WESTMEREEX; perfmon_numCountersCore = NUM_COUNTERS_CORE_WESTMEREEX; perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_WESTMEREEX; break; case NEHALEM_BLOOMFIELD: case NEHALEM_LYNNFIELD: case NEHALEM_WESTMERE_M: case NEHALEM_WESTMERE: perfmon_counter_map = nehalem_counter_map; perfmon_numCounters = NUM_COUNTERS_NEHALEM; perfmon_numCountersCore = NUM_COUNTERS_CORE_NEHALEM; perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_NEHALEM; break; case IVYBRIDGE: case IVYBRIDGE_EP: { int socket_fd = thread_socketFD[cpuId]; hasPCICounters = 1; power_init(0); /* FIXME Static coreId is dangerous */ pci_init(socket_fd); perfmon_counter_map = ivybridge_counter_map; perfmon_numCounters = NUM_COUNTERS_IVYBRIDGE; perfmon_numCountersCore = NUM_COUNTERS_CORE_IVYBRIDGE; perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_IVYBRIDGE; } break; case HASWELL: case HASWELL_EX: case HASWELL_M1: case HASWELL_M2: power_init(0); /* FIXME Static coreId is dangerous */ perfmon_counter_map = haswell_counter_map; perfmon_numCounters = NUM_COUNTERS_HASWELL; perfmon_numCountersCore = NUM_COUNTERS_CORE_HASWELL; break; case SANDYBRIDGE: case SANDYBRIDGE_EP: { int socket_fd = thread_socketFD[cpuId]; hasPCICounters = 1; power_init(0); /* FIXME Static coreId is dangerous */ pci_init(socket_fd); perfmon_counter_map = sandybridge_counter_map; perfmon_numCounters = NUM_COUNTERS_SANDYBRIDGE; perfmon_numCountersCore = NUM_COUNTERS_CORE_SANDYBRIDGE; perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_SANDYBRIDGE; } break; default: ERROR_PLAIN_PRINT(Unsupported Processor); break; } break; case MIC_FAMILY: switch ( cpuid_info.model ) { case XEON_PHI: perfmon_counter_map = phi_counter_map; perfmon_numCounters = NUM_COUNTERS_PHI; perfmon_numCountersCore = NUM_COUNTERS_CORE_PHI; break; default: ERROR_PLAIN_PRINT(Unsupported Processor); break; } break; case K8_FAMILY: perfmon_counter_map = k10_counter_map; perfmon_numCounters = NUM_COUNTERS_K10; perfmon_numCountersCore = NUM_COUNTERS_CORE_K10; break; case K10_FAMILY: perfmon_counter_map = k10_counter_map; perfmon_numCounters = NUM_COUNTERS_K10; perfmon_numCountersCore = NUM_COUNTERS_CORE_K10; break; case K15_FAMILY: perfmon_counter_map = interlagos_counter_map; perfmon_numCounters = NUM_COUNTERS_INTERLAGOS; perfmon_numCountersCore = NUM_COUNTERS_CORE_INTERLAGOS; break; case K16_FAMILY: perfmon_counter_map = kabini_counter_map; perfmon_numCounters = NUM_COUNTERS_KABINI; perfmon_numCountersCore = NUM_COUNTERS_CORE_KABINI; break; default: ERROR_PLAIN_PRINT(Unsupported Processor); break; } }
void likwid_markerInit(void) { int i; int verbosity; int setinit = 0; bstring bThreadStr; bstring bEventStr; struct bstrList* threadTokens; struct bstrList* eventStrings; char* modeStr = getenv("LIKWID_MODE"); char* eventStr = getenv("LIKWID_EVENTS"); char* cThreadStr = getenv("LIKWID_THREADS"); char* filepath = getenv("LIKWID_FILEPATH"); char* perfpid = getenv("LIKWID_PERF_EXECPID"); char execpid[20]; /* Dirty hack to avoid nonnull warnings */ int (*ownatoi)(const char*); ownatoi = &atoi; if ((modeStr != NULL) && (filepath != NULL) && (eventStr != NULL) && (cThreadStr != NULL) && likwid_init == 0) { setinit = 1; } else if (likwid_init == 0) { fprintf(stderr, "Running without Marker API. Activate Marker API with -m on commandline.\n"); return; } else { return; } if (!lock_check()) { fprintf(stderr,"Access to performance counters is locked.\n"); exit(EXIT_FAILURE); } topology_init(); numa_init(); affinity_init(); hashTable_init(); //#ifndef LIKWID_USE_PERFEVENT HPMmode(atoi(modeStr)); //#endif if (getenv("LIKWID_DEBUG") != NULL) { perfmon_verbosity = atoi(getenv("LIKWID_DEBUG")); verbosity = perfmon_verbosity; } bThreadStr = bfromcstr(cThreadStr); threadTokens = bsplit(bThreadStr,','); num_cpus = threadTokens->qty; for (i=0; i<num_cpus; i++) { threads2Cpu[i] = ownatoi(bdata(threadTokens->entry[i])); } bdestroy(bThreadStr); bstrListDestroy(threadTokens); if (getenv("LIKWID_PIN") != NULL) { likwid_pinThread(threads2Cpu[0]); if (getenv("OMP_NUM_THREADS") != NULL) { if (ownatoi(getenv("OMP_NUM_THREADS")) > num_cpus) { use_locks = 1; } } if (getenv("CILK_NWORKERS") != NULL) { if (ownatoi(getenv("CILK_NWORKERS")) > num_cpus) { use_locks = 1; } } } #ifdef LIKWID_USE_PERFEVENT if (perfpid != NULL) { snprintf(execpid, 19, "%d", getpid()); setenv("LIKWID_PERF_PID", execpid, 1); char* perfflags = getenv("LIKWID_PERF_FLAGS"); if (perfflags) { setenv("LIKWID_PERF_FLAGS", getenv("LIKWID_PERF_FLAGS"), 1); } } #endif i = perfmon_init(num_cpus, threads2Cpu); if (i<0) { //fprintf(stderr,"Failed to initialize LIKWID perfmon library.\n"); return; } bEventStr = bfromcstr(eventStr); eventStrings = bsplit(bEventStr,'|'); numberOfGroups = eventStrings->qty; groups = malloc(numberOfGroups * sizeof(int)); if (!groups) { fprintf(stderr,"Cannot allocate space for group handling.\n"); bstrListDestroy(eventStrings); exit(EXIT_FAILURE); } for (i=0; i<eventStrings->qty; i++) { groups[i] = perfmon_addEventSet(bdata(eventStrings->entry[i])); } bstrListDestroy(eventStrings); bdestroy(bEventStr); for (i=0; i<num_cpus; i++) { hashTable_initThread(threads2Cpu[i]); for(int j=0; j<groupSet->groups[groups[0]].numberOfEvents;j++) { groupSet->groups[groups[0]].events[j].threadCounter[i].init = TRUE; groupSet->groups[groups[0]].state = STATE_START; } } if (setinit) { likwid_init = 1; } threads2Pthread[registered_cpus] = pthread_self(); registered_cpus++; groupSet->activeGroup = 0; perfmon_setupCounters(groupSet->activeGroup); perfmon_startCounters(); }
int main(int argc, char** argv) { uint64_t iter = 100; uint32_t i; uint32_t j; int globalNumberOfThreads = 0; int optPrintDomains = 0; int c; ThreadUserData myData; bstring testcase = bfromcstr("none"); uint64_t numberOfWorkgroups = 0; int tmp = 0; double time; double cycPerUp = 0.0; const TestCase* test = NULL; uint64_t realSize = 0; uint64_t realIter = 0; uint64_t maxCycles = 0; uint64_t minCycles = UINT64_MAX; uint64_t cyclesClock = 0; uint64_t demandIter = 0; TimerData itertime; Workgroup* currentWorkgroup = NULL; Workgroup* groups = NULL; uint32_t min_runtime = 1; /* 1s */ bstring HLINE = bfromcstr(""); binsertch(HLINE, 0, 80, '-'); binsertch(HLINE, 80, 1, '\n'); int (*ownprintf)(const char *format, ...); ownprintf = &printf; /* Handling of command line options */ if (argc == 1) { HELP_MSG; exit(EXIT_SUCCESS); } while ((c = getopt (argc, argv, "w:t:s:l:aphvi:")) != -1) { switch (c) { case 'h': HELP_MSG; exit (EXIT_SUCCESS); case 'v': VERSION_MSG; exit (EXIT_SUCCESS); case 'a': ownprintf(TESTS"\n"); exit (EXIT_SUCCESS); case 'w': numberOfWorkgroups++; break; case 's': min_runtime = atoi(optarg); break; case 'i': demandIter = strtoul(optarg, NULL, 10); if (demandIter <= 0) { fprintf (stderr, "Error: Iterations must be greater than 0\n"); return EXIT_FAILURE; } break; case 'l': bdestroy(testcase); testcase = bfromcstr(optarg); for (i=0; i<NUMKERNELS; i++) { if (biseqcstr(testcase, kernels[i].name)) { test = kernels+i; break; } } if (test == NULL) { fprintf (stderr, "Error: Unknown test case %s\n",optarg); return EXIT_FAILURE; } else { ownprintf("Name: %s\n",test->name); ownprintf("Number of streams: %d\n",test->streams); ownprintf("Loop stride: %d\n",test->stride); ownprintf("Flops: %d\n",test->flops); ownprintf("Bytes: %d\n",test->bytes); switch (test->type) { case INT: ownprintf("Data Type: Integer\n"); break; case SINGLE: ownprintf("Data Type: Single precision float\n"); break; case DOUBLE: ownprintf("Data Type: Double precision float\n"); break; } if (test->loads >= 0) { ownprintf("Load Ops: %d\n",test->loads); } if (test->stores >= 0) { ownprintf("Store Ops: %d\n",test->stores); } if (test->branches >= 0) { ownprintf("Branches: %d\n",test->branches); } if (test->instr_const >= 0) { ownprintf("Constant instructions: %d\n",test->instr_const); } if (test->instr_loop >= 0) { ownprintf("Loop instructions: %d\n",test->instr_loop); } } bdestroy(testcase); exit (EXIT_SUCCESS); break; case 'p': optPrintDomains = 1; break; case 'g': numberOfWorkgroups = LLU_CAST atol(optarg); tmp = numberOfWorkgroups; break; case 't': bdestroy(testcase); testcase = bfromcstr(optarg); for (i=0; i<NUMKERNELS; i++) { if (biseqcstr(testcase, kernels[i].name)) { test = kernels+i; break; } } if (test == NULL) { fprintf (stderr, "Error: Unknown test case %s\n",optarg); return EXIT_FAILURE; } bdestroy(testcase); break; case '?': if (isprint (optopt)) fprintf (stderr, "Unknown option `-%c'.\n", optopt); else fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); return EXIT_FAILURE; default: HELP_MSG; } } if ((numberOfWorkgroups == 0) && (!optPrintDomains)) { fprintf(stderr, "Error: At least one workgroup (-w) must be set on commandline\n"); exit (EXIT_FAILURE); } if (topology_init() != EXIT_SUCCESS) { fprintf(stderr, "Error: Unsupported processor!\n"); exit(EXIT_FAILURE); } if ((test == NULL) && (!optPrintDomains)) { fprintf(stderr, "Unknown test case. Please check likwid-bench -a for available tests\n"); fprintf(stderr, "and select one using the -t commandline option\n"); exit(EXIT_FAILURE); } numa_init(); affinity_init(); timer_init(); if (optPrintDomains) { bdestroy(testcase); AffinityDomains_t affinity = get_affinityDomains(); ownprintf("Number of Domains %d\n",affinity->numberOfAffinityDomains); for (i=0; i < affinity->numberOfAffinityDomains; i++ ) { ownprintf("Domain %d:\n",i); ownprintf("\tTag %s:",bdata(affinity->domains[i].tag)); for ( uint32_t j=0; j < affinity->domains[i].numberOfProcessors; j++ ) { ownprintf(" %d",affinity->domains[i].processorList[j]); } ownprintf("\n"); } exit (EXIT_SUCCESS); } allocator_init(numberOfWorkgroups * MAX_STREAMS); groups = (Workgroup*) malloc(numberOfWorkgroups*sizeof(Workgroup)); tmp = 0; optind = 0; while ((c = getopt (argc, argv, "w:t:s:l:i:aphv")) != -1) { switch (c) { case 'w': currentWorkgroup = groups+tmp; bstring groupstr = bfromcstr(optarg); i = bstr_to_workgroup(currentWorkgroup, groupstr, test->type, test->streams); bdestroy(groupstr); if (i == 0) { for (i=0; i< test->streams; i++) { if (currentWorkgroup->streams[i].offset%test->stride) { fprintf (stderr, "Error: Stream %d: offset is not a multiple of stride!\n",i); return EXIT_FAILURE; } allocator_allocateVector(&(currentWorkgroup->streams[i].ptr), PAGE_ALIGNMENT, currentWorkgroup->size, currentWorkgroup->streams[i].offset, test->type, currentWorkgroup->streams[i].domain); } tmp++; } else { exit(EXIT_FAILURE); } break; default: continue; break; } } /* :WARNING:05/04/2010 08:58:05 AM:jt: At the moment the thread * module only allows equally sized thread groups*/ for (i=0; i<numberOfWorkgroups; i++) { globalNumberOfThreads += groups[i].numberOfThreads; } ownprintf(bdata(HLINE)); ownprintf("LIKWID MICRO BENCHMARK\n"); ownprintf("Test: %s\n",test->name); ownprintf(bdata(HLINE)); ownprintf("Using %" PRIu64 " work groups\n",numberOfWorkgroups); ownprintf("Using %d threads\n",globalNumberOfThreads); ownprintf(bdata(HLINE)); threads_init(globalNumberOfThreads); threads_createGroups(numberOfWorkgroups); /* we configure global barriers only */ barrier_init(1); barrier_registerGroup(globalNumberOfThreads); cyclesClock = timer_getCycleClock(); #ifdef LIKWID_PERFMON if (getenv("LIKWID_FILEPATH") != NULL) { ownprintf("Using Likwid Marker API\n"); } LIKWID_MARKER_INIT; ownprintf(bdata(HLINE)); #endif /* initialize data structures for threads */ for (i=0; i<numberOfWorkgroups; i++) { myData.iter = iter; if (demandIter > 0) { myData.iter = demandIter; } myData.min_runtime = min_runtime; myData.size = groups[i].size; myData.test = test; myData.cycles = 0; myData.numberOfThreads = groups[i].numberOfThreads; myData.processors = (int*) malloc(myData.numberOfThreads * sizeof(int)); myData.streams = (void**) malloc(test->streams * sizeof(void*)); for (j=0; j<groups[i].numberOfThreads; j++) { myData.processors[j] = groups[i].processorIds[j]; } for (j=0; j< test->streams; j++) { myData.streams[j] = groups[i].streams[j].ptr; } threads_registerDataGroup(i, &myData, copyThreadData); free(myData.processors); free(myData.streams); } if (demandIter == 0) { getIterSingle((void*) &threads_data[0]); for (i=0; i<numberOfWorkgroups; i++) { iter = threads_updateIterations(i, demandIter); } } #ifdef DEBUG_LIKWID else { ownprintf("Using manually selected iterations per thread\n"); } #endif threads_create(runTest); threads_join(); for (int i=0; i<globalNumberOfThreads; i++) { realSize += threads_data[i].data.size; realIter += threads_data[i].data.iter; if (threads_data[i].cycles > maxCycles) { maxCycles = threads_data[i].cycles; } if (threads_data[i].cycles < minCycles) { minCycles = threads_data[i].cycles; } } time = (double) maxCycles / (double) cyclesClock; ownprintf(bdata(HLINE)); ownprintf("Cycles:\t\t\t%" PRIu64 "\n", maxCycles); ownprintf("CPU Clock:\t\t%" PRIu64 "\n", timer_getCpuClock()); ownprintf("Cycle Clock:\t\t%" PRIu64 "\n", cyclesClock); ownprintf("Time:\t\t\t%e sec\n", time); ownprintf("Iterations:\t\t%" PRIu64 "\n", realIter); ownprintf("Iterations per thread:\t%" PRIu64 "\n",threads_data[0].data.iter); ownprintf("Inner loop executions:\t%.0f\n", ((double)realSize)/((double)test->stride)); ownprintf("Size:\t\t\t%" PRIu64 "\n", realSize*test->bytes ); ownprintf("Size per thread:\t%" PRIu64 "\n", threads_data[0].data.size*test->bytes); ownprintf("Number of Flops:\t%" PRIu64 "\n", (threads_data[0].data.iter * realSize * test->flops)); ownprintf("MFlops/s:\t\t%.2f\n", 1.0E-06 * ((double) threads_data[0].data.iter * realSize * test->flops/ time)); ownprintf("Data volume (Byte):\t%llu\n", LLU_CAST (threads_data[0].data.iter * realSize * test->bytes)); ownprintf("MByte/s:\t\t%.2f\n", 1.0E-06 * ( (double) threads_data[0].data.iter * realSize * test->bytes/ time)); cycPerUp = ((double) maxCycles / (double) (threads_data[0].data.iter * realSize)); ownprintf("Cycles per update:\t%f\n", cycPerUp); switch ( test->type ) { case INT: case SINGLE: ownprintf("Cycles per cacheline:\t%f\n", (16.0 * cycPerUp)); break; case DOUBLE: ownprintf("Cycles per cacheline:\t%f\n", (8.0 * cycPerUp)); break; } ownprintf("Loads per update:\t%ld\n", test->loads ); ownprintf("Stores per update:\t%ld\n", test->stores ); if ((test->loads > 0) && (test->stores > 0)) { ownprintf("Load/store ratio:\t%.2f\n", ((double)test->loads)/((double)test->stores) ); } if ((test->instr_loop > 0) && (test->instr_const > 0)) { ownprintf("Instructions:\t\t%" PRIu64 "\n", LLU_CAST ((double)realSize/test->stride)*test->instr_loop*threads_data[0].data.iter + test->instr_const ); } if (test->uops > 0) { ownprintf("UOPs:\t\t\t%" PRIu64 "\n", LLU_CAST ((double)realSize/test->stride)*test->uops*threads_data[0].data.iter); } ownprintf(bdata(HLINE)); threads_destroy(numberOfWorkgroups, test->streams); allocator_finalize(); workgroups_destroy(&groups, numberOfWorkgroups, test->streams); #ifdef LIKWID_PERFMON if (getenv("LIKWID_FILEPATH") != NULL) { ownprintf("Writing Likwid Marker API results to file %s\n", getenv("LIKWID_FILEPATH")); } LIKWID_MARKER_CLOSE; #endif bdestroy(HLINE); return EXIT_SUCCESS; }
int main(int argc, char** argv) { int iter = 100; uint32_t i; uint32_t j; int globalNumberOfThreads = 0; int optPrintDomains = 0; int c; ThreadUserData myData; bstring testcase = bfromcstr("none"); uint32_t numberOfWorkgroups = 0; int tmp = 0; double time; const TestCase* test = NULL; Workgroup* currentWorkgroup = NULL; Workgroup* groups = NULL; cpuid_init(); numa_init(); affinity_init(); /* Handling of command line options */ if (argc == 1) { HELP_MSG; } while ((c = getopt (argc, argv, "g:w:t:i:l:aphv")) != -1) { switch (c) { case 'h': HELP_MSG; exit (EXIT_SUCCESS); case 'v': VERSION_MSG; exit (EXIT_SUCCESS); case 'a': printf(TESTS"\n"); exit (EXIT_SUCCESS); case 'w': tmp--; if (tmp == -1) { fprintf (stderr, "More workgroups configured than allocated!\n"); return EXIT_FAILURE; } if (!test) { fprintf (stderr, "You need to specify a test case first!\n"); return EXIT_FAILURE; } testcase = bfromcstr(optarg); currentWorkgroup = groups+tmp; /*FIXME*/ bstr_to_workgroup(currentWorkgroup, testcase, test->type, test->streams); bdestroy(testcase); for (i=0; i< test->streams; i++) { if (currentWorkgroup->streams[i].offset%test->stride) { fprintf (stderr, "Stream %d: offset is not a multiple of stride!\n",i); return EXIT_FAILURE; } allocator_allocateVector(&(currentWorkgroup->streams[i].ptr), PAGE_ALIGNMENT, currentWorkgroup->size, currentWorkgroup->streams[i].offset, test->type, currentWorkgroup->streams[i].domain); } break; case 'i': iter = atoi(optarg); break; case 'l': testcase = bfromcstr(optarg); for (i=0; i<NUMKERNELS; i++) { if (biseqcstr(testcase, kernels[i].name)) { test = kernels+i; break; } } if (biseqcstr(testcase,"none")) { fprintf (stderr, "Unknown test case %s\n",optarg); return EXIT_FAILURE; } else { printf("Name: %s\n",test->name); printf("Number of streams: %d\n",test->streams); printf("Loop stride: %d\n",test->stride); printf("Flops: %d\n",test->flops); printf("Bytes: %d\n",test->bytes); switch (test->type) { case SINGLE: printf("Data Type: Single precision float\n"); break; case DOUBLE: printf("Data Type: Double precision float\n"); break; } } bdestroy(testcase); exit (EXIT_SUCCESS); break; case 'p': optPrintDomains = 1; break; case 'g': numberOfWorkgroups = atoi(optarg); allocator_init(numberOfWorkgroups * MAX_STREAMS); tmp = numberOfWorkgroups; groups = (Workgroup*) malloc(numberOfWorkgroups*sizeof(Workgroup)); break; case 't': testcase = bfromcstr(optarg); for (i=0; i<NUMKERNELS; i++) { if (biseqcstr(testcase, kernels[i].name)) { test = kernels+i; break; } } if (biseqcstr(testcase,"none")) { fprintf (stderr, "Unknown test case %s\n",optarg); return EXIT_FAILURE; } bdestroy(testcase); break; case '?': if (isprint (optopt)) fprintf (stderr, "Unknown option `-%c'.\n", optopt); else fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); return EXIT_FAILURE; default: HELP_MSG; } } if (optPrintDomains) { affinity_printDomains(); exit (EXIT_SUCCESS); } timer_init(); /* :WARNING:05/04/2010 08:58:05 AM:jt: At the moment the thread * module only allows equally sized thread groups*/ for (i=0; i<numberOfWorkgroups; i++) { globalNumberOfThreads += groups[i].numberOfThreads; } threads_init(globalNumberOfThreads); threads_createGroups(numberOfWorkgroups); /* we configure global barriers only */ barrier_init(1); barrier_registerGroup(globalNumberOfThreads); #ifdef PERFMON printf("Using likwid\n"); likwid_markerInit(); #endif /* initialize data structures for threads */ for (i=0; i<numberOfWorkgroups; i++) { myData.iter = iter; myData.size = groups[i].size; myData.test = test; myData.numberOfThreads = groups[i].numberOfThreads; myData.processors = (int*) malloc(myData.numberOfThreads * sizeof(int)); myData.streams = (void**) malloc(test->streams * sizeof(void*)); for (j=0; j<groups[i].numberOfThreads; j++) { myData.processors[j] = groups[i].processorIds[j]; } for (j=0; j< test->streams; j++) { myData.streams[j] = groups[i].streams[j].ptr; } threads_registerDataGroup(i, &myData, copyThreadData); free(myData.processors); free(myData.streams); } printf(HLINE); printf("LIKWID MICRO BENCHMARK\n"); printf("Test: %s\n",test->name); printf(HLINE); printf("Using %d work groups\n",numberOfWorkgroups); printf("Using %d threads\n",globalNumberOfThreads); printf(HLINE); threads_create(runTest); threads_destroy(); allocator_finalize(); time = (double) threads_data[0].cycles / (double) timer_getCpuClock(); printf("Cycles: %llu \n", LLU_CAST threads_data[0].cycles); printf("Iterations: %llu \n", LLU_CAST iter); printf("Size: %d \n", currentWorkgroup->size ); printf("Vectorlength: %d \n", threads_data[0].data.size); printf("Time: %e sec\n", time); printf("MFlops/s:\t%.2f\n", 1.0E-06 * ((double) numberOfWorkgroups * iter * currentWorkgroup->size * test->flops/ time)); printf("MByte/s:\t%.2f\n", 1.0E-06 * ( (double) numberOfWorkgroups * iter * currentWorkgroup->size * test->bytes/ time)); printf("Cycles per update:\t%f\n", ((double) threads_data[0].cycles / (double) (iter * threads_data[0].data.size))); switch ( test->type ) { case SINGLE: printf("Cycles per cacheline:\t%f\n", (16.0 * (double) threads_data[0].cycles / (double) (iter * threads_data[0].data.size))); break; case DOUBLE: printf("Cycles per cacheline:\t%f\n", (8.0 * (double) threads_data[0].cycles / (double) (iter * threads_data[0].data.size))); break; } printf(HLINE); #ifdef PERFMON likwid_markerClose(); #endif return EXIT_SUCCESS; }
static int cpustr_to_cpulist_physical(bstring bcpustr, int* cpulist, int length) { topology_init(); CpuTopology_t cpuid_topology = get_cpuTopology(); affinity_init(); AffinityDomains_t affinity = get_affinityDomains(); bstring bdomain; bstring blist; int domainidx = -1; if (bstrchrp(bcpustr, ':', 0) != BSTR_ERR) { struct bstrList* strlist = bstrListCreate(); strlist = bsplit(bcpustr, ':'); bdomain = bstrcpy(strlist->entry[0]); blist = bstrcpy(strlist->entry[1]); bstrListDestroy(strlist); } else { bdomain = bformat("N"); blist = bstrcpy(bcpustr); } for (int i=0; i<affinity->numberOfAffinityDomains; i++) { if (bstrcmp(bdomain, affinity->domains[i].tag) == 0) { domainidx = i; break; } } if (domainidx < 0) { fprintf(stderr, "Cannot find domain %s\n", bdata(bdomain)); bdestroy(bdomain); bdestroy(blist); return 0; } struct bstrList* strlist = bstrListCreate(); strlist = bsplit(blist, ','); int insert = 0; for (int i=0;i< strlist->qty; i++) { if (bstrchrp(strlist->entry[i], '-', 0) != BSTR_ERR) { struct bstrList* indexlist = bstrListCreate(); indexlist = bsplit(strlist->entry[i], '-'); if (atoi(bdata(indexlist->entry[0])) <= atoi(bdata(indexlist->entry[1]))) { for (int j=atoi(bdata(indexlist->entry[0])); j<=atoi(bdata(indexlist->entry[1]));j++) { if (cpu_in_domain(domainidx, j)) { cpulist[insert] = j; insert++; if (insert == length) { bstrListDestroy(indexlist); goto physical_done; } } else { fprintf(stderr, "CPU %d not in domain %s\n", j, bdata(affinity->domains[domainidx].tag)); } } } else { for (int j=atoi(bdata(indexlist->entry[0])); j>=atoi(bdata(indexlist->entry[1]));j--) { if (cpu_in_domain(domainidx, j)) { cpulist[insert] = j; insert++; if (insert == length) { bstrListDestroy(indexlist); goto physical_done; } } else { fprintf(stderr, "CPU %d not in domain %s\n", j, bdata(affinity->domains[domainidx].tag)); } } } bstrListDestroy(indexlist); } else { int cpu = atoi(bdata(strlist->entry[i])); if (cpu_in_domain(domainidx, cpu)) { cpulist[insert] = cpu; insert++; if (insert == length) { goto physical_done; } } else { fprintf(stderr, "CPU %d not in domain %s\n", cpu, bdata(affinity->domains[domainidx].tag)); } } } physical_done: bstrListDestroy(strlist); bdestroy(bdomain); bdestroy(blist); return insert; }
static int cpustr_to_cpulist_logical(bstring bcpustr, int* cpulist, int length) { topology_init(); CpuTopology_t cpuid_topology = get_cpuTopology(); affinity_init(); AffinityDomains_t affinity = get_affinityDomains(); int domainidx = -1; bstring bdomain; bstring blist; if (bstrchrp(bcpustr, 'L', 0) != 0) { fprintf(stderr, "Not a valid CPU expression\n"); return 0; } struct bstrList* strlist = bstrListCreate(); strlist = bsplit(bcpustr, ':'); if (strlist->qty != 3) { fprintf(stderr, "ERROR: Invalid expression, should look like L:<domain>:<indexlist> or be in a cpuset\n"); bstrListDestroy(strlist); return 0; } bdomain = bstrcpy(strlist->entry[1]); blist = bstrcpy(strlist->entry[2]); bstrListDestroy(strlist); for (int i=0; i<affinity->numberOfAffinityDomains; i++) { if (bstrcmp(bdomain, affinity->domains[i].tag) == 0) { domainidx = i; break; } } if (domainidx < 0) { fprintf(stderr, "Cannot find domain %s\n", bdata(bdomain)); return 0; } int *inlist = malloc(affinity->domains[domainidx].numberOfProcessors * sizeof(int)); if (inlist == NULL) { return -ENOMEM; } int ret = cpulist_sort(affinity->domains[domainidx].processorList, inlist, affinity->domains[domainidx].numberOfProcessors); strlist = bstrListCreate(); strlist = bsplit(blist, ','); int insert = 0; for (int i=0; i< strlist->qty; i++) { if (bstrchrp(strlist->entry[i], '-', 0) != BSTR_ERR) { struct bstrList* indexlist = bstrListCreate(); indexlist = bsplit(strlist->entry[i], '-'); if (atoi(bdata(indexlist->entry[0])) <= atoi(bdata(indexlist->entry[1]))) { for (int j=atoi(bdata(indexlist->entry[0])); j<=atoi(bdata(indexlist->entry[1]));j++) { cpulist[insert] = inlist[j]; insert++; if (insert == length) { bstrListDestroy(indexlist); goto logical_done; } } } else { for (int j=atoi(bdata(indexlist->entry[0])); j>=atoi(bdata(indexlist->entry[1]));j--) { cpulist[insert] = inlist[j]; insert++; if (insert == length) { bstrListDestroy(indexlist); goto logical_done; } } } bstrListDestroy(indexlist); } else { cpulist[insert] = inlist[atoi(bdata(strlist->entry[i]))]; insert++; if (insert == length) { goto logical_done; } } } logical_done: free(inlist); bstrListDestroy(strlist); return insert; }
static int cpustr_to_cpulist_expression(bstring bcpustr, int* cpulist, int length) { topology_init(); CpuTopology_t cpuid_topology = get_cpuTopology(); affinity_init(); AffinityDomains_t affinity = get_affinityDomains(); bstring bdomain; int domainidx = -1; int count = 0; int stride = 0; int chunk = 0; if (bstrchrp(bcpustr, 'E', 0) != 0) { fprintf(stderr, "Not a valid CPU expression\n"); return 0; } struct bstrList* strlist = bstrListCreate(); strlist = bsplit(bcpustr, ':'); if (strlist->qty == 3) { bdomain = bstrcpy(strlist->entry[1]); count = atoi(bdata(strlist->entry[2])); stride = 1; chunk = 1; } else if (strlist->qty == 5) { bdomain = bstrcpy(strlist->entry[1]); count = atoi(bdata(strlist->entry[2])); chunk = atoi(bdata(strlist->entry[3])); stride = atoi(bdata(strlist->entry[4])); } for (int i=0; i<affinity->numberOfAffinityDomains; i++) { if (bstrcmp(bdomain, affinity->domains[i].tag) == 0) { domainidx = i; break; } } if (domainidx < 0) { fprintf(stderr, "Cannot find domain %s\n", bdata(bdomain)); bstrListDestroy(strlist); return 0; } int offset = 0; int insert = 0; for (int i=0;i<count;i++) { for (int j=0;j<chunk && offset+j<affinity->domains[domainidx].numberOfProcessors;j++) { cpulist[insert] = affinity->domains[domainidx].processorList[offset + j]; insert++; if (insert == length) goto expression_done; } offset += stride; if (offset >= affinity->domains[domainidx].numberOfProcessors) { offset = 0; } if (insert >= count) goto expression_done; } bstrListDestroy(strlist); return 0; expression_done: bstrListDestroy(strlist); return insert; }
int main(int argn, char** argc) { int err, i ,j; int numCPUs = 0; int gid; DATATYPE *a,*b,*c,*d; TimeData timer; double triad_time, copy_time, scale_time, stream_time; char estr[1024]; double result, scalar = 3.0; char* ptr; if (argn != 3) { printf("Usage: %s <cpustr> <events>\n", argc[0]); return 1; } strcpy(estr, argc[2]); allocate_vector(&a, SIZE); allocate_vector(&b, SIZE); allocate_vector(&c, SIZE); allocate_vector(&d, SIZE); err = topology_init(); if (err < 0) { printf("Failed to initialize LIKWID's topology module\n"); return 1; } CpuTopology_t topo = get_cpuTopology(); affinity_init(); int* cpus = (int*)malloc(topo->numHWThreads * sizeof(int)); if (!cpus) return 1; numCPUs = cpustr_to_cpulist(argc[1], cpus, topo->numHWThreads); omp_set_num_threads(numCPUs); err = perfmon_init(numCPUs, cpus); if (err < 0) { printf("Failed to initialize LIKWID's performance monitoring module\n"); affinity_finalize(); topology_finalize(); return 1; } gid = perfmon_addEventSet(estr); if (gid < 0) { printf("Failed to add event string %s to LIKWID's performance monitoring module\n", estr); perfmon_finalize(); affinity_finalize(); topology_finalize(); return 1; } err = perfmon_setupCounters(gid); if (err < 0) { printf("Failed to setup group %d in LIKWID's performance monitoring module\n", gid); perfmon_finalize(); affinity_finalize(); topology_finalize(); return 1; } #ifdef _OPENMP printf(HLINE); #pragma omp parallel { #pragma omp master { printf ("Number of Threads requested = %i\n",omp_get_num_threads()); } likwid_pinThread(cpus[omp_get_thread_num()]); printf ("Thread %d running on processor %d ....\n",omp_get_thread_num(),sched_getcpu()); } #endif #pragma omp parallel for for (int j=0; j<SIZE; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; d[j] = 1.0; } err = perfmon_startCounters(); if (err < 0) { printf("Failed to start counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } time_start(&timer); #pragma omp parallel { for (int k=0; k<ITER; k++) { LIKWID_MARKER_START("copy"); #pragma omp for for (int j=0; j<SIZE; j++) { c[j] = a[j]; } LIKWID_MARKER_STOP("copy"); } } time_stop(&timer); err = perfmon_stopCounters(); copy_time = time_print(&timer)/(double)ITER; if (err < 0) { printf("Failed to stop counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } printf("Processed %.1f Mbyte at copy benchmark in %.4f seconds: %.2f MByte/s\n", 1E-6*(2*SIZE*sizeof(DATATYPE)), copy_time, 1E-6*((2*SIZE*sizeof(DATATYPE))/copy_time)); ptr = strtok(estr,","); j = 0; while (ptr != NULL) { for (i = 0;i < numCPUs; i++) { result = perfmon_getResult(gid, j, cpus[i]); printf("Measurement result for event set %s at CPU %d: %f\n", ptr, cpus[i], result); } ptr = strtok(NULL,","); j++; } strcpy(estr, argc[2]); perfmon_setupCounters(gid); err = perfmon_startCounters(); if (err < 0) { printf("Failed to start counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } time_start(&timer); #pragma omp parallel { for (int k=0; k<ITER; k++) { LIKWID_MARKER_START("scale"); #pragma omp for for (int j=0; j<SIZE; j++) { b[j] = scalar*c[j]; } LIKWID_MARKER_STOP("scale"); } } time_stop(&timer); err = perfmon_stopCounters(); scale_time = time_print(&timer)/(double)ITER; if (err < 0) { printf("Failed to stop counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } printf("Processed %.1f Mbyte at scale benchmark in %.4f seconds: %.2f MByte/s\n", 1E-6*(2*SIZE*sizeof(DATATYPE)), copy_time, 1E-6*((2*SIZE*sizeof(DATATYPE))/copy_time)); ptr = strtok(estr,","); j = 0; while (ptr != NULL) { for (i = 0;i < numCPUs; i++) { result = perfmon_getResult(gid, j, cpus[i]); printf("Measurement result for event set %s at CPU %d: %f\n", ptr, cpus[i], result); } ptr = strtok(NULL,","); j++; } strcpy(estr, argc[2]); perfmon_setupCounters(gid); err = perfmon_startCounters(); if (err < 0) { printf("Failed to start counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } time_start(&timer); #pragma omp parallel { for (int k=0; k<ITER; k++) { LIKWID_MARKER_START("stream"); #pragma omp for for (int j=0; j<SIZE; j++) { c[j] = a[j] + b[j]; } LIKWID_MARKER_STOP("stream"); } } time_stop(&timer); err = perfmon_stopCounters(); stream_time = time_print(&timer)/(double)ITER; if (err < 0) { printf("Failed to stop counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } printf("Processed %.1f Mbyte at stream benchmark in %.4f seconds: %.2f MByte/s\n", 1E-6*(2*SIZE*sizeof(DATATYPE)), copy_time, 1E-6*((2*SIZE*sizeof(DATATYPE))/copy_time)); ptr = strtok(estr,","); j = 0; while (ptr != NULL) { for (i = 0;i < numCPUs; i++) { result = perfmon_getResult(gid, j, cpus[i]); printf("Measurement result for event set %s at CPU %d: %f\n", ptr, cpus[i], result); } ptr = strtok(NULL,","); j++; } strcpy(estr, argc[2]); perfmon_setupCounters(gid); err = perfmon_startCounters(); if (err < 0) { printf("Failed to start counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } time_start(&timer); #pragma omp parallel { for (int k=0; k<ITER; k++) { LIKWID_MARKER_START("triad"); #pragma omp for for (int j=0; j<SIZE; j++) { a[j] = b[j] + c[j] * scalar; } LIKWID_MARKER_STOP("triad"); } } time_stop(&timer); err = perfmon_stopCounters(); triad_time = time_print(&timer)/(double)ITER; if (err < 0) { printf("Failed to stop counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } printf("Processed %.1f Mbyte at triad benchmark in %.4f seconds: %.2f MByte/s\n", 1E-6*(4*SIZE*sizeof(DATATYPE)), triad_time, 1E-6*((4*SIZE*sizeof(DATATYPE))/triad_time)); ptr = strtok(estr,","); j = 0; while (ptr != NULL) { for (i = 0;i < numCPUs; i++) { result = perfmon_getResult(gid, j, cpus[i]); printf("Measurement result for event set %s at CPU %d: %f\n", ptr, cpus[i], result); } ptr = strtok(NULL,","); j++; } perfmon_finalize(); affinity_finalize(); topology_finalize(); return 0; }
int main(int argc, char* argv[]) { int i, j; int err; int* cpus; int gid; double result = 0.0; char estr[] = "L2_LINES_IN_ALL:PMC0,L2_TRANS_L2_WB:PMC1"; //perfmon_setVerbosity(3); // Load the topology module and print some values. err = topology_init(); if (err < 0) { printf("Failed to initialize LIKWID's topology module\n"); return 1; } // CpuInfo_t contains global information like name, CPU family, ... CpuInfo_t info = get_cpuInfo(); // CpuTopology_t contains information about the topology of the CPUs. CpuTopology_t topo = get_cpuTopology(); // Create affinity domains. Commonly only needed when reading Uncore counters affinity_init(); printf("Likwid example on a %s with %d CPUs\n", info->name, topo->numHWThreads); cpus = (int*)malloc(topo->numHWThreads * sizeof(int)); if (!cpus) return 1; for (i=0;i<topo->numHWThreads;i++) { cpus[i] = topo->threadPool[i].apicId; } // Must be called before perfmon_init() but only if you want to use another // access mode as the pre-configured one. For direct access (0) you have to // be root. //accessClient_setaccessmode(0); // Initialize the perfmon module. err = perfmon_init(topo->numHWThreads, cpus); if (err < 0) { printf("Failed to initialize LIKWID's performance monitoring module\n"); topology_finalize(); return 1; } // Add eventset string to the perfmon module. gid = perfmon_addEventSet(estr); if (gid < 0) { printf("Failed to add event string %s to LIKWID's performance monitoring module\n", estr); perfmon_finalize(); topology_finalize(); return 1; } // Setup the eventset identified by group ID (gid). err = perfmon_setupCounters(gid); if (err < 0) { printf("Failed to setup group %d in LIKWID's performance monitoring module\n", gid); perfmon_finalize(); topology_finalize(); return 1; } // Start all counters in the previously set up event set. err = perfmon_startCounters(); if (err < 0) { printf("Failed to start counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } // Perform something sleep(10); // Stop all counters in the previously started event set. err = perfmon_stopCounters(); if (err < 0) { printf("Failed to stop counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } // Print the result of every thread/CPU for all events in estr. char* ptr = strtok(estr,","); j = 0; while (ptr != NULL) { for (i = 0;i < topo->numHWThreads; i++) { result = perfmon_getResult(gid, j, i); printf("Measurement result for event set %s at CPU %d: %f\n", ptr, cpus[i], result); } ptr = strtok(NULL,","); j++; } free(cpus); // Uninitialize the perfmon module. perfmon_finalize(); affinity_finalize(); // Uninitialize the topology module. topology_finalize(); return 0; }