示例#1
0
/*
 * Query timer_getCpuClock() without calling timer_init() in this function.
 *
 * Returns 1 when the reported clock is 0, otherwise 0.
 */
int test_timercpuclock_noinit()
{
    const uint64_t reported = timer_getCpuClock();

    return (reported == 0) ? 1 : 0;
}
示例#2
0
/*
 * Lua binding that pushes the value of timer_getCpuClock() as a number.
 *
 * If the timer_isInitialized flag is still 0, timer_init() is called first
 * and the flag is set to 1.
 *
 * Returns 1, the count of values pushed for the Lua caller.
 */
static int lua_likwid_getCpuClock(lua_State* L)
{
    if (!timer_isInitialized)
    {
        timer_init();
        timer_isInitialized = 1;
    }

    lua_pushnumber(L, timer_getCpuClock());

    return 1;
}
示例#3
0
/*
 * Check that timer_getCpuClock() reports a non-zero value after
 * timer_init().
 *
 * Returns 1 on success and 0 on failure. timer_finalize() is called on
 * both paths.
 */
int test_timercpuclock()
{
    timer_init();
    uint64_t cyc = timer_getCpuClock();
    if (cyc == 0)
        goto fail; /* go through the cleanup path instead of returning directly */
    timer_finalize();
    return 1;
fail:
    timer_finalize();
    return 0;
}
示例#4
0
/*
 * Initialize the timer module and check that timer_getCpuClock() reports
 * a non-zero value.
 *
 * timer_finalize() is called before returning in either case.
 * Returns 1 when the clock is non-zero, otherwise 0.
 */
int test_timerinit()
{
    int passed;

    timer_init();
    uint64_t reported = timer_getCpuClock();
    passed = (reported != 0) ? 1 : 0;
    timer_finalize();

    return passed;
}
示例#5
0
/*
 * Run a reset/start/stop sequence on a TimerData and sanity-check the
 * values reported by timer_print() and timer_printCycles().
 *
 * The test passes (returns 1) only if
 *   - the reported time is neither greater than 1 nor equal to 0, and
 *   - the reported cycle count is non-zero and not greater than
 *     timer_getCpuClock().
 * Otherwise 0 is returned. timer_finalize() is called on every path.
 */
int test_timerprint_stop()
{
    TimerData timer;
    int passed = 0;

    timer_init();
    timer_reset(&timer);
    timer_start(&timer);
    timer_stop(&timer);

    double elapsed = timer_print(&timer);

    /* The cycle count is only queried once the time check has passed. */
    if (!(elapsed > 1) && !(elapsed == 0))
    {
        uint64_t measured = timer_printCycles(&timer);

        if (measured != 0 && !(measured > timer_getCpuClock()))
        {
            passed = 1;
        }
    }

    timer_finalize();
    return passed;
}
示例#6
0
/*
 * Entry point of the micro-benchmark driver.
 *
 * Command line options handled below:
 *   -h         print the help message and exit
 *   -v         print the version message and exit
 *   -a         print the TESTS list and exit
 *   -p         print the affinity domains and exit
 *   -l <test>  print the properties of one kernel and exit
 *   -t <test>  select the kernel to run
 *   -g <n>     allocate room for n workgroups
 *   -w <spec>  configure one workgroup (needs a preceding -t and -g)
 *   -i <n>     number of iterations (default 100)
 *
 * After option parsing the threads are set up, the selected kernel is run
 * through runTest and the results of thread 0 are printed.
 *
 * Returns EXIT_SUCCESS after a completed run and EXIT_FAILURE when the
 * command line is invalid.
 */
int main(int argc, char** argv)
{
    int iter = 100;
    uint32_t i;
    uint32_t j;
    int globalNumberOfThreads = 0;
    int optPrintDomains = 0;
    int c;
    ThreadUserData myData;
    /* Scratch string for option arguments; allocated and destroyed per option. */
    bstring testcase = NULL;
    uint32_t numberOfWorkgroups = 0;
    int tmp = 0;
    double time;
    const TestCase* test = NULL;
    Workgroup* currentWorkgroup = NULL;
    Workgroup* groups = NULL;

    cpuid_init();
    numa_init();
    affinity_init();

    /* Handling of command line options */
    if (argc ==  1)
    {
        /* Nothing to run without options: show the help and stop here. */
        HELP_MSG;
        exit (EXIT_SUCCESS);
    }

    while ((c = getopt (argc, argv, "g:w:t:i:l:aphv")) != -1) {
        switch (c)
        {
            case 'h':
                HELP_MSG;
                exit (EXIT_SUCCESS);
            case 'v':
                VERSION_MSG;
                exit (EXIT_SUCCESS);
            case 'a':
                printf(TESTS"\n");
                exit (EXIT_SUCCESS);
            case 'w':
                /* Workgroup slots are filled from the last one down to groups[0]. */
                tmp--;

                if (tmp == -1)
                {
                    fprintf (stderr, "More workgroups configured than allocated!\n");
                    return EXIT_FAILURE;
                }
                if (!test)
                {
                    fprintf (stderr, "You need to specify a test case first!\n");
                    return EXIT_FAILURE;
                }
                testcase = bfromcstr(optarg);
                currentWorkgroup = groups+tmp;  /*FIXME*/
                bstr_to_workgroup(currentWorkgroup, testcase, test->type, test->streams);
                bdestroy(testcase);
                testcase = NULL;

                for (i=0; i<  test->streams; i++)
                {
                    if (currentWorkgroup->streams[i].offset%test->stride)
                    {
                        fprintf (stderr, "Stream %d: offset is not a multiple of stride!\n",i);
                        return EXIT_FAILURE;
                    }

                    allocator_allocateVector(&(currentWorkgroup->streams[i].ptr),
                            PAGE_ALIGNMENT,
                            currentWorkgroup->size,
                            currentWorkgroup->streams[i].offset,
                            test->type,
                            currentWorkgroup->streams[i].domain);
                }

                break;
            case 'i':
                iter =  atoi(optarg);
                break;
            case 'l':
                testcase = bfromcstr(optarg);
                /* -l always exits, so forget any kernel chosen by an earlier -t
                 * to make an unknown name detectable. */
                test = NULL;
                for (i=0; i<NUMKERNELS; i++)
                {
                    if (biseqcstr(testcase, kernels[i].name))
                    {
                        test = kernels+i;
                        break;
                    }
                }

                /* The lookup failed if no kernel matched the given name. */
                if (test == NULL)
                {
                    fprintf (stderr, "Unknown test case %s\n",optarg);
                    return EXIT_FAILURE;
                }
                else
                {
                    printf("Name: %s\n",test->name);
                    printf("Number of streams: %d\n",test->streams);
                    printf("Loop stride: %d\n",test->stride);
                    printf("Flops: %d\n",test->flops);
                    printf("Bytes: %d\n",test->bytes);
                    switch (test->type)
                    {
                        case SINGLE:
                            printf("Data Type: Single precision float\n");
                            break;
                        case DOUBLE:
                            printf("Data Type: Double precision float\n");
                            break;
                    }
                }
                bdestroy(testcase);
                exit (EXIT_SUCCESS);
            case 'p':
                optPrintDomains = 1;
                break;
            case 'g':
                numberOfWorkgroups =  atoi(optarg);
                allocator_init(numberOfWorkgroups * MAX_STREAMS);
                tmp = numberOfWorkgroups;
                groups = (Workgroup*) malloc(numberOfWorkgroups*sizeof(Workgroup));
                if ((groups == NULL) && (numberOfWorkgroups > 0))
                {
                    fprintf (stderr, "Cannot allocate memory for workgroups!\n");
                    return EXIT_FAILURE;
                }
                break;
            case 't':
                testcase = bfromcstr(optarg);

                for (i=0; i<NUMKERNELS; i++)
                {
                    if (biseqcstr(testcase, kernels[i].name))
                    {
                        test = kernels+i;
                        break;
                    }
                }

                /* No kernel selected so far means the name is unknown. */
                if (test == NULL)
                {
                    fprintf (stderr, "Unknown test case %s\n",optarg);
                    return EXIT_FAILURE;
                }
                bdestroy(testcase);
                testcase = NULL;
                break;
            case '?':
                if (isprint (optopt))
                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
                else
                    fprintf (stderr,
                            "Unknown option character `\\x%x'.\n",
                            optopt);
                return EXIT_FAILURE;
            default:
                HELP_MSG;
        }
    }


    if (optPrintDomains)
    {
        affinity_printDomains();
        exit (EXIT_SUCCESS);
    }

    /* Everything below dereferences test, groups[] and currentWorkgroup,
     * so refuse to continue with an incomplete configuration. */
    if (test == NULL)
    {
        fprintf (stderr, "You need to specify a test case first!\n");
        return EXIT_FAILURE;
    }
    if (numberOfWorkgroups == 0)
    {
        fprintf (stderr, "No workgroups configured!\n");
        return EXIT_FAILURE;
    }
    if (tmp != 0)
    {
        /* Some of the slots allocated by -g were never filled by -w. */
        fprintf (stderr, "Fewer workgroups configured than allocated!\n");
        return EXIT_FAILURE;
    }

    timer_init();

 /* :WARNING:05/04/2010 08:58:05 AM:jt: At the moment the thread
  * module only allows equally sized thread groups*/
    for (i=0; i<numberOfWorkgroups; i++)
    {
        globalNumberOfThreads += groups[i].numberOfThreads;
    }

    threads_init(globalNumberOfThreads);
    threads_createGroups(numberOfWorkgroups);

    /* we configure global barriers only */
    barrier_init(1);
    barrier_registerGroup(globalNumberOfThreads);

#ifdef PERFMON
    printf("Using likwid\n");
    likwid_markerInit();
#endif


    /* initialize data structures for threads */
    for (i=0; i<numberOfWorkgroups; i++)
    {
        myData.iter = iter;
        myData.size = groups[i].size;
        myData.test = test;
        myData.numberOfThreads = groups[i].numberOfThreads;
        myData.processors = (int*) malloc(myData.numberOfThreads * sizeof(int));
        myData.streams = (void**) malloc(test->streams * sizeof(void*));

        for (j=0; j<groups[i].numberOfThreads; j++)
        {
            myData.processors[j] = groups[i].processorIds[j];
        }

        for (j=0; j<  test->streams; j++)
        {
            myData.streams[j] = groups[i].streams[j].ptr;
        }

        /* The temporary arrays are released right after registration. */
        threads_registerDataGroup(i, &myData, copyThreadData);

        free(myData.processors);
        free(myData.streams);
    }

    printf(HLINE);
    printf("LIKWID MICRO BENCHMARK\n");
    printf("Test: %s\n",test->name);
    printf(HLINE);
    printf("Using %d work groups\n",numberOfWorkgroups);
    printf("Using %d threads\n",globalNumberOfThreads);
    printf(HLINE);

    threads_create(runTest);
    threads_destroy();
    allocator_finalize();

    /* Results are derived from the cycle count of thread 0. */
    time = (double) threads_data[0].cycles / (double) timer_getCpuClock();
    printf("Cycles: %llu \n", LLU_CAST threads_data[0].cycles);
    printf("Iterations: %llu \n", LLU_CAST iter);
    printf("Size: %d \n",  currentWorkgroup->size );
    printf("Vectorlength: %d \n", threads_data[0].data.size);
    printf("Time: %e sec\n", time);
    printf("MFlops/s:\t%.2f\n",
            1.0E-06 * ((double) numberOfWorkgroups * iter * currentWorkgroup->size *  test->flops/  time));
    printf("MByte/s:\t%.2f\n",
            1.0E-06 * ( (double) numberOfWorkgroups * iter * currentWorkgroup->size *  test->bytes/ time));
    printf("Cycles per update:\t%f\n",
            ((double) threads_data[0].cycles / (double) (iter * threads_data[0].data.size)));

    /* Scale cycles per update by 16 for SINGLE and by 8 for DOUBLE. */
    switch ( test->type )
    {
        case SINGLE:
            printf("Cycles per cacheline:\t%f\n",
                    (16.0 * (double) threads_data[0].cycles / (double) (iter * threads_data[0].data.size)));
            break;
        case DOUBLE:
            printf("Cycles per cacheline:\t%f\n",
                    (8.0 * (double) threads_data[0].cycles / (double) (iter * threads_data[0].data.size)));
            break;
    }

    printf(HLINE);
#ifdef PERFMON
   likwid_markerClose();
#endif

    return EXIT_SUCCESS;
}
示例#7
0
/*
 * Entry point of the power measurement tool.
 *
 * Command line options handled below:
 *   -c <list>  sockets to measure on
 *   -h         print the help message and exit
 *   -i         print turbo/power information and exit
 *   -M <mode>  set the MSR access mode
 *   -p         measure the CLOCK event through perfmon instead of energy
 *   -s <sec>   "stethoscope" mode: measure for <sec> seconds instead of
 *              running an executable
 *   -v         print the version message and exit
 *
 * Without -s the remaining command line is joined into one command string
 * and run through system() while the measurement is active.
 *
 * Returns EXIT_SUCCESS after a completed measurement; invalid input and
 * unsupported processors terminate the process with EXIT_FAILURE.
 */
int main (int argc, char** argv)
{
    int socket_fd = -1;
    int optInfo = 0;
    int optClock = 0;
    int optStethoscope = 0;
    int optSockets = 0;
    double runtime;
    int hasDRAM = 0;
    int c;
    bstring argString;
    bstring eventString = bfromcstr("CLOCK");
    int numSockets=1;
    int numThreads=0;
    int threadsSockets[MAX_NUM_NODES*2];
    int threads[MAX_NUM_THREADS];

    /* Default: measure on socket 0 only. */
    threadsSockets[0] = 0;

    if (argc == 1)
    {
        HELP_MSG;
        exit (EXIT_SUCCESS);
    }

    while ((c = getopt (argc, argv, "+c:hiM:ps:v")) != -1)
    {
        switch (c)
        {
            case 'c':
                /* NOTE(review): CHECK_OPTION_STRING presumably fills argString
                 * from optarg - confirm against the macro definition. */
                CHECK_OPTION_STRING;
                numSockets = bstr_to_cpuset_physical((uint32_t*) threadsSockets, argString);
                bdestroy(argString);
                optSockets = 1;
                break;

            case 'h':
                HELP_MSG;
                exit (EXIT_SUCCESS);
            case 'i':
                optInfo = 1;
                break;
            case 'M':  /* Set MSR Access mode */
                CHECK_OPTION_STRING;
                accessClient_setaccessmode(str2int((char*) argString->data));
                bdestroy(argString);
                break;
            case 'p':
                optClock = 1;
                break;
            case 's':
                CHECK_OPTION_STRING;
                optStethoscope = str2int((char*) argString->data);
                bdestroy(argString);
                break;
            case 'v':
                VERSION_MSG;
                exit (EXIT_SUCCESS);
            case '?':
                /* Options that take an argument get the help text instead. */
                if (optopt == 's' || optopt == 'M' || optopt == 'c')
                {
                    HELP_MSG;
                }
                else if (isprint (optopt))
                {
                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
                }
                else
                {
                    fprintf (stderr,
                            "Unknown option character `\\x%x'.\n",
                            optopt);
                }
                exit( EXIT_FAILURE);
            default:
                HELP_MSG;
                exit (EXIT_SUCCESS);
        }
    }

    if (!lock_check())
    {
        fprintf(stderr,"Access to performance counters is locked.\n");
        exit(EXIT_FAILURE);
    }

    if (optClock && optind == argc)
    {
        fprintf(stderr,"Commandline option -p requires an executable.\n");
        exit(EXIT_FAILURE);
    }
    if (optSockets && !optStethoscope && optind == argc)
    {
        fprintf(stderr,"Commandline option -c requires an executable if not used in combination with -s.\n");
        exit(EXIT_FAILURE);
    }
    /* The non-stethoscope path below builds its command from argv[optind];
     * without an executable that entry is the terminating NULL of argv.
     * -i exits before the command is needed, so it is exempt. */
    if (!optInfo && !optStethoscope && optind == argc)
    {
        fprintf(stderr,"No executable given on commandline. Use -s <seconds> to measure without an executable.\n");
        exit(EXIT_FAILURE);
    }

    if (cpuid_init() == EXIT_FAILURE)
    {
        fprintf(stderr, "CPU not supported\n");
        exit(EXIT_FAILURE);
    }

    if (numSockets > cpuid_topology.numSockets)
    {
        fprintf(stderr, "System has only %d sockets but %d are given on commandline\n",
                cpuid_topology.numSockets, numSockets);
        exit(EXIT_FAILURE);
    }

    numa_init(); /* consider NUMA node as power unit for the moment */
    accessClient_init(&socket_fd);
    msr_init(socket_fd);
    timer_init();

    /* check for supported processors */
    if ((cpuid_info.model == SANDYBRIDGE_EP) ||
            (cpuid_info.model == SANDYBRIDGE) ||
            (cpuid_info.model == IVYBRIDGE) ||
            (cpuid_info.model == IVYBRIDGE_EP) ||
            (cpuid_info.model == HASWELL) ||
            (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
            (cpuid_info.model == NEHALEM_LYNNFIELD) ||
            (cpuid_info.model == NEHALEM_WESTMERE))
    {
        power_init(numa_info.nodes[0].processors[0]);
    }
    else
    {
        fprintf (stderr, "Query Turbo Mode only supported on Intel Nehalem/Westmere/SandyBridge/IvyBridge/Haswell processors!\n");
        exit(EXIT_FAILURE);
    }

    double clock = (double) timer_getCpuClock();

    printf(HLINE);
    printf("CPU name:\t%s \n",cpuid_info.name);
    printf("CPU clock:\t%3.2f GHz \n",  (float) clock * 1.E-09);
    printf(HLINE);

    if (optInfo)
    {
        if (power_info.turbo.numSteps != 0)
        {
            printf("Base clock:\t%.2f MHz \n",  power_info.baseFrequency );
            printf("Minimal clock:\t%.2f MHz \n",  power_info.minFrequency );
            printf("Turbo Boost Steps:\n");
            for (int i=0; i < power_info.turbo.numSteps; i++ )
            {
                printf("C%d %.2f MHz \n",i+1,  power_info.turbo.steps[i] );
            }
        }
        printf(HLINE);
    }

    /* Only SANDYBRIDGE_EP enables the DRAM domain; models outside the
     * SandyBridge/IvyBridge/Haswell list are rejected for RAPL. */
    if (cpuid_info.model == SANDYBRIDGE_EP)
    {
        hasDRAM = 1;
    }
    else if ((cpuid_info.model != SANDYBRIDGE) &&
            (cpuid_info.model != SANDYBRIDGE_EP)  &&
            (cpuid_info.model != IVYBRIDGE)  &&
            (cpuid_info.model != IVYBRIDGE_EP)  &&
            (cpuid_info.model != HASWELL))
    {
        fprintf (stderr, "RAPL not supported on this processor!\n");
        exit(EXIT_FAILURE);
    }

    if (optInfo)
    {
        printf("Thermal Spec Power: %g Watts \n", power_info.tdp );
        printf("Minimum  Power: %g Watts \n", power_info.minPower);
        printf("Maximum  Power: %g Watts \n", power_info.maxPower);
        printf("Maximum  Time Window: %g micro sec \n", power_info.maxTimeWindow);
        printf(HLINE);
        exit(EXIT_SUCCESS);
    }

    if (optClock)
    {
        affinity_init();
        /* Build one thread expression covering the cores of every selected socket. */
        argString = bformat("S%u:0-%u", threadsSockets[0], cpuid_topology.numCoresPerSocket-1);
        for (int i=1; i<numSockets; i++)
        {
            bstring tExpr = bformat("@S%u:0-%u", threadsSockets[i], cpuid_topology.numCoresPerSocket-1);
            bconcat(argString, tExpr);
            /* bconcat copies the contents, so the temporary can be released. */
            bdestroy(tExpr);
        }
        numThreads = bstr_to_cpuset(threads, argString);
        bdestroy(argString);
        perfmon_init(numThreads, threads, stdout);
        perfmon_setupEventSet(eventString, NULL);
    }

    {
        PowerData pDataPkg[MAX_NUM_NODES*2];
        PowerData pDataDram[MAX_NUM_NODES*2];
        printf("Measure on sockets: %d", threadsSockets[0]);
        for (int i=1; i<numSockets; i++)
        {
            printf(", %d", threadsSockets[i]);
        }
        printf("\n");

        if (optStethoscope)
        {
            /* Stethoscope mode: measure while sleeping for the given time. */
            if (optClock)
            {
                perfmon_startCounters();
            }
            else
            {
                for (int i=0; i<numSockets; i++)
                {
                    int cpuId = numa_info.nodes[threadsSockets[i]].processors[0];
                    if (hasDRAM) power_start(pDataDram+i, cpuId, DRAM);
                    power_start(pDataPkg+i, cpuId, PKG);
                }
            }
            sleep(optStethoscope);

            if (optClock)
            {
                perfmon_stopCounters();
                perfmon_printCounterResults();
                perfmon_finalize();
            }
            else
            {
                for (int i=0; i<numSockets; i++)
                {
                    int cpuId = numa_info.nodes[threadsSockets[i]].processors[0];
                    power_stop(pDataPkg+i, cpuId, PKG);
                    if (hasDRAM) power_stop(pDataDram+i, cpuId, DRAM);
                }
            }
            runtime = (double) optStethoscope;
        }
        else
        {
            /* Wrapper mode: join the remaining arguments into one command
             * line and measure while it runs. */
            TimerData time;
            argv +=  optind;
            bstring exeString = bfromcstr(argv[0]);

            for (int i=1; i<(argc-optind); i++)
            {
                bconchar(exeString, ' ');
                bcatcstr(exeString, argv[i]);
            }
            printf("%s\n",bdata(exeString));


            if (optClock)
            {
                perfmon_startCounters();
            }
            else
            {
                for (int i=0; i<numSockets; i++)
                {
                    int cpuId = numa_info.nodes[threadsSockets[i]].processors[0];
                    if (hasDRAM) power_start(pDataDram+i, cpuId, DRAM);
                    power_start(pDataPkg+i, cpuId, PKG);
                }

                timer_start(&time);
            }

            if (system(bdata(exeString)) == EOF)
            {
                fprintf(stderr, "Failed to execute %s!\n", bdata(exeString));
                exit(EXIT_FAILURE);
            }

            if (optClock)
            {
                perfmon_stopCounters();
                perfmon_printCounterResults();
                perfmon_finalize();
            }
            else
            {
                timer_stop(&time);

                for (int i=0; i<numSockets; i++)
                {
                    int cpuId = numa_info.nodes[threadsSockets[i]].processors[0];
                    power_stop(pDataPkg+i, cpuId, PKG);
                    if (hasDRAM) power_stop(pDataDram+i, cpuId, DRAM);
                }
                runtime = timer_print(&time);
            }

            /* The command string is not needed past this point. */
            bdestroy(exeString);
        }

        /* runtime is only set (and only needed) on the energy paths. */
        if (!optClock)
        {
            printf("Runtime: %g second \n",runtime);
            printf(HLINE);
            for (int i=0; i<numSockets; i++)
            {
                printf("Socket %d\n",threadsSockets[i]);
                printf("Domain: PKG \n");
                printf("Energy consumed: %g Joules \n", power_printEnergy(pDataPkg+i));
                printf("Power consumed: %g Watts \n", power_printEnergy(pDataPkg+i) / runtime );
                if (hasDRAM)
                {
                    printf("Domain: DRAM \n");
                    printf("Energy consumed: %g Joules \n", power_printEnergy(pDataDram+i));
                    printf("Power consumed: %g Watts \n", power_printEnergy(pDataDram+i) / runtime );
                }
                printf("\n");
            }
        }
    }

#if 0
    if ( cpuid_hasFeature(TM2) )
    {
        thermal_init(0);
        printf("Current core temperatures:\n");

        for (uint32_t i = 0; i < cpuid_topology.numCoresPerSocket; i++ )
        {
            printf("Core %d: %u C\n",
                    numa_info.nodes[socketId].processors[i],
                    thermal_read(numa_info.nodes[socketId].processors[i]));
        }
    }
#endif

    msr_finalize();
    return EXIT_SUCCESS;
}
示例#8
0
/*
 * Entry point of the micro-benchmark driver.
 *
 * The command line is parsed twice. The first pass handles the
 * informational options, selects the kernel and counts the -w options;
 * the second pass parses each -w workgroup string and allocates its
 * streams.
 *
 * Command line options handled below:
 *   -h         print the help message and exit
 *   -v         print the version message and exit
 *   -a         print the TESTS list and exit
 *   -p         print the affinity domains and exit
 *   -l <test>  print the properties of one kernel and exit
 *   -t <test>  select the kernel to run
 *   -w <spec>  add one workgroup
 *   -s <sec>   minimal runtime (default 1)
 *   -i <n>     fixed number of iterations; otherwise the count is
 *              determined through getIterSingle()
 *
 * Returns EXIT_SUCCESS after a completed run; invalid input ends the
 * program with EXIT_FAILURE.
 */
int main(int argc, char** argv)
{
    uint64_t iter = 100;
    uint32_t i;
    uint32_t j;
    int globalNumberOfThreads = 0;
    int optPrintDomains = 0;
    int c;
    ThreadUserData myData;
    bstring testcase = bfromcstr("none");
    uint64_t numberOfWorkgroups = 0;
    int tmp = 0;
    double time;
    double cycPerUp = 0.0;
    const TestCase* test = NULL;
    uint64_t realSize = 0;
    uint64_t realIter = 0;
    uint64_t maxCycles = 0;
    uint64_t minCycles = UINT64_MAX;
    uint64_t cyclesClock = 0;
    uint64_t demandIter = 0;
    TimerData itertime;
    Workgroup* currentWorkgroup = NULL;
    Workgroup* groups = NULL;
    uint32_t min_runtime = 1; /* 1s */
    /* Separator line: 80 dashes followed by a newline. */
    bstring HLINE = bfromcstr("");
    binsertch(HLINE, 0, 80, '-');
    binsertch(HLINE, 80, 1, '\n');
    int (*ownprintf)(const char *format, ...);
    ownprintf = &printf;

    /* Handling of command line options */
    if (argc ==  1)
    {
        HELP_MSG;
        exit(EXIT_SUCCESS);
    }

    while ((c = getopt (argc, argv, "w:t:s:l:aphvi:")) != -1) {
        switch (c)
        {
            case 'h':
                HELP_MSG;
                exit (EXIT_SUCCESS);
            case 'v':
                VERSION_MSG;
                exit (EXIT_SUCCESS);
            case 'a':
                ownprintf(TESTS"\n");
                exit (EXIT_SUCCESS);
            case 'w':
                /* First pass only counts the workgroups. */
                numberOfWorkgroups++;
                break;
            case 's':
                min_runtime = atoi(optarg);
                break;
            case 'i':
                demandIter = strtoul(optarg, NULL, 10);
                if (demandIter <= 0)
                {
                    fprintf (stderr, "Error: Iterations must be greater than 0\n");
                    return EXIT_FAILURE;
                }
                break;
            case 'l':
                /* testcase may already be NULL here; bstrlib's bdestroy
                 * rejects a NULL argument without touching memory. */
                bdestroy(testcase);
                testcase = bfromcstr(optarg);
                /* -l always exits, so forget any kernel chosen by an earlier -t
                 * to make an unknown name detectable. */
                test = NULL;
                for (i=0; i<NUMKERNELS; i++)
                {
                    if (biseqcstr(testcase, kernels[i].name))
                    {
                        test = kernels+i;
                        break;
                    }
                }

                if (test == NULL)
                {
                    fprintf (stderr, "Error: Unknown test case %s\n",optarg);
                    return EXIT_FAILURE;
                }
                else
                {
                    ownprintf("Name: %s\n",test->name);
                    ownprintf("Number of streams: %d\n",test->streams);
                    ownprintf("Loop stride: %d\n",test->stride);
                    ownprintf("Flops: %d\n",test->flops);
                    ownprintf("Bytes: %d\n",test->bytes);
                    switch (test->type)
                    {
                        case INT:
                            ownprintf("Data Type: Integer\n");
                            break;
                        case SINGLE:
                            ownprintf("Data Type: Single precision float\n");
                            break;
                        case DOUBLE:
                            ownprintf("Data Type: Double precision float\n");
                            break;
                    }
                    /* Negative values are not printed. */
                    if (test->loads >= 0)
                    {
                        ownprintf("Load Ops: %d\n",test->loads);
                    }
                    if (test->stores >= 0)
                    {
                        ownprintf("Store Ops: %d\n",test->stores);
                    }
                    if (test->branches >= 0)
                    {
                        ownprintf("Branches: %d\n",test->branches);
                    }
                    if (test->instr_const >= 0)
                    {
                        ownprintf("Constant instructions: %d\n",test->instr_const);
                    }
                    if (test->instr_loop >= 0)
                    {
                        ownprintf("Loop instructions: %d\n",test->instr_loop);
                    }
                }
                bdestroy(testcase);
                testcase = NULL;
                exit (EXIT_SUCCESS);

                break;
            case 'p':
                optPrintDomains = 1;
                break;
            case 'g':
                /* Not reachable at present: the getopt string above has no 'g'. */
                numberOfWorkgroups = LLU_CAST atol(optarg);

                tmp = numberOfWorkgroups;

                break;
            case 't':
                bdestroy(testcase);
                testcase = bfromcstr(optarg);

                for (i=0; i<NUMKERNELS; i++)
                {
                    if (biseqcstr(testcase, kernels[i].name))
                    {
                        test = kernels+i;
                        break;
                    }
                }

                if (test == NULL)
                {
                    fprintf (stderr, "Error: Unknown test case %s\n",optarg);
                    return EXIT_FAILURE;
                }
                bdestroy(testcase);
                /* Reset the pointer so that a later bdestroy(testcase)
                 * (repeated -t, -l, or the -p branch) does not touch
                 * freed memory. */
                testcase = NULL;
                break;
            case '?':
                if (isprint (optopt))
                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
                else
                    fprintf (stderr,
                            "Unknown option character `\\x%x'.\n",
                            optopt);
                return EXIT_FAILURE;
            default:
                HELP_MSG;
        }
    }
    if ((numberOfWorkgroups == 0) && (!optPrintDomains))
    {
        fprintf(stderr, "Error: At least one workgroup (-w) must be set on commandline\n");
        exit (EXIT_FAILURE);
    }

    if (topology_init() != EXIT_SUCCESS)
    {
        fprintf(stderr, "Error: Unsupported processor!\n");
        exit(EXIT_FAILURE);
    }

    if ((test == NULL) && (!optPrintDomains))
    {
        fprintf(stderr, "Unknown test case. Please check likwid-bench -a for available tests\n");
        fprintf(stderr, "and select one using the -t commandline option\n");
        exit(EXIT_FAILURE);
    }

    numa_init();
    affinity_init();
    timer_init();

    if (optPrintDomains)
    {
        bdestroy(testcase);
        testcase = NULL;
        AffinityDomains_t affinity = get_affinityDomains();
        ownprintf("Number of Domains %d\n",affinity->numberOfAffinityDomains);
        for (i=0; i < affinity->numberOfAffinityDomains; i++ )
        {
            ownprintf("Domain %d:\n",i);
            ownprintf("\tTag %s:",bdata(affinity->domains[i].tag));

            for ( uint32_t j=0; j < affinity->domains[i].numberOfProcessors; j++ )
            {
                ownprintf(" %d",affinity->domains[i].processorList[j]);
            }
            ownprintf("\n");
        }
        exit (EXIT_SUCCESS);
    }

    allocator_init(numberOfWorkgroups * MAX_STREAMS);
    groups = (Workgroup*) malloc(numberOfWorkgroups*sizeof(Workgroup));
    if (groups == NULL)
    {
        fprintf(stderr, "Error: Cannot allocate memory for workgroups\n");
        exit(EXIT_FAILURE);
    }
    tmp = 0;

    /* Second pass over the command line: restart getopt and handle only
     * the -w options, in the order they were given. */
    optind = 0;
    while ((c = getopt (argc, argv, "w:t:s:l:i:aphv")) != -1)
    {
        switch (c)
        {
            case 'w':
                currentWorkgroup = groups+tmp;
                bstring groupstr = bfromcstr(optarg);
                i = bstr_to_workgroup(currentWorkgroup, groupstr, test->type, test->streams);
                bdestroy(groupstr);
                if (i == 0)
                {
                    for (i=0; i<  test->streams; i++)
                    {
                        if (currentWorkgroup->streams[i].offset%test->stride)
                        {
                            fprintf (stderr, "Error: Stream %d: offset is not a multiple of stride!\n",i);
                            return EXIT_FAILURE;
                        }
                        allocator_allocateVector(&(currentWorkgroup->streams[i].ptr),
                                PAGE_ALIGNMENT,
                                currentWorkgroup->size,
                                currentWorkgroup->streams[i].offset,
                                test->type,
                                currentWorkgroup->streams[i].domain);
                    }
                    tmp++;
                }
                else
                {
                    exit(EXIT_FAILURE);
                }
                break;
            default:
                continue;
                break;
        }
    }

    /* :WARNING:05/04/2010 08:58:05 AM:jt: At the moment the thread
     * module only allows equally sized thread groups*/
    for (i=0; i<numberOfWorkgroups; i++)
    {
        globalNumberOfThreads += groups[i].numberOfThreads;
    }

    ownprintf(bdata(HLINE));
    ownprintf("LIKWID MICRO BENCHMARK\n");
    ownprintf("Test: %s\n",test->name);
    ownprintf(bdata(HLINE));
    ownprintf("Using %" PRIu64 " work groups\n",numberOfWorkgroups);
    ownprintf("Using %d threads\n",globalNumberOfThreads);
    ownprintf(bdata(HLINE));


    threads_init(globalNumberOfThreads);
    threads_createGroups(numberOfWorkgroups);

    /* we configure global barriers only */
    barrier_init(1);
    barrier_registerGroup(globalNumberOfThreads);
    cyclesClock = timer_getCycleClock();

#ifdef LIKWID_PERFMON
    if (getenv("LIKWID_FILEPATH") != NULL)
    {
        ownprintf("Using Likwid Marker API\n");
    }
    LIKWID_MARKER_INIT;
    ownprintf(bdata(HLINE));
#endif


    /* initialize data structures for threads */
    for (i=0; i<numberOfWorkgroups; i++)
    {
        myData.iter = iter;
        if (demandIter > 0)
        {
            myData.iter = demandIter;
        }
        myData.min_runtime = min_runtime;
        myData.size = groups[i].size;
        myData.test = test;
        myData.cycles = 0;
        myData.numberOfThreads = groups[i].numberOfThreads;
        myData.processors = (int*) malloc(myData.numberOfThreads * sizeof(int));
        myData.streams = (void**) malloc(test->streams * sizeof(void*));

        for (j=0; j<groups[i].numberOfThreads; j++)
        {
            myData.processors[j] = groups[i].processorIds[j];
        }

        for (j=0; j<  test->streams; j++)
        {
            myData.streams[j] = groups[i].streams[j].ptr;
        }

        /* The temporary arrays are released right after registration. */
        threads_registerDataGroup(i, &myData, copyThreadData);

        free(myData.processors);
        free(myData.streams);
    }

    /* No iteration count requested with -i: determine it with
     * getIterSingle() on thread 0 and update every workgroup. */
    if (demandIter == 0)
    {
        getIterSingle((void*) &threads_data[0]);
        for (i=0; i<numberOfWorkgroups; i++)
        {
            iter = threads_updateIterations(i, demandIter);
        }
    }
#ifdef DEBUG_LIKWID
    else
    {
        ownprintf("Using manually selected iterations per thread\n");
    }
#endif

    threads_create(runTest);
    threads_join();

    /* Accumulate sizes and iterations over all threads and track the
     * smallest and largest cycle count. */
    for (int i=0; i<globalNumberOfThreads; i++)
    {
        realSize += threads_data[i].data.size;
        realIter += threads_data[i].data.iter;
        if (threads_data[i].cycles > maxCycles)
        {
            maxCycles = threads_data[i].cycles;
        }
        if (threads_data[i].cycles < minCycles)
        {
            minCycles = threads_data[i].cycles;
        }
    }



    /* The reported time is based on the largest per-thread cycle count. */
    time = (double) maxCycles / (double) cyclesClock;
    ownprintf(bdata(HLINE));
    ownprintf("Cycles:\t\t\t%" PRIu64 "\n", maxCycles);
    ownprintf("CPU Clock:\t\t%" PRIu64 "\n", timer_getCpuClock());
    ownprintf("Cycle Clock:\t\t%" PRIu64 "\n", cyclesClock);
    ownprintf("Time:\t\t\t%e sec\n", time);
    ownprintf("Iterations:\t\t%" PRIu64 "\n", realIter);
    ownprintf("Iterations per thread:\t%" PRIu64 "\n",threads_data[0].data.iter);
    ownprintf("Inner loop executions:\t%.0f\n", ((double)realSize)/((double)test->stride));
    ownprintf("Size:\t\t\t%" PRIu64 "\n",  realSize*test->bytes );
    ownprintf("Size per thread:\t%" PRIu64 "\n", threads_data[0].data.size*test->bytes);
    ownprintf("Number of Flops:\t%" PRIu64 "\n", (threads_data[0].data.iter * realSize *  test->flops));
    ownprintf("MFlops/s:\t\t%.2f\n",
            1.0E-06 * ((double) threads_data[0].data.iter * realSize *  test->flops/  time));

    ownprintf("Data volume (Byte):\t%llu\n", LLU_CAST (threads_data[0].data.iter * realSize *  test->bytes));
    ownprintf("MByte/s:\t\t%.2f\n",
            1.0E-06 * ( (double) threads_data[0].data.iter * realSize *  test->bytes/ time));

    cycPerUp = ((double) maxCycles / (double) (threads_data[0].data.iter * realSize));
    ownprintf("Cycles per update:\t%f\n", cycPerUp);

    /* Scale cycles per update by 16 for INT/SINGLE and by 8 for DOUBLE. */
    switch ( test->type )
    {
        case INT:
        case SINGLE:
            ownprintf("Cycles per cacheline:\t%f\n", (16.0 * cycPerUp));
            break;
        case DOUBLE:
            ownprintf("Cycles per cacheline:\t%f\n", (8.0 * cycPerUp));
            break;
    }
    ownprintf("Loads per update:\t%ld\n", test->loads );
    ownprintf("Stores per update:\t%ld\n", test->stores );
    if ((test->loads > 0) && (test->stores > 0))
    {
        ownprintf("Load/store ratio:\t%.2f\n", ((double)test->loads)/((double)test->stores) );
    }
    if ((test->instr_loop > 0) && (test->instr_const > 0))
    {
        ownprintf("Instructions:\t\t%" PRIu64 "\n", LLU_CAST ((double)realSize/test->stride)*test->instr_loop*threads_data[0].data.iter + test->instr_const );
    }
    if (test->uops > 0)
    {
        ownprintf("UOPs:\t\t\t%" PRIu64 "\n", LLU_CAST ((double)realSize/test->stride)*test->uops*threads_data[0].data.iter);
    }

    ownprintf(bdata(HLINE));
    threads_destroy(numberOfWorkgroups, test->streams);
    allocator_finalize();
    workgroups_destroy(&groups, numberOfWorkgroups, test->streams);

#ifdef LIKWID_PERFMON
    if (getenv("LIKWID_FILEPATH") != NULL)
    {
        ownprintf("Writing Likwid Marker API results to file %s\n", getenv("LIKWID_FILEPATH"));
    }
    LIKWID_MARKER_CLOSE;
#endif

    bdestroy(HLINE);
    return EXIT_SUCCESS;
}