int main(int argc, char **argv) { ///****************************************************** ///********************** INPUT ************************* ///****************************************************** if (argc != 4) { std::cout << "Invalid number of arguments!" << std::endl; std::cout << "./compare A.out B.out" << std::endl; exit(EXIT_FAILURE); } //matrix dimensions int dimM = 0; int dimN = 0; int dimO = 0; //get dimensions std::ifstream fIn(argv[1]); if (!fIn) { std::cout << "Error opening file: " << argv[1] << std::endl; exit(EXIT_FAILURE); } if(!(fIn >> dimM >> dimN)) { std::cout << "Error in reading matrix entries!" << std::endl; exit(EXIT_FAILURE); } fIn.close(); fIn.open(argv[2]); if (!fIn) { std::cout << "Error opening file: " << argv[2] << std::endl; exit(EXIT_FAILURE); } if(!(fIn >> dimN >> dimO)) { std::cout << "Error in reading matrix entries!" << std::endl; exit(EXIT_FAILURE); } fIn.close(); //calculate minimal matrix size //all matrices are padded with 0s to this size //should be power of 2 for efficient block division //dirty hack... 
LD = 64; if (LD<dimM) LD = dimM; if (LD<dimN) LD = dimN; if (LD<dimO) LD = dimO; LD--; LD |= LD >> 1; LD |= LD >> 2; LD |= LD >> 4; LD |= LD >> 8; LD |= LD >> 16; LD++; //add useless padding LD += PADDING; double* a = (double*) aligned_alloc(ALIGNMENT, sizeof(double) * LD * LD); double* b = (double*) aligned_alloc(ALIGNMENT, sizeof(double) * LD * LD); double* c = (double*) aligned_alloc(ALIGNMENT, sizeof(double) * LD * LD); Matrix A = loadMatrix(argv[1], &a[0]); Matrix B = loadMatrix(argv[2], &b[0]); Matrix C(&c[0], nullptr, A.getDimM(), B.getDimN(), 0, 0); ///****************************************************** ///********************** CALCULATION ******************* ///****************************************************** double time = 0; #ifdef USE_LIKWID likwid_markerInit(); likwid_markerStartRegion("dummy"); #endif siwir::Timer timer; MMM(A, B, C); time = timer.elapsed(); std::cout << dimM << "\t" << dimN << "\t" << dimO << "\t" << time << std::endl; #ifdef USE_LIKWID likwid_markerStopRegion("dummy"); likwid_markerClose(); #endif ///****************************************************** ///********************** OUTPUT ************************ ///****************************************************** saveMatrix(argv[3], C); free(a); free(b); free(c); };
int main(int argc, char** argv) { int iter = 100; uint32_t i; uint32_t j; int globalNumberOfThreads = 0; int optPrintDomains = 0; int c; ThreadUserData myData; bstring testcase = bfromcstr("none"); uint32_t numberOfWorkgroups = 0; int tmp = 0; double time; const TestCase* test = NULL; Workgroup* currentWorkgroup = NULL; Workgroup* groups = NULL; cpuid_init(); numa_init(); affinity_init(); /* Handling of command line options */ if (argc == 1) { HELP_MSG; } while ((c = getopt (argc, argv, "g:w:t:i:l:aphv")) != -1) { switch (c) { case 'h': HELP_MSG; exit (EXIT_SUCCESS); case 'v': VERSION_MSG; exit (EXIT_SUCCESS); case 'a': printf(TESTS"\n"); exit (EXIT_SUCCESS); case 'w': tmp--; if (tmp == -1) { fprintf (stderr, "More workgroups configured than allocated!\n"); return EXIT_FAILURE; } if (!test) { fprintf (stderr, "You need to specify a test case first!\n"); return EXIT_FAILURE; } testcase = bfromcstr(optarg); currentWorkgroup = groups+tmp; /*FIXME*/ bstr_to_workgroup(currentWorkgroup, testcase, test->type, test->streams); bdestroy(testcase); for (i=0; i< test->streams; i++) { if (currentWorkgroup->streams[i].offset%test->stride) { fprintf (stderr, "Stream %d: offset is not a multiple of stride!\n",i); return EXIT_FAILURE; } allocator_allocateVector(&(currentWorkgroup->streams[i].ptr), PAGE_ALIGNMENT, currentWorkgroup->size, currentWorkgroup->streams[i].offset, test->type, currentWorkgroup->streams[i].domain); } break; case 'i': iter = atoi(optarg); break; case 'l': testcase = bfromcstr(optarg); for (i=0; i<NUMKERNELS; i++) { if (biseqcstr(testcase, kernels[i].name)) { test = kernels+i; break; } } if (biseqcstr(testcase,"none")) { fprintf (stderr, "Unknown test case %s\n",optarg); return EXIT_FAILURE; } else { printf("Name: %s\n",test->name); printf("Number of streams: %d\n",test->streams); printf("Loop stride: %d\n",test->stride); printf("Flops: %d\n",test->flops); printf("Bytes: %d\n",test->bytes); switch (test->type) { case SINGLE: printf("Data Type: Single precision 
float\n"); break; case DOUBLE: printf("Data Type: Double precision float\n"); break; } } bdestroy(testcase); exit (EXIT_SUCCESS); break; case 'p': optPrintDomains = 1; break; case 'g': numberOfWorkgroups = atoi(optarg); allocator_init(numberOfWorkgroups * MAX_STREAMS); tmp = numberOfWorkgroups; groups = (Workgroup*) malloc(numberOfWorkgroups*sizeof(Workgroup)); break; case 't': testcase = bfromcstr(optarg); for (i=0; i<NUMKERNELS; i++) { if (biseqcstr(testcase, kernels[i].name)) { test = kernels+i; break; } } if (biseqcstr(testcase,"none")) { fprintf (stderr, "Unknown test case %s\n",optarg); return EXIT_FAILURE; } bdestroy(testcase); break; case '?': if (isprint (optopt)) fprintf (stderr, "Unknown option `-%c'.\n", optopt); else fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt); return EXIT_FAILURE; default: HELP_MSG; } } if (optPrintDomains) { affinity_printDomains(); exit (EXIT_SUCCESS); } timer_init(); /* :WARNING:05/04/2010 08:58:05 AM:jt: At the moment the thread * module only allows equally sized thread groups*/ for (i=0; i<numberOfWorkgroups; i++) { globalNumberOfThreads += groups[i].numberOfThreads; } threads_init(globalNumberOfThreads); threads_createGroups(numberOfWorkgroups); /* we configure global barriers only */ barrier_init(1); barrier_registerGroup(globalNumberOfThreads); #ifdef PERFMON printf("Using likwid\n"); likwid_markerInit(); #endif /* initialize data structures for threads */ for (i=0; i<numberOfWorkgroups; i++) { myData.iter = iter; myData.size = groups[i].size; myData.test = test; myData.numberOfThreads = groups[i].numberOfThreads; myData.processors = (int*) malloc(myData.numberOfThreads * sizeof(int)); myData.streams = (void**) malloc(test->streams * sizeof(void*)); for (j=0; j<groups[i].numberOfThreads; j++) { myData.processors[j] = groups[i].processorIds[j]; } for (j=0; j< test->streams; j++) { myData.streams[j] = groups[i].streams[j].ptr; } threads_registerDataGroup(i, &myData, copyThreadData); free(myData.processors); 
free(myData.streams); } printf(HLINE); printf("LIKWID MICRO BENCHMARK\n"); printf("Test: %s\n",test->name); printf(HLINE); printf("Using %d work groups\n",numberOfWorkgroups); printf("Using %d threads\n",globalNumberOfThreads); printf(HLINE); threads_create(runTest); threads_destroy(); allocator_finalize(); time = (double) threads_data[0].cycles / (double) timer_getCpuClock(); printf("Cycles: %llu \n", LLU_CAST threads_data[0].cycles); printf("Iterations: %llu \n", LLU_CAST iter); printf("Size: %d \n", currentWorkgroup->size ); printf("Vectorlength: %d \n", threads_data[0].data.size); printf("Time: %e sec\n", time); printf("MFlops/s:\t%.2f\n", 1.0E-06 * ((double) numberOfWorkgroups * iter * currentWorkgroup->size * test->flops/ time)); printf("MByte/s:\t%.2f\n", 1.0E-06 * ( (double) numberOfWorkgroups * iter * currentWorkgroup->size * test->bytes/ time)); printf("Cycles per update:\t%f\n", ((double) threads_data[0].cycles / (double) (iter * threads_data[0].data.size))); switch ( test->type ) { case SINGLE: printf("Cycles per cacheline:\t%f\n", (16.0 * (double) threads_data[0].cycles / (double) (iter * threads_data[0].data.size))); break; case DOUBLE: printf("Cycles per cacheline:\t%f\n", (8.0 * (double) threads_data[0].cycles / (double) (iter * threads_data[0].data.size))); break; } printf(HLINE); #ifdef PERFMON likwid_markerClose(); #endif return EXIT_SUCCESS; }
/*
 * Thin wrapper that forwards to likwid_markerClose().
 * NOTE(review): the lower-case name with trailing underscore looks like a
 * Fortran-callable binding -- confirm against the callers.
 * The return type is spelled out as void: the function returns no value and
 * an omitted return type is not valid in C99 or C++.
 */
void likwid_markerclose_(void)
{
    likwid_markerClose();
}
int main() { int quantum, checktick(); int BytesPerWord; register int j, k; double scalar, t, times[4][NTIMES]; /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); printf("STREAM version $Revision: 5.8 $\n"); printf(HLINE); BytesPerWord = sizeof(double); printf("This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); printf(HLINE); printf("Array size = %d, Offset = %d\n" , N, OFFSET); printf("Total memory required = %.1f MB.\n", (3.0 * BytesPerWord) * ( (double) N / 1048576.0)); printf("Each test is run %d times, but only\n", NTIMES); printf("the *best* time for each is used.\n"); #ifdef PERFMON printf("Using likwid\n"); likwid_markerInit(); #endif #ifdef _OPENMP printf(HLINE); #pragma omp parallel { #pragma omp master { k = omp_get_num_threads(); printf ("Number of Threads requested = %i\n",k); } printf ("Thread %d running on processor %d ....\n",omp_get_thread_num(),threadGetProcessorId()); } #endif /* Get initial value for system clock. */ #pragma omp parallel for for (j=0; j<N; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } printf(HLINE); if ( (quantum = checktick()) >= 1) printf("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else { printf("Your clock granularity appears to be " "less than one microsecond.\n"); quantum = 1; } t = mysecond(); #pragma omp parallel for for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); printf("Each test below will take on the order" " of %d microseconds.\n", (int) t ); printf(" (= %d clock ticks)\n", (int) (t/quantum) ); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); printf("WARNING -- The above is only a rough guideline.\n"); printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; 
k++) { times[0][k] = mysecond(); #pragma omp parallel { START_PERFMON("copy") #pragma omp for for (j=0; j<N; j++) c[j] = a[j]; STOP_PERFMON("copy") } times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #pragma omp parallel { START_PERFMON("scale") #pragma omp for for (j=0; j<N; j++) b[j] = scalar*c[j]; STOP_PERFMON("scale") } times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #pragma omp parallel { START_PERFMON("add") #pragma omp for for (j=0; j<N; j++) c[j] = a[j]+b[j]; STOP_PERFMON("add") } times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #pragma omp parallel { START_PERFMON("triad") #pragma omp for for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; STOP_PERFMON("triad") } times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } printf("Function Rate (MB/s) Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]); } printf(HLINE); /* --- Check Results --- */ checkSTREAMresults(); printf(HLINE); #ifdef PERFMON likwid_markerClose(); #endif return 0; }