main() { double alpha = 3.14; /* Initialize */ for (int i=0; i<SIZE; i++) { a[i] = 1.0/(double) i; b[i] = 1.0; c[i] = (double) i; } LIKWID_MARKER_INIT; #pragma omp parallel { LIKWID_MARKER_THREADINIT; LIKWID_MARKER_START("time"); sleep(2); LIKWID_MARKER_STOP("time"); int threadId = omp_get_thread_num(); /****************************************************/ #pragma omp for for (int j = 0; j < 10; j++) { LIKWID_MARKER_START("plain"); for (int k = 0; k < (threadId+1); k++) { for (int i = 0; i < SIZE; i++) { a[i] = b[i] + alpha * c[i]; sum += a[i]; } } LIKWID_MARKER_STOP("plain"); } printf("Flops performed plain: %g\n",(double)10*SIZE*3); /****************************************************/ } LIKWID_MARKER_CLOSE; printf( "OK, dofp result = %e\n", sum); }
int main(int argc, char* argv[]) { int i, g; int nevents = 10; double events[10]; double time; int count; // Init Marker API in serial region once in the beginning LIKWID_MARKER_INIT; #pragma omp parallel { // Each thread must add itself to the Marker API, therefore must be // in parallel region LIKWID_MARKER_THREADINIT; // Optional. Register region name LIKWID_MARKER_REGISTER("example"); } // perfmon_getNumberOfGroups is not part of the MarkerAPI, // it belongs to the normal LIKWID API. But the MarkerAPI // has no function to get the number of configured groups. for (g=0;g < perfmon_getNumberOfGroups(); g++) { #pragma omp parallel { printf("Thread %d sleeps now for %d seconds\n", omp_get_thread_num(), SLEEPTIME); // Start measurements inside a parallel region LIKWID_MARKER_START("example"); // Insert your code here. // Often contains an OpenMP for pragma. Regions can be nested. sleep(SLEEPTIME); // Stop measurements inside a parallel region LIKWID_MARKER_STOP("example"); printf("Thread %d wakes up again\n", omp_get_thread_num()); // If you need the performance data inside your application, use LIKWID_MARKER_GET("example", &nevents, events, &time, &count); // where events is an array of doubles with nevents entries, // time is a double* and count an int*. printf("Region example measures %d events, total measurement time is %f\n", nevents, time); printf("The region was called %d times\n", count); for (i = 0; i < nevents; i++) { printf("Event %d: %f\n", i, events[i]); } // If multiple groups given, you can switch to the next group LIKWID_MARKER_SWITCH; } } // Close Marker API and write results to file for further evaluation done // by likwid-perfctr LIKWID_MARKER_CLOSE; return 0; }
int main(int argc, char* argv[]) { int i, j ; double alpha = 3.14; /* Initialize */ for (i=0; i<SIZE; i++) { a[i] = 1.0/(double) i; b[i] = 1.0; c[i] = (double) i; } LIKWID_MARKER_INIT; // likwid_pinProcess(2); printf("Main running on core %d\n", likwid_getProcessorId()); /****************************************************/ #pragma omp parallel { LIKWID_MARKER_THREADINIT; char* label = malloc(40*sizeof(char)); int threadId = omp_get_thread_num(); // likwid_pinThread(threadId); printf("Thread running on core %d\n", likwid_getProcessorId()); for (int counter=1; counter< 3; counter++) { sprintf(label,"plain-%d",counter); #pragma omp barrier LIKWID_MARKER_START(label); for (j = 0; j < counter * threadId; j++) { for (i = 0; i < SIZE; i++) { a[i] = b[i] + alpha * c[i]; sum += a[i]; } } #pragma omp barrier LIKWID_MARKER_STOP(label); printf("Flops performed thread %d region %s: %g\n",threadId, label,(double)counter*threadId*SIZE*3); } free(label); } /****************************************************/ LIKWID_MARKER_CLOSE; printf( "OK, dofp result = %e\n", sum); }
int main(){ int i, k; int nworkers, totalworkers; char cpuCount[20]; double *a, *b, *c, *d; double sums[2000]; cpu_set_t cpuset; TimeData timer; double triad_time, copy_time, total = 0; nprocessors = sysconf(_SC_NPROCESSORS_CONF); nworkers = cilk_spawn get_nworkers(); totalworkers = cilk_spawn get_totalworkers(); for (i=0;i<nworkers;i++) { sums[i] = 0; } LIKWID_MARKER_INIT; cilk_spawn allocate_vector(&a, SIZE); cilk_spawn allocate_vector(&b, SIZE); cilk_spawn allocate_vector(&c, SIZE); cilk_spawn allocate_vector(&d, SIZE); cilk_sync; for (i=0; i<SIZE; i++) { a[i] = 1.0; b[i] = 2.0; c[i] = 0.0; d[i] = 1.0; } time_start(&timer); for (k=0; k<ITER; k++) { for (i=0;i<nworkers;i++) { cilk_spawn LIKWID_MARKER_START("copy"); } cilk_sync; cilk_for(i=0;i<SIZE;i++) { c[i] = a[i]; } for (i=0;i<nworkers;i++) { cilk_spawn LIKWID_MARKER_STOP("copy"); } cilk_sync; } time_stop(&timer); copy_time = time_print(&timer)/(double)ITER; time_start(&timer); for (k=0; k<ITER; k++) { for (i=0;i<nworkers;i++) { cilk_spawn LIKWID_MARKER_START("triad"); } cilk_sync; cilk_for(i=0;i<SIZE;i++) { a[i] = b[i] + c[i] * d[i]; } for (i=0;i<nworkers;i++) { cilk_spawn LIKWID_MARKER_STOP("triad"); } cilk_sync; } time_stop(&timer); triad_time = time_print(&timer)/(double)ITER; printf("Processed %.1f Mbyte at copy benchmark in %.4f seconds: %.2f MByte/s\n", 1E-6*(2*SIZE*sizeof(double)), copy_time, 1E-6*((2*SIZE*sizeof(double))/copy_time)); printf("Processed %.1f Mbyte at triad benchmark in %.4f seconds: %.2f MByte/s\n", 1E-6*(4*SIZE*sizeof(double)), triad_time, 1E-6*((4*SIZE*sizeof(double))/triad_time)); printf("Main PID %d\n",getpid()); for (i=0;i<nworkers;i++) { cilk_spawn show_thread(); } cilk_sync; LIKWID_MARKER_CLOSE; }
int main(int argn, char** argc) { int err, i ,j; int numCPUs = 0; int gid; DATATYPE *a,*b,*c,*d; TimeData timer; double triad_time, copy_time, scale_time, stream_time; char estr[1024]; double result, scalar = 3.0; char* ptr; if (argn != 3) { printf("Usage: %s <cpustr> <events>\n", argc[0]); return 1; } strcpy(estr, argc[2]); allocate_vector(&a, SIZE); allocate_vector(&b, SIZE); allocate_vector(&c, SIZE); allocate_vector(&d, SIZE); err = topology_init(); if (err < 0) { printf("Failed to initialize LIKWID's topology module\n"); return 1; } CpuTopology_t topo = get_cpuTopology(); affinity_init(); int* cpus = (int*)malloc(topo->numHWThreads * sizeof(int)); if (!cpus) return 1; numCPUs = cpustr_to_cpulist(argc[1], cpus, topo->numHWThreads); omp_set_num_threads(numCPUs); err = perfmon_init(numCPUs, cpus); if (err < 0) { printf("Failed to initialize LIKWID's performance monitoring module\n"); affinity_finalize(); topology_finalize(); return 1; } gid = perfmon_addEventSet(estr); if (gid < 0) { printf("Failed to add event string %s to LIKWID's performance monitoring module\n", estr); perfmon_finalize(); affinity_finalize(); topology_finalize(); return 1; } err = perfmon_setupCounters(gid); if (err < 0) { printf("Failed to setup group %d in LIKWID's performance monitoring module\n", gid); perfmon_finalize(); affinity_finalize(); topology_finalize(); return 1; } #ifdef _OPENMP printf(HLINE); #pragma omp parallel { #pragma omp master { printf ("Number of Threads requested = %i\n",omp_get_num_threads()); } likwid_pinThread(cpus[omp_get_thread_num()]); printf ("Thread %d running on processor %d ....\n",omp_get_thread_num(),sched_getcpu()); } #endif #pragma omp parallel for for (int j=0; j<SIZE; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; d[j] = 1.0; } err = perfmon_startCounters(); if (err < 0) { printf("Failed to start counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } time_start(&timer); #pragma omp parallel { for (int k=0; k<ITER; k++) { LIKWID_MARKER_START("copy"); #pragma omp for for (int j=0; j<SIZE; j++) { c[j] = a[j]; } LIKWID_MARKER_STOP("copy"); } } time_stop(&timer); err = perfmon_stopCounters(); copy_time = time_print(&timer)/(double)ITER; if (err < 0) { printf("Failed to stop counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } printf("Processed %.1f Mbyte at copy benchmark in %.4f seconds: %.2f MByte/s\n", 1E-6*(2*SIZE*sizeof(DATATYPE)), copy_time, 1E-6*((2*SIZE*sizeof(DATATYPE))/copy_time)); ptr = strtok(estr,","); j = 0; while (ptr != NULL) { for (i = 0;i < numCPUs; i++) { result = perfmon_getResult(gid, j, cpus[i]); printf("Measurement result for event set %s at CPU %d: %f\n", ptr, cpus[i], result); } ptr = strtok(NULL,","); j++; } strcpy(estr, argc[2]); perfmon_setupCounters(gid); err = perfmon_startCounters(); if (err < 0) { printf("Failed to start counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } time_start(&timer); #pragma omp parallel { for (int k=0; k<ITER; k++) { LIKWID_MARKER_START("scale"); #pragma omp for for (int j=0; j<SIZE; j++) { b[j] = scalar*c[j]; } LIKWID_MARKER_STOP("scale"); } } time_stop(&timer); err = perfmon_stopCounters(); scale_time = time_print(&timer)/(double)ITER; if (err < 0) { printf("Failed to stop counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } printf("Processed %.1f Mbyte at scale benchmark in %.4f seconds: %.2f MByte/s\n", 1E-6*(2*SIZE*sizeof(DATATYPE)), copy_time, 1E-6*((2*SIZE*sizeof(DATATYPE))/copy_time)); ptr = strtok(estr,","); j = 0; while (ptr != NULL) { for (i = 0;i < numCPUs; i++) { result = perfmon_getResult(gid, j, cpus[i]); printf("Measurement result for event set %s at CPU %d: %f\n", ptr, cpus[i], result); } ptr = strtok(NULL,","); j++; } strcpy(estr, argc[2]); perfmon_setupCounters(gid); err = perfmon_startCounters(); if (err < 0) { printf("Failed to start counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } time_start(&timer); #pragma omp parallel { for (int k=0; k<ITER; k++) { LIKWID_MARKER_START("stream"); #pragma omp for for (int j=0; j<SIZE; j++) { c[j] = a[j] + b[j]; } LIKWID_MARKER_STOP("stream"); } } time_stop(&timer); err = perfmon_stopCounters(); stream_time = time_print(&timer)/(double)ITER; if (err < 0) { printf("Failed to stop counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } printf("Processed %.1f Mbyte at stream benchmark in %.4f seconds: %.2f MByte/s\n", 1E-6*(2*SIZE*sizeof(DATATYPE)), copy_time, 1E-6*((2*SIZE*sizeof(DATATYPE))/copy_time)); ptr = strtok(estr,","); j = 0; while (ptr != NULL) { for (i = 0;i < numCPUs; i++) { result = perfmon_getResult(gid, j, cpus[i]); printf("Measurement result for event set %s at CPU %d: %f\n", ptr, cpus[i], result); } ptr = strtok(NULL,","); j++; } strcpy(estr, argc[2]); perfmon_setupCounters(gid); err = perfmon_startCounters(); if (err < 0) { printf("Failed to start counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } time_start(&timer); #pragma omp parallel { for (int k=0; k<ITER; k++) { LIKWID_MARKER_START("triad"); #pragma omp for for (int j=0; j<SIZE; j++) { a[j] = b[j] + c[j] * scalar; } LIKWID_MARKER_STOP("triad"); } } time_stop(&timer); err = perfmon_stopCounters(); triad_time = time_print(&timer)/(double)ITER; if (err < 0) { printf("Failed to stop counters for group %d for thread %d\n",gid, (-1*err)-1); perfmon_finalize(); topology_finalize(); return 1; } printf("Processed %.1f Mbyte at triad benchmark in %.4f seconds: %.2f MByte/s\n", 1E-6*(4*SIZE*sizeof(DATATYPE)), triad_time, 1E-6*((4*SIZE*sizeof(DATATYPE))/triad_time)); ptr = strtok(estr,","); j = 0; while (ptr != NULL) { for (i = 0;i < numCPUs; i++) { result = perfmon_getResult(gid, j, cpus[i]); printf("Measurement result for event set %s at CPU %d: %f\n", ptr, cpus[i], result); } ptr = strtok(NULL,","); j++; } perfmon_finalize(); affinity_finalize(); topology_finalize(); return 0; }
int main() { int quantum, checktick(); int BytesPerWord; register int j, k; double scalar, t, times[4][NTIMES]; /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); printf("STREAM version $Revision: 5.8 $\n"); printf(HLINE); BytesPerWord = sizeof(double); printf("This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); printf(HLINE); printf("Array size = %d, Offset = %d\n" , N, OFFSET); printf("Total memory required = %.1f MB.\n", (3.0 * BytesPerWord) * ( (double) N / 1048576.0)); printf("Each test is run %d times, but only\n", NTIMES); printf("the *best* time for each is used.\n"); #ifdef LIKWID_PERFMON printf("Using likwid\n"); #endif LIKWID_MARKER_INIT; #ifdef _OPENMP printf(HLINE); #pragma omp parallel { LIKWID_MARKER_THREADINIT; #pragma omp master { k = omp_get_num_threads(); printf ("Number of Threads requested = %i\n",k); } printf ("Thread %d running on processor %d ....\n",omp_get_thread_num(),threadGetProcessorId()); } #endif LIKWID_MARKER_START("init"); /* Get initial value for system clock. */ //#pragma omp parallel for for (j=0; j<N; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } LIKWID_MARKER_STOP("init"); printf(HLINE); if ( (quantum = checktick()) >= 1) printf("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else { printf("Your clock granularity appears to be " "less than one microsecond.\n"); quantum = 1; } t = mysecond(); #pragma omp parallel for for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); printf("Each test below will take on the order" " of %d microseconds.\n", (int) t ); printf(" (= %d clock ticks)\n", (int) (t/quantum) ); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); printf("WARNING -- The above is only a rough guideline.\n"); printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #pragma omp parallel { LIKWID_MARKER_START("copy"); #pragma omp for for (j=0; j<N; j++) c[j] = a[j]; LIKWID_MARKER_STOP("copy"); } times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #pragma omp parallel { LIKWID_MARKER_START("scale"); #pragma omp for for (j=0; j<N; j++) b[j] = scalar*c[j]; LIKWID_MARKER_STOP("scale"); } times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #pragma omp parallel { LIKWID_MARKER_START("add"); #pragma omp for for (j=0; j<N; j++) c[j] = a[j]+b[j]; LIKWID_MARKER_STOP("add"); } times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #pragma omp parallel { LIKWID_MARKER_START("triad"); #pragma omp for for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; LIKWID_MARKER_STOP("triad"); } times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } printf("Function Rate (MB/s) Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]); } printf(HLINE); /* --- Check Results --- */ checkSTREAMresults(); printf(HLINE); LIKWID_MARKER_CLOSE; return 0; }
int main(int argc, char * argv[]) { long int t, i, j, k; const int BASE = 1024; // for timekeeping int ts_return = -1; struct timeval start, end, result; double tdiff = 0.0; int T; printf("Please enter number of timesteps = \n"); scanf("%d", &T); printf("Number of points = %ld\t|Number of timesteps = %ld\t", N*N, T); /* Initialization */ srand(42); // seed with a constant value to verify results for (i = 0; i < N+2; i++) { for (j = 0; j < N+2; j++) { A[0][i][j] = 1.0 * (rand() % BASE); } } #ifdef USE_LIKWID #pragma omp parallel { LIKWID_MARKER_START("Compute_omp"); } #endif #pragma acc data create(A[0:2][0:N+2][0:N+2]) { #pragma acc update device(A[0:2][0:N+2][0:N+2]) #ifdef TIME gettimeofday(&start, 0); #endif #pragma scop for (t = 0; t < T; t++) { #pragma acc kernels loop independent present(A[0:2][0:N+2][0:N+2]) for (i = 1; i < N+1; i++) { #pragma acc loop independent for (j = 1; j < N+1; j++) { A[(t+1)%2][i][j] = 0.125 * (A[t%2][i+1][j] - 2.0 * A[t%2][i][j] + A[t%2][i-1][j]) + 0.125 * (A[t%2][i][j+1] - 2.0 * A[t%2][i][j] + A[t%2][i][j-1]) + A[t%2][i][j]; } } } #pragma endscop #ifdef TIME gettimeofday(&end, 0); ts_return = timeval_subtract(&result, &end, &start); tdiff = (double)(result.tv_sec + result.tv_usec * 1.0e-6); printf("|Time taken = %7.5lfms\t", tdiff * 1.0e3); printf("|MFLOPS = %f\t", ((((double)NUM_FP_OPS * N *N * T) / tdiff) / 1000000L)); #endif #pragma acc update host(A[0:2][0:N+2][0:N+2]) } // acc data create #ifdef USE_LIKWID #pragma omp parallel { LIKWID_MARKER_STOP("Compute_omp"); } #endif #ifdef VERIFY for (i = 1; i < N+1; i++) { for (j = 1; j < N+1; j++) { total+= A[T%2][i][j] ; } } printf("|sum: %e\t", total); for (i = 1; i < N+1; i++) { for (j = 1; j < N+1; j++) { sum_err_sqr += (A[T%2][i][j] - (total/N))*(A[T%2][i][j] - (total/N)); } } printf("|rms(A) = %7.2f\t", sqrt(sum_err_sqr)); for (i = 1; i < N+1; i++) { for (j = 1; j < N+1; j++) { chtotal += ((char *)A[T%2][i])[j]; } } printf("|sum(rep(A)) = %d\n", chtotal); #endif return 0; }