int main(int argc, char *argv[]) { volatile uint64_t workcnt = 0; int nthreads; debug_printf("bomptest started.\n"); bench_init(); #if CONFIG_TRACE errval_t err = trace_control(TRACE_EVENT(TRACE_SUBSYS_ROUTE, TRACE_EVENT_ROUTE_BENCH_START, 0), TRACE_EVENT(TRACE_SUBSYS_ROUTE, TRACE_EVENT_ROUTE_BENCH_STOP, 0), 0); assert(err_is_ok(err)); #endif if(argc == 2) { nthreads = atoi(argv[1]); backend_span_domain(nthreads, STACK_SIZE); bomp_custom_init(NULL); omp_set_num_threads(nthreads); } else { assert(!"Specify number of threads"); } trace_event(TRACE_SUBSYS_ROUTE, TRACE_EVENT_ROUTE_BENCH_START, 0); uint64_t start = bench_tsc(); #pragma omp parallel while(rdtsc() < start + 805000000ULL) { workcnt++; } uint64_t end = bench_tsc(); trace_event(TRACE_SUBSYS_ROUTE, TRACE_EVENT_ROUTE_BENCH_STOP, 0); printf("done. time taken: %" PRIu64 " cycles.\n", end - start); #if CONFIG_TRACE char *buf = malloc(4096*4096); trace_dump(buf, 4096*4096, NULL); printf("%s\n", buf); #endif for(;;); return 0; }
/*
 * Times one OpenMP "parallel for" pass that fills a static array of N ints
 * with a[i] = 2*i.
 *
 * argv[1] is the number of OpenMP threads to use. The array is filled once
 * serially before the timed pass (presumably to touch the memory first), and
 * the rdtsc() difference across the parallel pass is printed.
 */
int main(int argc, char *argv[])
{
    uint64_t begin, end;
    int i;
    static int a[N];

#ifndef POSIX
    bomp_custom_init();
#endif

    assert(argc == 2);
    omp_set_num_threads(atoi(argv[1]));

    /* Untimed serial pass over the array. */
    for (i = 0; i < N; i++) {
        a[i] = 2 * i;
    }

    begin = rdtsc();
#pragma omp parallel for
    for (i = 0; i < N; i++) {
        a[i] = 2 * i;
    }
    end = rdtsc();

    /*
     * uint64_t is not guaranteed to be unsigned long, so convert explicitly
     * to a type with a matching printf conversion.
     */
    printf("Value of sum is %d, time taken %llu\n", 0,
           (unsigned long long)(end - begin));

    return 0;
}
int main(int argc, char *argv[]) { int nthreads = omp_get_max_threads(); if(argc == 2) { nthreads = atoi(argv[1]); backend_span_domain(nthreads, STACK_SIZE); bomp_custom_init(NULL); omp_set_num_threads(nthreads); } printf("threads %d, CPUs %d\n", nthreads, omp_get_num_procs()); volatile uint64_t exittime[ITERATIONS] = { 0 }; // Do some work #pragma omp parallel { #ifdef GANG_SCHEDULING bomp_synchronize(); #endif for(int i = 0; i < ITERATIONS; i++) { uint64_t start = rdtsc(); uint64_t workcn = 0; for(uint64_t n = 0;; n++) { #pragma omp barrier workcn++; if(omp_get_thread_num() == 0 && exittime[i] == 0 && rdtsc() >= start + PERIOD) { exittime[i] = n + 3; } if(exittime[i] != 0 && exittime[i] == n) { n++; break; } } /* char buf[64]; */ /* sprintf(buf, "%d: %lu(%lu)\n", omp_get_thread_num(), workcn, */ /* stuck[omp_get_thread_num()]); */ /* sys_print(buf, strlen(buf)); */ /* stuck[omp_get_thread_num()] = 0; */ workcnt[omp_get_thread_num()][i] = workcn; } } char buf[64]; for(int i = 0; i < ITERATIONS; i++) { for(int n = 0; n < nthreads; n++) { sprintf(buf, "%lu ", workcnt[n][i]); sys_print(buf, strlen(buf)); } sys_print("\n", 1); } /* sys_print("\n", 1); */ /* char buf[128], buf1[128]; */ /* sprintf(buf, "iterations in %lu ticks: ", PERIOD); */ /* for(int i = 0; i < nthreads; i++) { */ /* sprintf(buf1, "%lu ", workcnt[i]); */ /* strcat(buf, buf1); */ /* } */ /* sprintf(buf1, "\n"); */ /* strcat(buf, buf1); */ /* sys_print(buf, strlen(buf)); */ /* } */ for(;;); return 0; }
/* c This is the serial version of the APP Benchmark 1, c the "embarassingly parallel" benchmark. c c M is the Log_2 of the number of complex pairs of uniform (0, 1) random c numbers. MK is the Log_2 of the size of each batch of uniform random c numbers. MK can be set for convenience on a given system, since it does c not affect the results. */ int main(int argc, char **argv) { double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc; double dum[3] = { 1.0, 1.0, 1.0 }; int np, ierr, node, no_nodes, i, ik, kk, l, k, nit, ierrcode, no_large_nodes, np_add, k_offset, j; int nthreads = 1; boolean verified; char size[13+1]; /* character*13 */ /* c Because the size of the problem is too large to store in a 32-bit c integer for some classes, we put it into a string (for printing). c Have to strip off the decimal point put in there by the floating c point print statement (internal file) */ #ifndef POSIX #ifndef NOBOMP bomp_custom_init(); #endif #endif omp_set_num_threads(1); printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version" " - EP Benchmark\n"); sprintf(size, "%12.0f", pow(2.0, M+1)); for (j = 13; j >= 1; j--) { if (size[j] == '.') size[j] = ' '; } printf(" Number of random numbers generated: %13s\n", size); verified = FALSE; /* c Compute the number of "batches" of random number pairs generated c per processor. Adjust if the number of processors does not evenly c divide the total number */ np = NN; /* c Call the random number generator functions and initialize c the x-array to reduce the effects of paging on the timings. c Also, call all mathematical functions that are used. Make c sure these initializations cannot be eliminated as dead code. */ vranlc(0, &(dum[0]), dum[1], &(dum[2])); dum[0] = randlc(&(dum[1]), dum[2]); for (i = 0; i < 2*NK; i++) { x[i] = -1.0e99; } printf("Reached here "); Mops = log(sqrt(fabs(max(1.0, 1.0)))); timer_clear(1); timer_clear(2); timer_clear(3); timer_start(1); vranlc(0, &t1, A, x); /* Compute AN = A ^ (2 * NK) (mod 2^46). 
*/ t1 = A; for ( i = 1; i <= MK+1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; gc = 0.0; sx = 0.0; sy = 0.0; for ( i = 0; i <= NQ - 1; i++) { q[i] = 0.0; } /* c Each instance of this loop may be performed independently. We compute c the k offsets separately to take into account the fact that some nodes c have more numbers to generate than others */ k_offset = -1; #pragma omp parallel copyin(x) { double t1, t2, t3, t4, x1, x2; int kk, i, ik, l; double qq[NQ]; /* private copy of q[0:NQ-1] */ for (i = 0; i < NQ; i++) qq[i] = 0.0; #pragma omp for reduction(+:sx,sy) schedule(static) for (k = 1; k <= np; k++) { kk = k_offset + k; t1 = S; t2 = an; /* Find starting seed t1 for this kk. */ for (i = 1; i <= 100; i++) { ik = kk / 2; if (2 * ik != kk) t3 = randlc(&t1, t2); if (ik == 0) break; t3 = randlc(&t2, t2); kk = ik; } /* Compute uniform pseudorandom numbers. */ if (TIMERS_ENABLED == TRUE) timer_start(3); vranlc(2*NK, &t1, A, x-1); if (TIMERS_ENABLED == TRUE) timer_stop(3); /* c Compute Gaussian deviates by acceptance-rejection method and c tally counts in concentric square annuli. This loop is not c vectorizable. 
*/ if (TIMERS_ENABLED == TRUE) timer_start(2); for ( i = 0; i < NK; i++) { x1 = 2.0 * x[2*i] - 1.0; x2 = 2.0 * x[2*i+1] - 1.0; t1 = pow2(x1) + pow2(x2); if (t1 <= 1.0) { t2 = sqrt(-2.0 * log(t1) / t1); t3 = (x1 * t2); /* Xi */ t4 = (x2 * t2); /* Yi */ l = max(fabs(t3), fabs(t4)); qq[l] += 1.0; /* counts */ sx = sx + t3; /* sum of Xi */ sy = sy + t4; /* sum of Yi */ } } if (TIMERS_ENABLED == TRUE) timer_stop(2); } #pragma omp critical { for (i = 0; i <= NQ - 1; i++) q[i] += qq[i]; } #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* end of parallel region */ for (i = 0; i <= NQ-1; i++) { gc = gc + q[i]; } timer_stop(1); tm = timer_read(1); nit = 0; if (M == 24) { if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 25) { if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 28) { if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 30) { if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 32) { if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) && (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) { verified = TRUE; } } Mops = pow(2.0, M+1)/tm/1000000.0; printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. 
Gaussian Pairs = %15.0f\n" "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy); for (i = 0; i <= NQ-1; i++) { printf("%3d %15.0f\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, nthreads, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); if (TIMERS_ENABLED == TRUE) { printf("Total time: %f", timer_read(1)); printf("Gaussian pairs: %f", timer_read(2)); printf("Random numbers: %f", timer_read(3)); } }