/*
 * Small check program: calls vranlc() with the array pointer shifted back by
 * one element (x-1) and then requires that x[0] equals 0.5.
 *
 * `x` and `vranlc` are declared outside this excerpt.
 * NOTE(review): the x-1 shift presumably compensates for a callee that stores
 * starting at index 1 -- confirm against the vranlc definition.
 */
int main() {
  vranlc(x-1);
  /* Do not rewrite the call above as vranlc(x): that is a wrong translation. */
  assert (x[0]==0.5);
  return 0;
}
/*--------------------------------------------------------------------
c Fill in array u0 with initial conditions from the
c random number generator.
c
c   u0 : output; every element u0[k][j][i] with k < dims[0][2],
c        j < dims[0][1], i < NX receives a (real, imag) pair.
c   d  : not referenced by this routine; kept so callers are unaffected.
c
c Reads the file-scope values SEED, A, zstart, ystart and dims, which
c are defined outside this excerpt.
c-------------------------------------------------------------------*/
static void compute_initial_conditions(dcomplex u0[NZ][NY][NX], int d[3]) {
    int k;
    double x0, start, an, dummy;
    static double tmp[NX*2*MAXDIM+1];
    int i, j, t;

    start = SEED;

/*--------------------------------------------------------------------
c Jump to the starting element for our first plane.
c-------------------------------------------------------------------*/
    ipow46(A, (zstart[0]-1)*2*NX*NY + (ystart[0]-1)*2*NX, &an);
    dummy = randlc(&start, an);
    ipow46(A, 2*NX*NY, &an);

/*--------------------------------------------------------------------
c Go through by z planes filling in one square at a time.
c-------------------------------------------------------------------*/
    for (k = 0; k < dims[0][2]; k++) {
        x0 = start;
        vranlc(2*NX*dims[0][1], &x0, A, tmp);

        /* tmp is consumed starting at index 1 (the buffer carries one
           extra element); presumably vranlc fills tmp[1..n] -- confirm
           against the vranlc definition. */
        t = 1;
        for (j = 0; j < dims[0][1]; j++) {
            for (i = 0; i < NX; i++) {
                u0[k][j][i].real = tmp[t++];
                u0[k][j][i].imag = tmp[t++];
            }
        }

        /* Advance the seed to the next plane, except after the last one.
           The previous test (k != dims[0][2]) could never be false inside
           this zero-based loop, so the advance was never skipped. */
        if (k != dims[0][2] - 1) dummy = randlc(&start, an);
    }
}
/*--------------------------------------------------------------------
c zran3 loads +1 at ten randomly chosen points,
c loads -1 at a different ten random points,
c and zero elsewhere.
c
c   z          : flat array, addressed as z[i1 + n1*(i2 + n2*i3)]
c   n1, n2, n3 : extents used for that addressing
c   nx, ny     : used to derive the generator multipliers a1 = A^nx,
c                a2 = A^(nx*ny) and the starting offset
c   k          : forwarded unchanged to comm3()
c
c The bounds is1, is2, is3, ie1, ie2, ie3 and the helpers power(),
c randlc(), vranlc(), bubble(), zero3() and comm3() are defined outside
c this excerpt.
c-------------------------------------------------------------------*/
static void zran3(double *z, int n1, int n2, int n3, int nx, int ny, int k) {

/* These macros are defined inside the function body but, being
   preprocessor definitions, remain visible to the rest of the file. */
#define MM	10
#define	A	pow(5.0,13)
#define	X	314159265.e0

    int i0, m0, m1;
    int i1, i2, i3, d1, e1, e2, e3;
    double xx, x0, x1, a1, a2, ai;

    /* ten[.][1] / j?[.][1] track the largest candidates and their indices,
       ten[.][0] / j?[.][0] the smallest ones. */
    double ten[MM][2], best;
    int i, j1[MM][2], j2[MM][2], j3[MM][2];
    int jg[4][MM][2];

    double rdummy;

    a1 = power( A, nx );
    a2 = power( A, nx*ny );

    /* Clear z. The disabled alternative calls zero3(); the active one
       zeroes the array with an explicit triple loop. */
#if 0
#pragma omp parallel
    {
	zero3(z,n1,n2,n3);
    }
#else
#pragma omp parallel for private(i2, i1)
    for (i3 = 0;i3 < n3; i3++) {
	for (i2 = 0; i2 < n2; i2++) {
	    for (i1 = 0; i1 < n1; i1++) {
		int i123 = i1 + n1*(i2 + n2*i3);
		z[i123] = 0.0;
	    }
	}
    }
#endif

    /* Skip the generator ahead by i steps (multiplier A^i) before filling. */
    i = is1-1+nx*(is2-1+ny*(is3-1));

    ai = power( A, i );
    d1 = ie1 - is1 + 1;
    e1 = ie1 - is1 + 2;	/* computed but not read again in this routine */
    e2 = ie2 - is2 + 2;
    e3 = ie3 - is3 + 2;
    x0 = X;
    rdummy = randlc( &x0, ai );

    /* Fill rows of z with d1 random values each; x1 advances by a1 per row
       and x0 by a2 per plane. */
    for (i3 = 1; i3 < e3; i3++) {
	x1 = x0;
	for (i2 = 1; i2 < e2; i2++) {
            xx = x1;
            vranlc( d1, &xx, A, &(z[0+n1*(i2 + n2*i3)]));
            rdummy = randlc( &x1, a1 );
	}
	rdummy = randlc( &x0, a2 );
    }

/*--------------------------------------------------------------------
c       call comm3(z,n1,n2,n3)
c       call showall(z,n1,n2,n3)
c-------------------------------------------------------------------*/

/*--------------------------------------------------------------------
c     each processor looks for twenty candidates
c-------------------------------------------------------------------*/
    for (i = 0; i < MM; i++) {
	ten[i][1] = 0.0;
	j1[i][1] = 0;
	j2[i][1] = 0;
	j3[i][1] = 0;
	ten[i][0] = 1.0;
	j1[i][0] = 0;
	j2[i][0] = 0;
	j3[i][0] = 0;
    }

    /* Scan the interior points; slot 0 of each list is replaced whenever a
       better value is seen, then bubble() is called to reorder the list. */
    for (i3 = 1; i3 < n3-1; i3++) {
	for (i2 = 1; i2 < n2-1; i2++) {
            for (i1 = 1; i1 < n1-1; i1++) {
		int i123 = i1 + n1*(i2 + n2*i3);
		if ( z[i123] > ten[0][1] ) {
		    ten[0][1] = z[i123];
		    j1[0][1] = i1;
		    j2[0][1] = i2;
		    j3[0][1] = i3;
		    bubble( ten, j1, j2, j3, MM, 1 );
		}
		if ( z[i123] < ten[0][0] ) {
		    ten[0][0] = z[i123];
		    j1[0][0] = i1;
		    j2[0][0] = i2;
		    j3[0][0] = i3;
		    bubble( ten, j1, j2, j3, MM, 0 );
		}
	    }
	}
    }

/*--------------------------------------------------------------------
c     Now which of these are globally best?
c-------------------------------------------------------------------*/
    i1 = MM - 1;
    i0 = MM - 1;
    for (i = MM - 1 ; i >= 0; i--) {
	int j123 = j1[i1][1] + n1*(j2[i1][1] + n2*j3[i1][1]);
	best = z[j123];
	/* best was just read from z[j123], so this comparison always holds
	   here and the else branch is never taken. */
	if (best == z[j123]) {
            jg[0][i][1] = 0;
            jg[1][i][1] = is1 - 1 + j1[i1][1];
            jg[2][i][1] = is2 - 1 + j2[i1][1];
            jg[3][i][1] = is3 - 1 + j3[i1][1];
            i1 = i1-1;
	} else {
            jg[0][i][1] = 0;
            jg[1][i][1] = 0;
            jg[2][i][1] = 0;
            jg[3][i][1] = 0;
	}
	ten[i][1] = best;
	j123 = j1[i0][0] + n1*(j2[i0][0] + n2*j3[i0][0]);
	best = z[j123];
	/* Same always-true comparison for the list of smallest values. */
	if (best == z[j123]) {
            jg[0][i][0] = 0;
            jg[1][i][0] = is1 - 1 + j1[i0][0];
            jg[2][i][0] = is2 - 1 + j2[i0][0];
            jg[3][i][0] = is3 - 1 + j3[i0][0];
            i0 = i0-1;
	} else {
            jg[0][i][0] = 0;
            jg[1][i][0] = 0;
            jg[2][i][0] = 0;
            jg[3][i][0] = 0;
	}
	ten[i][0] = best;
    }
    /* First list positions that receive a charge in the loops below. */
    m1 = i1+1;
    m0 = i0+1;

    /* Disabled diagnostic output:
    printf(" negative charges at");
    for (i = 0; i < MM; i++) {
	if (i%5 == 0) printf("\n");
	printf(" (%3d,%3d,%3d)", jg[1][i][0], jg[2][i][0], jg[3][i][0]);
    }
    printf("\n positive charges at");
    for (i = 0; i < MM; i++) {
	if (i%5 == 0) printf("\n");
	printf(" (%3d,%3d,%3d)", jg[1][i][1], jg[2][i][1], jg[3][i][1]);
    }
    printf("\n small random numbers were\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %15.8e", ten[i][0]);
    }
    printf("\n and they were found on processor number\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %4d", jg[0][i][0]);
    }
    printf("\n large random numbers were\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %15.8e", ten[i][1]);
    }
    printf("\n and they were found on processor number\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %4d", jg[0][i][1]);
    }
    printf("\n");*/

    /* Clear z again before placing the charges; here the zero3() variant is
       the active one and the explicit loop is disabled. */
#if 0
#pragma omp parallel for private(i2, i1)
    for (i3 = 0; i3 < n3; i3++) {
	for (i2 = 0; i2 < n2; i2++) {
            for (i1 = 0; i1 < n1; i1++) {
		int i123 = i1 + n1*(i2+n2*i3);
		z[i123] = 0.0;
	    }
	}
    }
#else
#pragma omp parallel
    {
	zero3(z,n1,n2,n3);
    }
#endif

    /* NOTE(review): the data clause names jg, but the loops below index
       through j1, j2 and j3 -- confirm the intended clause. */
#pragma acc parallel present(z[0:n3*n2*n1]) copyin(jg)
    {
#pragma acc loop
    for (i = MM-1; i >= m0; i--) {
	int j123 = j1[i][0] + n1*(j2[i][0] + n2*j3[i][0]);
	z[j123] = -1.0;
    }
#pragma acc loop
    for (i = MM-1; i >= m1; i--) {
	int j123 = j1[i][1] + n1*(j2[i][1] + n2*j3[i][1]);
	z[j123] = 1.0;
    }
    } // end acc parallel
#pragma omp parallel
    comm3(z,n1,n2,n3,k);

/*--------------------------------------------------------------------
c          call showall(z,n1,n2,n3)
c-------------------------------------------------------------------*/
}
/* c This is the serial version of the APP Benchmark 1, c the "embarassingly parallel" benchmark. c c M is the Log_2 of the number of complex pairs of uniform (0, 1) random c numbers. MK is the Log_2 of the size of each batch of uniform random c numbers. MK can be set for convenience on a given system, since it does c not affect the results. */ int main(int argc, char **argv) { double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc; double dum[3] = { 1.0, 1.0, 1.0 }; int np, ierr, node, no_nodes, i, ik, kk, l, k, nit, ierrcode, no_large_nodes, np_add, k_offset, j; int nthreads = 1; boolean verified; char size[13+1]; /* character*13 */ /* c Because the size of the problem is too large to store in a 32-bit c integer for some classes, we put it into a string (for printing). c Have to strip off the decimal point put in there by the floating c point print statement (internal file) */ printf("\n\n NAS Parallel Benchmarks 3.0 structured OpenMP C version" " - EP Benchmark\n"); sprintf(size, "%12.0f", pow(2.0, M+1)); for (j = 13; j >= 1; j--) { if (size[j] == '.') size[j] = ' '; } printf(" Number of random numbers generated: %13s\n", size); verified = FALSE; /* c Compute the number of "batches" of random number pairs generated c per processor. Adjust if the number of processors does not evenly c divide the total number */ np = NN; /* c Call the random number generator functions and initialize c the x-array to reduce the effects of paging on the timings. c Also, call all mathematical functions that are used. Make c sure these initializations cannot be eliminated as dead code. */ vranlc(0, &(dum[0]), dum[1], &(dum[2])); dum[0] = randlc(&(dum[1]), dum[2]); #pragma omp parallel for default(shared) private(i) for (i = 0; i < 2*NK; i++) x[i] = -1.0e99; Mops = log(sqrt(fabs(max(1.0, 1.0)))); timer_clear(1); timer_clear(2); timer_clear(3); timer_start(1); vranlc(0, &t1, A, x); /* Compute AN = A ^ (2 * NK) (mod 2^46). 
*/ t1 = A; for ( i = 1; i <= MK+1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; gc = 0.0; sx = 0.0; sy = 0.0; for ( i = 0; i <= NQ - 1; i++) { q[i] = 0.0; } /* c Each instance of this loop may be performed independently. We compute c the k offsets separately to take into account the fact that some nodes c have more numbers to generate than others */ k_offset = -1; #pragma omp parallel copyin(x) { double t1, t2, t3, t4, x1, x2; int kk, i, ik, l; double qq[NQ]; /* private copy of q[0:NQ-1] */ for (i = 0; i < NQ; i++) qq[i] = 0.0; #pragma omp for reduction(+:sx,sy) schedule(static) for (k = 1; k <= np; k++) { kk = k_offset + k; t1 = S; t2 = an; /* Find starting seed t1 for this kk. */ for (i = 1; i <= 100; i++) { ik = kk / 2; if (2 * ik != kk) t3 = randlc(&t1, t2); if (ik == 0) break; t3 = randlc(&t2, t2); kk = ik; } /* Compute uniform pseudorandom numbers. */ if (TIMERS_ENABLED == TRUE) timer_start(3); vranlc(2*NK, &t1, A, x-1); if (TIMERS_ENABLED == TRUE) timer_stop(3); /* c Compute Gaussian deviates by acceptance-rejection method and c tally counts in concentric square annuli. This loop is not c vectorizable. 
*/ if (TIMERS_ENABLED == TRUE) timer_start(2); for ( i = 0; i < NK; i++) { x1 = 2.0 * x[2*i] - 1.0; x2 = 2.0 * x[2*i+1] - 1.0; t1 = pow2(x1) + pow2(x2); if (t1 <= 1.0) { t2 = sqrt(-2.0 * log(t1) / t1); t3 = (x1 * t2); /* Xi */ t4 = (x2 * t2); /* Yi */ l = max(fabs(t3), fabs(t4)); qq[l] += 1.0; /* counts */ sx = sx + t3; /* sum of Xi */ sy = sy + t4; /* sum of Yi */ } } if (TIMERS_ENABLED == TRUE) timer_stop(2); } #pragma omp critical { for (i = 0; i <= NQ - 1; i++) q[i] += qq[i]; } #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* end of parallel region */ for (i = 0; i <= NQ-1; i++) { gc = gc + q[i]; } timer_stop(1); tm = timer_read(1); nit = 0; if (M == 24) { if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 25) { if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 28) { if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 30) { if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 32) { if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) && (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) { verified = TRUE; } } Mops = pow(2.0, M+1)/tm/1000000.0; printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. 
Gaussian Pairs = %15.0f\n" "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy); for (i = 0; i <= NQ-1; i++) { printf("%3d %15.0f\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, nthreads, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); if (TIMERS_ENABLED == TRUE) { printf("Total time: %f", timer_read(1)); printf("Gaussian pairs: %f", timer_read(2)); printf("Random numbers: %f", timer_read(3)); } }
int main() { double Mops, t1, t2, t3, t4, x1, x2; double sx, sy, tm, an, tt, gc; double sx_verify_value, sy_verify_value, sx_err, sy_err; int np; int i, ik, kk, l, k, nit; int k_offset, j; logical verified, timers_enabled; double dum[3] = {1.0, 1.0, 1.0}; char size[16]; FILE *fp; if ((fp = fopen("timer.flag", "r")) == NULL) { timers_enabled = false; } else { timers_enabled = true; fclose(fp); } //-------------------------------------------------------------------- // Because the size of the problem is too large to store in a 32-bit // integer for some classes, we put it into a string (for printing). // Have to strip off the decimal point put in there by the floating // point print statement (internal file) //-------------------------------------------------------------------- sprintf(size, "%15.0lf", pow(2.0, M+1)); j = 14; if (size[j] == '.') j--; size[j+1] = '\0'; printf("\n\n NAS Parallel Benchmarks (NPB3.3-SER-C) - EP Benchmark\n"); printf("\n Number of random numbers generated: %15s\n", size); verified = false; //-------------------------------------------------------------------- // Compute the number of "batches" of random number pairs generated // per processor. Adjust if the number of processors does not evenly // divide the total number //-------------------------------------------------------------------- np = NN; //-------------------------------------------------------------------- // Call the random number generator functions and initialize // the x-array to reduce the effects of paging on the timings. // Also, call all mathematical functions that are used. Make // sure these initializations cannot be eliminated as dead code. 
//-------------------------------------------------------------------- vranlc(0, &dum[0], dum[1], &dum[2]); dum[0] = randlc(&dum[1], dum[2]); for (i = 0; i < 2 * NK; i++) { x[i] = -1.0e99; } Mops = log(sqrt(fabs(MAX(1.0, 1.0)))); timer_clear(0); timer_clear(1); timer_clear(2); timer_start(0); t1 = A; vranlc(0, &t1, A, x); //-------------------------------------------------------------------- // Compute AN = A ^ (2 * NK) (mod 2^46). //-------------------------------------------------------------------- t1 = A; for (i = 0; i < MK + 1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; gc = 0.0; sx = 0.0; sy = 0.0; for (i = 0; i < NQ; i++) { q[i] = 0.0; } //-------------------------------------------------------------------- // Each instance of this loop may be performed independently. We compute // the k offsets separately to take into account the fact that some nodes // have more numbers to generate than others //-------------------------------------------------------------------- k_offset = -1; for (k = 1; k <= np; k++) { kk = k_offset + k; t1 = S; t2 = an; // Find starting seed t1 for this kk. for (i = 1; i <= 100; i++) { ik = kk / 2; if ((2 * ik) != kk) t3 = randlc(&t1, t2); if (ik == 0) break; t3 = randlc(&t2, t2); kk = ik; } //-------------------------------------------------------------------- // Compute uniform pseudorandom numbers. //-------------------------------------------------------------------- if (timers_enabled) timer_start(2); vranlc(2 * NK, &t1, A, x); if (timers_enabled) timer_stop(2); //-------------------------------------------------------------------- // Compute Gaussian deviates by acceptance-rejection method and // tally counts in concentri//square annuli. This loop is not // vectorizable. 
//-------------------------------------------------------------------- if (timers_enabled) timer_start(1); for (i = 0; i < NK; i++) { x1 = 2.0 * x[2*i] - 1.0; x2 = 2.0 * x[2*i+1] - 1.0; t1 = x1 * x1 + x2 * x2; if (t1 <= 1.0) { t2 = sqrt(-2.0 * log(t1) / t1); t3 = (x1 * t2); t4 = (x2 * t2); l = MAX(fabs(t3), fabs(t4)); q[l] = q[l] + 1.0; sx = sx + t3; sy = sy + t4; } } if (timers_enabled) timer_stop(1); } for (i = 0; i < NQ; i++) { gc = gc + q[i]; } timer_stop(0); tm = timer_read(0); nit = 0; verified = true; if (M == 24) { sx_verify_value = -3.247834652034740e+3; sy_verify_value = -6.958407078382297e+3; } else if (M == 25) { sx_verify_value = -2.863319731645753e+3; sy_verify_value = -6.320053679109499e+3; } else if (M == 28) { sx_verify_value = -4.295875165629892e+3; sy_verify_value = -1.580732573678431e+4; } else if (M == 30) { sx_verify_value = 4.033815542441498e+4; sy_verify_value = -2.660669192809235e+4; } else if (M == 32) { sx_verify_value = 4.764367927995374e+4; sy_verify_value = -8.084072988043731e+4; } else if (M == 36) { sx_verify_value = 1.982481200946593e+5; sy_verify_value = -1.020596636361769e+5; } else if (M == 40) { sx_verify_value = -5.319717441530e+05; sy_verify_value = -3.688834557731e+05; } else { verified = false; } if (verified) { sx_err = fabs((sx - sx_verify_value) / sx_verify_value); sy_err = fabs((sy - sy_verify_value) / sy_verify_value); verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON)); } Mops = pow(2.0, M+1) / tm / 1000000.0; printf("\nEP Benchmark Results:\n\n"); printf("CPU Time =%10.4lf\n", tm); printf("N = 2^%5d\n", M); printf("No. 
Gaussian Pairs = %15.0lf\n", gc); printf("Sums = %25.15lE %25.15lE\n", sx, sy); printf("Counts: \n"); for (i = 0; i < NQ; i++) { printf("%3d%15.0lf\n", i, q[i]); } print_results("EP", CLASS, M+1, 0, 0, nit, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); if (timers_enabled) { if (tm <= 0.0) tm = 1.0; tt = timer_read(0); printf("\nTotal time: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); tt = timer_read(1); printf("Gaussian pairs: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); tt = timer_read(2); printf("Random numbers: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); } return 0; }
/* c This is the serial version of the APP Benchmark 1, c the "embarassingly parallel" benchmark. c c M is the Log_2 of the number of complex pairs of uniform (0, 1) random c numbers. MK is the Log_2 of the size of each batch of uniform random c numbers. MK can be set for convenience on a given system, since it does c not affect the results. */ int main(int argc, char **argv) { double *x, **xx, *q, **qq; double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc; double dum[3] = { 1.0, 1.0, 1.0 }; const int TRANSFER_X = 1; int np, nn, ierr, node, no_nodes, i, l, k, nit, ierrcode, no_large_nodes, np_add, k_offset, j; double loc_x,loc_t1,loc_t2,loc_t3,loc_t4; double loc_a1,loc_a2,loc_x1,loc_x2,loc_z; boolean verified; char size[13+1]; /* character*13 */ /* Allocate working memory */ x = (double*) malloc(sizeof(double) * 2*NK); xx = (double**) malloc(sizeof(double*) * NN); xx[0] = (double*) malloc(sizeof(double) * NN * 2*NK); for (i = 1; i < NN; i++) xx[i] = xx[i-1] + (2*NK); q = (double*) malloc(sizeof(double) * NQ); qq = (double**) malloc(sizeof(double*) * NN); qq[0] = (double*) malloc(sizeof(double) * NN * NQ); for (i = 1; i < NN; i++) qq[i] = qq[i-1] + NQ; /* c Because the size of the problem is too large to store in a 32-bit c integer for some classes, we put it into a string (for printing). c Have to strip off the decimal point put in there by the floating c point print statement (internal file) */ printf("\n\n NAS Parallel Benchmarks 2.3 OpenACC C version" " - EP Benchmark\n"); sprintf(size, "%12.0f", pow(2.0, M+1)); for (j = 13; j >= 1; j--) { if (size[j] == '.') size[j] = ' '; } printf(" Number of random numbers generated: %13s\n", size); verified = FALSE; /* c Compute the number of "batches" of random number pairs generated c per processor. Adjust if the number of processors does not evenly c divide the total number */ np = NN; /* c Call the random number generator functions and initialize c the x-array to reduce the effects of paging on the timings. 
c Also, call all mathematical functions that are used. Make c sure these initializations cannot be eliminated as dead code. */ #pragma acc data create(qq[0:NN][0:NQ],x[0:2*NK],xx[0:NN][0:2*NK]) \ copyout(q[0:NQ]) { vranlc(0, &(dum[0]), dum[1], &(dum[2])); dum[0] = randlc(&(dum[1]), dum[2]); for (i = 0; i < 2*NK; i++) x[i] = -1.0e99; Mops = log(sqrt(fabs(max(1.0, 1.0)))); timer_clear(1); timer_clear(2); timer_clear(3); timer_start(1); vranlc(0, &t1, A, x); #pragma acc update device(x[0:2*NK]) /* Compute AN = A ^ (2 * NK) (mod 2^46). */ t1 = A; for ( i = 1; i <= MK+1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; gc = 0.0; sx = 0.0; sy = 0.0; #pragma acc parallel loop for (k = 0; k < np; k++) { /* Initialize private q (qq) */ #pragma acc loop for (i = 0; i < NQ; i++) qq[k][i] = 0.0; /* Initialize private x (xx) */ #pragma acc loop for (i = 0; i < 2*NK; i++) xx[k][i] = x[i]; } /* c Each instance of this loop may be performed independently. We compute c the k offsets separately to take into account the fact that some nodes c have more numbers to generate than others */ k_offset = -1; double t1, t2, t3, t4, x1, x2; int kk, i, ik, l; double psx, psy; #pragma acc parallel loop reduction(+:sx,sy) for (k = 1; k <= np; k++) { kk = k_offset + k; t1 = S; t2 = an; /* Find starting seed t1 for this kk. */ #pragma acc loop seq for (i = 1; i <= 100; i++) { ik = kk / 2; if (2 * ik != kk) t3 = RANDLC(&t1, t2); if (ik == 0) break; t3 = RANDLC(&t2, t2); kk = ik; } /* Compute uniform pseudorandom numbers. 
*/ loc_t1 = r23 * A; loc_a1 = (int)loc_t1; loc_a2 = A - t23 * loc_a1; loc_x = t1; #pragma acc loop seq for (i = 1; i <= 2*NK; i++) { loc_t1 = r23 * loc_x; loc_x1 = (int)loc_t1; loc_x2 = loc_x - t23 * loc_x1; loc_t1 = loc_a1 * loc_x2 + loc_a2 * loc_x1; loc_t2 = (int)(r23 * loc_t1); loc_z = loc_t1 - t23 * loc_t2; loc_t3 = t23 * loc_z + loc_a2 * loc_x2; loc_t4 = (int)(r46 * loc_t3); loc_x = loc_t3 - t46 * loc_t4; xx[k-1][i-1] = r46 * loc_x; } t1 = loc_x; /* c Compute Gaussian deviates by acceptance-rejection method and c tally counts in concentric square annuli. This loop is not c vectorizable. */ psx = psy = 0.0; #pragma acc loop reduction(+:psx,psy) for ( i = 0; i < NK; i++) { x1 = 2.0 * xx[k-1][2*i] - 1.0; x2 = 2.0 * xx[k-1][2*i+1] - 1.0; t1 = pow2(x1) + pow2(x2); if (t1 <= 1.0) { t2 = sqrt(-2.0 * log(t1) / t1); t3 = (x1 * t2); /* Xi */ t4 = (x2 * t2); /* Yi */ l = max(fabs(t3), fabs(t4)); qq[k-1][l] += 1.0; /* counts */ psx = psx + t3; /* sum of Xi */ psy = psy + t4; /* sum of Yi */ } } sx += psx; sy += psy; } /* Reduce private qq to q */ #pragma acc parallel loop reduction(+:gc) for ( i = 0; i < NQ; i++ ) { double sumq = 0.0; #pragma acc loop reduction(+:sumq) for (k = 0; k < np; k++) sumq = sumq + qq[k][i]; q[i] = sumq; gc += sumq; } } /* end acc data */ timer_stop(1); tm = timer_read(1); nit = 0; if (M == 24) { if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 25) { if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 28) { if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 30) { if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 32) { if ((fabs((sx- (4.764367927995374e4))/sx) <= 
EPSILON) && (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) { verified = TRUE; } } Mops = pow(2.0, M+1)/tm/1000000.0; printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n" "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy); for (i = 0; i <= NQ-1; i++) { printf("%3d %15.0f\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); return 0; }
static void __ompc_func_3 (void **__ompc_args) { auto double *_pp_sx; auto double *_pp_sy; auto int *_pp_np; auto int *_pp_k_offset; auto double *_pp_an; auto int *_pp_nthreads; auto double *_ppthd_x; (_ppthd_x) = (((double *) (_ompc_get_thdprv (&_thdprv_x, 1048576, x)))); (_pp_sx) = (((double *) (*__ompc_args))); (_pp_sy) = (((double *) (*((__ompc_args) + (1))))); (_pp_np) = (((int *) (*((__ompc_args) + (2))))); (_pp_k_offset) = (((int *) (*((__ompc_args) + (3))))); (_pp_an) = (((double *) (*((__ompc_args) + (4))))); (_pp_nthreads) = (((int *) (*((__ompc_args) + (5))))); _ompc_copyin_thdprv (_ppthd_x, x, 1048576); { auto double t1; auto double t2; auto double t3; auto double t4; auto double x1; auto double x2; auto int kk; auto int i; auto int ik; auto int l; auto double qq[10]; # 150 "ep.c" for ((i) = (0); (i) < (10); (i)++) { (*((qq) + (i))) = (0.0); } { auto double _p_sx; auto double _p_sy; auto int _p_k; auto int _p_k_0; auto int _p_k_1; auto int _p_k_2; (_p_sy) = (0.0); (_p_sx) = (0.0); (_p_k_0) = (1); (_p_k_1) = ((*_pp_np) + (1)); (_p_k_2) = (1); _ompc_static_bsched (&_p_k_0, &_p_k_1, &_p_k_2); # 153 "ep.c" for ((_p_k) = (_p_k_0); (_p_k) < (_p_k_1); (_p_k) += (_p_k_2)) { # 154 "ep.c" (kk) = ((*_pp_k_offset) + (_p_k)); # 155 "ep.c" (t1) = (2.71828183E8); # 156 "ep.c" (t2) = (*_pp_an); # 160 "ep.c" for ((i) = (1); (i) <= (100); (i)++) { # 161 "ep.c" (ik) = ((kk) / (2)); # 162 "ep.c" if (((2) * (ik)) != (kk)) { (t3) = (randlc (&(t1), t2)); } # 163 "ep.c" if ((ik) == (0)) # 163 "ep.c" break; # 164 "ep.c" (t3) = (randlc (&(t2), t2)); # 165 "ep.c" (kk) = (ik); } # 170 "ep.c" if ((0) == (1)) { timer_start (3); } # 171 "ep.c" vranlc ((2) * ((1) << (16)), &(t1), 1.220703125E9, (_ppthd_x) - (1)); # 172 "ep.c" if ((0) == (1)) { timer_stop (3); } # 179 "ep.c" if ((0) == (1)) { timer_start (2); } # 181 "ep.c" for ((i) = (0); (i) < ((1) << (16)); (i)++) { # 182 "ep.c" (x1) = (((2.0) * (*((_ppthd_x) + ((2) * (i))))) - (1.0)); # 183 "ep.c" (x2) = (((2.0) * (*((_ppthd_x) + 
(((2) * (i)) + (1))))) - (1.0)); # 184 "ep.c" (t1) = (((x1) * (x1)) + ((x2) * (x2))); # 185 "ep.c" if ((t1) <= (1.0)) { # 186 "ep.c" (t2) = (sqrt (((-(2.0)) * (log (t1))) / (t1))); # 187 "ep.c" (t3) = ((x1) * (t2)); # 188 "ep.c" (t4) = ((x2) * (t2)); # 189 "ep.c" (l) = (((int) (((fabs (t3)) > (fabs (t4))) ? (fabs (t3)) : (fabs (t4))))); # 190 "ep.c" (*((qq) + (l))) += (1.0); # 191 "ep.c" (_p_sx) = ((_p_sx) + (t3)); # 192 "ep.c" (_p_sy) = ((_p_sy) + (t4)); } } # 195 "ep.c" if ((0) == (1)) { timer_stop (2); } } _ompc_reduction (&_p_sy, _pp_sy, 14, 6); _ompc_reduction (&_p_sx, _pp_sx, 14, 6); _ompc_barrier (); } { _ompc_enter_critical (&__ompc_lock_critical); # 199 "ep.c" for ((i) = (0); (i) <= ((10) - (1)); (i)++) { (*((q) + (i))) += (*((qq) + (i))); } _ompc_exit_critical (&__ompc_lock_critical); } if (_ompc_is_master ()) { (*_pp_nthreads) = (omp_get_num_threads ()); } } }
int main (int argc, char **argv) { //auto double *_ppthd_x; auto double Mops; auto double t1; auto double t2; auto double t3; auto double t4; auto double x1; auto double x2; auto double sx; auto double sy; auto double tm; auto double an; auto double tt; auto double gc; auto double dum[3]; auto int np; auto int ierr; auto int node; auto int no_nodes; auto int i; auto int ik; auto int kk; auto int l; auto int k; auto int nit; auto int ierrcode; auto int no_large_nodes; auto int np_add; auto int k_offset; auto int j; auto int nthreads; auto int verified; auto char size[14]; int status = 0; _ompc_init(argc,argv); //(_ppthd_x) = (((double *) (_ompc_get_thdprv (&_thdprv_x, 1048576, x)))); (*(dum)) = (1.0); (*((dum) + (1))) = (1.0); (*((dum) + (2))) = (1.0); (nthreads) = (1); # 84 "ep.c" printf ("\012\012 NAS Parallel Benchmarks 2.3 OpenMP C version - EP Benchmark\012"); # 86 "ep.c" sprintf (size, "%12.0f", pow (2.0, (28) + (1))); # 87 "ep.c" for ((j) = (13); (j) >= (1); (j)--) { # 88 "ep.c" if ((((int) (*((size) + (j))))) == (46)) { (*((size) + (j))) = (((char) (32))); } } # 90 "ep.c" printf (" Number of random numbers generated: %13s\012", size); # 92 "ep.c" (verified) = (0); # 99 "ep.c" (np) = ((1) << ((28) - (16))); # 107 "ep.c" vranlc (0, (dum) + (0), *((dum) + (1)), (dum) + (2)); # 108 "ep.c" (*((dum) + (0))) = (randlc ((dum) + (1), *((dum) + (2)))); # 109 "ep.c" for ((i) = (0); (i) < ((2) * ((1) << (16))); (i)++) { x[i] = (-(1.0E99)); //(*((_ppthd_x) + (i))) = (-(1.0E99)); } # 110 "ep.c" (Mops) = (log (sqrt (fabs (((1.0) > (1.0)) ? 
(1.0) : (1.0))))); # 112 "ep.c" timer_clear (1); # 113 "ep.c" timer_clear (2); # 114 "ep.c" timer_clear (3); # 115 "ep.c" timer_start (1); # 117 "ep.c" vranlc (0, &(t1), 1.220703125E9, x); //vranlc (0, &(t1), 1.220703125E9, _ppthd_x); # 121 "ep.c" (t1) = (1.220703125E9); # 123 "ep.c" for ((i) = (1); (i) <= ((16) + (1)); (i)++) { # 124 "ep.c" (t2) = (randlc (&(t1), t1)); } # 127 "ep.c" (an) = (t1); # 128 "ep.c" (tt) = (2.71828183E8); # 129 "ep.c" (gc) = (0.0); # 130 "ep.c" (sx) = (0.0); # 131 "ep.c" (sy) = (0.0); # 133 "ep.c" for ((i) = (0); (i) <= ((10) - (1)); (i)++) { # 134 "ep.c" (*((q) + (i))) = (0.0); } # 142 "ep.c" (k_offset) = (-(1)); { auto void *__ompc_argv[6]; (*(__ompc_argv)) = (((void *) (&sx))); (*((__ompc_argv) + (1))) = (((void *) (&sy))); (*((__ompc_argv) + (2))) = (((void *) (&np))); (*((__ompc_argv) + (3))) = (((void *) (&k_offset))); (*((__ompc_argv) + (4))) = (((void *) (&an))); (*((__ompc_argv) + (5))) = (((void *) (&nthreads))); _ompc_do_parallel (__ompc_func_3, __ompc_argv); } # 207 "ep.c" for ((i) = (0); (i) <= ((10) - (1)); (i)++) { # 208 "ep.c" (gc) = ((gc) + (*((q) + (i)))); } # 211 "ep.c" timer_stop (1); # 212 "ep.c" (tm) = (timer_read (1)); # 214 "ep.c" (nit) = (0); # 215 "ep.c" if ((28) == (24)) { # 216 "ep.c" if (((fabs (((sx) - (-(3247.83465203474))) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(6958.407078382297))) / (sy))) <= (1.0E-8))) { # 218 "ep.c" (verified) = (1); } } else # 220 "ep.c" if ((28) == (25)) { # 221 "ep.c" if (((fabs (((sx) - (-(2863.319731645753))) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(6320.053679109499))) / (sy))) <= (1.0E-8))) { # 223 "ep.c" (verified) = (1); } } else # 225 "ep.c" if ((28) == (28)) { # 226 "ep.c" if (((fabs (((sx) - (-(4295.875165629892))) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(15807.32573678431))) / (sy))) <= (1.0E-8))) { # 228 "ep.c" (verified) = (1); printf("Debug:ompc_manual. 
359, sx is:%f, sy is:%f\n",sx,sy); } } else # 230 "ep.c" if ((28) == (30)) { # 231 "ep.c" if (((fabs (((sx) - (40338.15542441498)) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(26606.69192809235))) / (sy))) <= (1.0E-8))) { # 233 "ep.c" (verified) = (1); } } else # 235 "ep.c" if ((28) == (32)) { # 236 "ep.c" if (((fabs (((sx) - (47643.67927995374)) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(80840.72988043731))) / (sy))) <= (1.0E-8))) { # 238 "ep.c" (verified) = (1); } } # 242 "ep.c" (Mops) = (((pow (2.0, (28) + (1))) / (tm)) / (1000000.0)); # 244 "ep.c" printf ("EP Benchmark Results: \012CPU Time = %10.4f\012N = 2^%5d\012No. Gaussian Pairs = %15.0f\012Sums = %25.15e %25.15e\012Counts:\012", tm, 28, gc, sx, sy); # 251 "ep.c" for ((i) = (0); (i) <= ((10) - (1)); (i)++) { # 252 "ep.c" printf ("%3d %15.0f\012", i, *((q) + (i))); } # 255 "ep.c" c_print_results ("EP", 65, (28) + (1), 0, 0, nit, nthreads, tm, Mops, "Random numbers generated", verified, "2.3", "07 Aug 2006", "omcc", "$(CC)", "(none)", "-I../common", "-t", "-lm", "randdp"); # 261 "ep.c" if ((0) == (1)) { # 262 "ep.c" printf ("Total time: %f", timer_read (1)); # 263 "ep.c" printf ("Gaussian pairs: %f", timer_read (2)); # 264 "ep.c" printf ("Random numbers: %f", timer_read (3)); } }