/*
 * create_seq: fill one thread's slice of key_array with generated keys.
 *
 * The NUM_KEYS indices are divided into NUM_THREADS contiguous slices of
 * ceil(NUM_KEYS / NUM_THREADS) entries; myId selects the slice and the last
 * slice is clipped to NUM_KEYS.  The starting seed for the slice is obtained
 * from find_my_seed().  Each key is (MAX_KEY/4) times the sum of four
 * consecutive randlc() results.
 */
void create_seq( int myId )
{
    double   base_seed = 314159265.00;
    double   mult      = 1220703125.00;
    double   stream;
    double   total;
    INT_TYPE slice_len;
    INT_TYPE scale;
    INT_TYPE idx;
    int      first;
    int      limit;
    int      extra;

    /* Slice bounds [first, limit) for this thread. */
    slice_len = (NUM_KEYS + NUM_THREADS - 1) / NUM_THREADS;
    first = slice_len * myId;
    limit = first + slice_len;
    if ( limit > NUM_KEYS )
        limit = NUM_KEYS;

    stream = find_my_seed( myId, NUM_THREADS, (long)4*NUM_KEYS, base_seed, mult );
    scale = MAX_KEY/4;

    for (idx = first; idx < limit; idx++) {
        /* Sum of four draws, accumulated left to right. */
        total = randlc(&stream, &mult);
        for (extra = 0; extra < 3; extra++)
            total += randlc(&stream, &mult);
        key_array[idx] = scale*total;
    }
}
/*
 * find_my_seed: return the seed at which processor `kn` starts drawing.
 *
 *   kn  my processor rank, 0<=kn<=num procs
 *   np  number of processors
 *   nn  total number of random numbers, all procs
 *   s   random number seed, for ex.: 314159265.00
 *   a   random number generator multiplier, try 1220703125.00
 *
 * Rank 0 gets `s` unchanged.  Otherwise the number of values to skip is
 * 4 * ceil((nn/4) / np) * kn, and that count is consumed by a
 * halve-or-decrement loop built on randlc().
 */
double find_my_seed( int kn, int np, long nn, double s, double a )
{
    double seed_acc;
    double mult_acc;
    long   per_proc;
    long   remaining;

    if ( kn == 0 )
        return s;

    per_proc  = (nn/4 + np - 1) / np;
    remaining = per_proc * 4 * kn;      /* number of rans to be skipped */

    seed_acc = s;
    mult_acc = a;

    for ( ; remaining > 1; ) {
        if ( remaining % 2 == 0 ) {
            /* even count: update the multiplier with itself, halve the count */
            (void)randlc( &mult_acc, &mult_acc );
            remaining /= 2;
        } else {
            /* odd count: apply the current multiplier once to the seed */
            (void)randlc( &seed_acc, &mult_acc );
            remaining -= 1;
        }
    }
    (void)randlc( &seed_acc, &mult_acc );

    return seed_acc;
}
//---------------------------------------------------------------------
// ipow46: compute a^exponent mod 2^46 and return it.
//
// Uses the recurrence
//   a^n = a^(n/2) * a^(n/2)   when n is even
//   a^n = a * a^(n-1)         when n is odd
// with every multiplication performed through randlc().
// An exponent of 0 yields 1.
//---------------------------------------------------------------------
static double ipow46(double a, int exponent)
{
  double base = a;     // running power of a
  double acc  = 1;     // accumulated product
  int    left;

  if (exponent == 0) {
    return 1;
  }

  for (left = exponent; left > 1; ) {
    if (left % 2 == 0) {
      (void)randlc(&base, base);
      left /= 2;
    } else {
      (void)randlc(&acc, base);
      left -= 1;
    }
  }
  (void)randlc(&acc, base);

  return acc;
}
//--------------------------------------------------------------------- // generate a sparse n-vector (v, iv) // having nzv nonzeros // // mark(i) is set to 1 if position i is nonzero. // mark is all zero on entry and is reset to all zero before exit // this corrects a performance bug found by John G. Lewis, caused by // reinitialization of mark on every one of the n calls to sprnvc //--------------------------------------------------------------------- static void sprnvc(int n, int nz, int nn1, double v[], int iv[]) { int nzv, ii, i; double vecelt, vecloc; nzv = 0; while (nzv < nz) { vecelt = randlc(&tran, amult); //--------------------------------------------------------------------- // generate an integer between 1 and n in a portable manner //--------------------------------------------------------------------- vecloc = randlc(&tran, amult); i = icnvrt(vecloc, nn1) + 1; if (i > n) continue; //--------------------------------------------------------------------- // was this integer generated already? //--------------------------------------------------------------------- logical was_gen = false; for (ii = 0; ii < nzv; ii++) { if (iv[ii] == i) { was_gen = true; break; } } if (was_gen) continue; v[nzv] = vecelt; iv[nzv] = i; nzv = nzv + 1; } }
/*--------------------------------------------------------------------
c  power raises an integer, disguised as a double
c  precision real, to an integer power.
c
c  The exponent is consumed bit by bit: when the current low bit is set
c  the accumulator is multiplied by the current base, and the base is
c  updated with itself on every step.  All products go through randlc().
c-------------------------------------------------------------------*/
static double power( double a, int n )
{
    double acc  = 1.0;
    double base = a;
    int    bits;

    for (bits = n; bits != 0; bits = bits/2) {
        if ( (bits%2) == 1 ) {
            (void)randlc( &acc, base );
        }
        (void)randlc( &base, base );
    }

    return acc;
}
/*---------------------------------------------------------------------
c  sprnvc: generate a sparse n-vector (v, iv) having nz nonzeros.
c
c  mark[i] is set to 1 while position i is in use.  mark must be all
c  zero on entry; the positions touched are remembered in nzloc so that
c  only those entries are reset to zero before returning.
c
c  All four arrays are indexed from 1.
---------------------------------------------------------------------*/
static void sprnvc(
    int n,
    int nz,
    double v[],         /* v[1:*] */
    int iv[],           /* iv[1:*] */
    int nzloc[],        /* nzloc[1:n] */
    int mark[] )        /* mark[1:n] */
{
    int    range;
    int    filled  = 0;
    int    touched = 0;
    int    pos;
    int    idx;
    double value;
    double where;

    /* Doubling starts at 2 and stops at the first power of two >= n. */
    for (range = 2; range < n; range *= 2) {
        /* empty */
    }

    while (filled < nz) {
        value = randlc(&tran, amult);

        /* generate an integer between 1 and n in a portable manner */
        where = randlc(&tran, amult);
        idx = icnvrt(where, range) + 1;

        /* reject out-of-range positions and positions already generated */
        if (idx > n || mark[idx] != 0) {
            continue;
        }

        mark[idx] = 1;
        nzloc[++touched] = idx;
        ++filled;
        v[filled]  = value;
        iv[filled] = idx;
    }

    /* Undo only the marks set above. */
    for (pos = 1; pos <= touched; pos++) {
        mark[nzloc[pos]] = 0;
    }
}
double Xi1double(double x, int N = 100) // default argument: N = 100 { cout << setiosflags(ios::uppercase); // int N = 100; double xi,xk,a; // double b[100],c[100]; static double pi = 4.0 * atan(1.0); double T = 30.0; double f = 2.0 * pi/T; double* const b = new double [N]; double* const c = new double [N]; int j; xk = 1.0; double x1 = 3.0; a = 3.0; xi = 0.0; for(int i = 0; i < N; i++) { j = N - i - 1 ; b[j] = randlc(xk,a); c[j] = randlc(x1,a); } for(i = 0; i < N; i++) { xi += b[i] * cos(i*f*x) + c[i] * sin(i*f*x); if(fabs(xi) > 10.0) { xi = xi/100.0;} else { xi = xi;} } if(fabs(xi) > 1.0) {xi = xi/10.0;} else { xi = xi ;} delete [] b; delete [] c; return xi; }
/*
 * create_seq: fill key_array[0 .. NUM_KEYS-1] with generated keys.
 *
 *   seed  starting seed handed to randlc() (local copy is advanced)
 *   a     multiplier handed to randlc()
 *
 * Each key is (MAX_KEY/4) times the sum of four consecutive randlc() results.
 */
void create_seq( double seed, double a )
{
    const int draws_per_key = 4;
    int       scale = MAX_KEY/4;
    int       idx   = 0;

    while (idx < NUM_KEYS) {
        double total = randlc(&seed, &a);
        int    draw;

        for (draw = 1; draw < draws_per_key; draw++) {
            total += randlc(&seed, &a);
        }

        key_array[idx] = scale*total;
        idx++;
    }
}
/*
 * find_my_seed: return the seed at which processor `kn` starts drawing.
 *
 *   kn  my processor rank, 0<=kn<=num procs
 *   np  number of processors
 *   nn  total number of random numbers, all procs
 *   s   random number seed, for ex.: 314159265.00
 *   a   random number generator multiplier, try 1220703125.00
 *
 * Step 1: count how many times nn/np can be halved before reaching 1 and
 *         apply randlc(&m, &m) to the multiplier that many times.
 * Step 2: walk the bits of kn from least significant upward (at most 100
 *         steps); for every set bit apply the current multiplier to the
 *         seed, and update the multiplier with itself between bits.
 */
double find_my_seed( int kn, int np, long nn, double s, double a )
{
    long   per_proc = nn / np;
    long   halvings = 0;
    long   step;
    long   rank_bits;
    double jump;
    double seed_out;

    while ( per_proc > 1 ) {
        halvings++;
        per_proc /= 2;
    }

    jump = a;
    for ( step = 0; step < halvings; step++ ) {
        (void)randlc( &jump, &jump );
    }

    seed_out  = s;
    rank_bits = kn;
    for ( step = 0; step < 100; step++ ) {
        long upper = rank_bits / 2;

        if ( rank_bits % 2 != 0 ) {
            (void)randlc( &seed_out, &jump );
        }
        if ( upper == 0 ) {
            break;
        }
        (void)randlc( &jump, &jump );
        rank_bits = upper;
    }

    return seed_out;
}
/******************************************************************************/
/*
  Purpose:

    RANDLC_TEST04 tests RANDLC_JUMP.

  Discussion:

    For K = 1, 2, 4, ..., 512 the K-th value is obtained twice from the
    same starting seed: once by calling RANDLC K times, and once with a
    single call to RANDLC_JUMP.  Both results are printed side by side.

  Licensing:

    This code is distributed under the GNU LGPL license.

  Modified:

    12 March 2010

  Author:

    John Burkardt
*/
void test04 ( void )
{
  const double seed_init = 123456789.0;
  int round;
  int k;

  printf ( "\n" );
  printf ( "RANDLC_TEST04\n" );
  printf ( " RANDLC_JUMP jumps directly to the K-th value\n" );
  printf ( " returned by RANDLC.\n" );
  printf ( "\n" );
  printf ( " K X(hard way) X(jump)\n" );
  printf ( "\n" );

  for ( round = 1, k = 1; round <= 10; round++, k = k * 2 )
  {
    double seed;
    double stepped = 0.0;
    double jumped;
    int calls;

    /* The hard way: K successive calls. */
    seed = seed_init;
    for ( calls = k; calls > 0; calls-- )
    {
      stepped = randlc ( &seed );
    }

    /* The direct way. */
    seed = seed_init;
    jumped = randlc_jump ( seed, k );

    printf ( " %8d %10f %10f\n", k, stepped, jumped );
  }

  return;
}
void create_seq( double seed, double a ) { double x, s; INT_TYPE i, k; #pragma omp parallel private(x,s,i,k) { INT_TYPE k1, k2; double an = a; int myid, num_procs; INT_TYPE mq; #ifdef _OPENMP myid = omp_get_thread_num(); num_procs = omp_get_num_threads(); #else myid = 0; num_procs = 1; #endif mq = (NUM_KEYS + num_procs - 1) / num_procs; k1 = mq * myid; k2 = k1 + mq; if ( k2 > NUM_KEYS ) k2 = NUM_KEYS; KS = 0; s = find_my_seed( myid, num_procs, (long)4*NUM_KEYS, seed, an ); k = MAX_KEY/4; for (i=k1; i<k2; i++) { x = randlc(&s, &an); x += randlc(&s, &an); x += randlc(&s, &an); x += randlc(&s, &an); key_array[i] = k*x; } } /*omp parallel*/ }
/*--------------------------------------------------------------------
c  compute_initial_conditions: fill array u0 with initial conditions
c  from the random number generator.
c
c  The seed is first advanced to the starting element of the first
c  plane, using a multiplier built by ipow46() from zstart[0] and
c  ystart[0].  Each z plane is then generated with one vranlc() call
c  into tmp (values are read back starting at tmp[1]) and copied into
c  u0 as (real, imag) pairs; between planes the seed is advanced by
c  A^(2*NX*NY).
c
c  The parameter d is not used.
c-------------------------------------------------------------------*/
static void compute_initial_conditions(dcomplex u0[NZ][NY][NX], int d[3])
{
    static double tmp[NX*2*MAXDIM+1];
    double plane_seed = SEED;
    double jump;
    int    plane, row, col;

    /* Jump to the starting element for our first plane. */
    ipow46(A, (zstart[0]-1)*2*NX*NY + (ystart[0]-1)*2*NX, &jump);
    (void)randlc(&plane_seed, jump);
    ipow46(A, 2*NX*NY, &jump);

    /* Go through by z planes filling in one square at a time. */
    for (plane = 0; plane < dims[0][2]; plane++) {
        double        work_seed = plane_seed;
        const double *src;

        vranlc(2*NX*dims[0][1], &work_seed, A, tmp);

        src = &tmp[1];
        for (row = 0; row < dims[0][1]; row++) {
            for (col = 0; col < NX; col++) {
                u0[plane][row][col].real = *src++;
                u0[plane][row][col].imag = *src++;
            }
        }

        /* Same guard as always; given the loop condition it appears to
           hold on every iteration. */
        if (plane != dims[0][2]) {
            (void)randlc(&plane_seed, jump);
        }
    }
}
/*
 * GetFeatureNum: derive a length from `id`.
 *
 *   mbname  not used by this function
 *   id      identifier; 2*id+1 is used both as the randlc() multiplier
 *           and as the exponent passed to ipowMod()
 *
 * One randlc() draw (seed 314159265.0) gives `denom`.  GetFNumDPar() may
 * then overwrite the default mean (NUM_SAMPLES) and stdev (128).  The
 * result is mean - stdev + |ipowMod((int)(1/denom) * 'S', 2*id+1, 2*stdev)|.
 */
int GetFeatureNum(char *mbname, int id)
{
    double seed  = 314159265.0;
    double mult  = 2*id+1;
    double denom = randlc(&seed, &mult);
    int    mean  = NUM_SAMPLES;
    int    stdev = 128;
    int    base;
    int    offset;

    (void)mbname;

    GetFNumDPar(&mean, &stdev);

    base   = (int)(1/denom) * (int)'S';
    offset = ipowMod(base, (long long int)(2*id+1), 2*stdev);
    if (offset < 0) {
        offset = -offset;
    }

    return mean - stdev + offset;
}
/*--------------------------------------------------------------------
c  ipow46: compute a^exponent mod 2^46 and store it in *result.
c
c  Uses
c    a^n = a^(n/2)*a^(n/2)   if n even, else
c    a^n = a*a^(n-1)         if n odd
c  with every product formed by randlc().  *result is 1 when exponent
c  is 0.
c-------------------------------------------------------------------*/
static void ipow46(double a, int exponent, double *result)
{
    double base = a;    /* running power of a */
    double acc  = 1;    /* accumulated product */
    int    left = exponent;

    *result = 1;
    if (exponent == 0) {
        return;
    }

    while (left > 1) {
        if (left % 2 == 0) {
            (void)randlc(&base, base);
            left /= 2;
        } else {
            (void)randlc(&acc, base);
            left -= 1;
        }
    }
    (void)randlc(&acc, base);

    *result = acc;
}
/******************************************************************************/
/*
  Purpose:

    TEST01 tests RANDLC.

  Discussion:

    Starting from a fixed seed, the first ten values returned by RANDLC
    are printed, one per line.

  Licensing:

    This code is distributed under the GNU LGPL license.

  Modified:

    08 March 2010

  Author:

    John Burkardt
*/
void test01 ( void )
{
  const double seed_init = 123456789.0;
  double seed = seed_init;
  int count = 0;

  printf ( "\n" );
  printf ( "TEST01\n" );
  printf ( " RANDLC computes pseudorandom values \n" );
  printf ( " in the interval [0,1].\n" );

  printf ( "\n" );
  printf ( " The initial seed is %14.0f\n", seed_init );
  printf ( "\n" );
  printf ( " I RANDLC\n" );
  printf ( "\n" );

  while ( count < 10 )
  {
    double value = randlc ( &seed );

    count++;
    printf ( " %8d %14f\n", count, value );
  }

  return;
}
/*--------------------------------------------------------------------
c  zran3 loads +1 at ten randomly chosen points,
c  loads -1 at a different ten random points,
c  and zero elsewhere.
c
c  Parameters:
c    z          flat array addressed as z[i1 + n1*(i2 + n2*i3)]
c    n1,n2,n3   extents used for that addressing
c    nx,ny      extents used to build the generator jump multipliers
c    k          passed through unchanged to comm3()
c
c  Outline:
c    1. zero z;
c    2. fill rows of z with vranlc(), advancing the seed between rows
c       and planes with multipliers obtained from power();
c    3. scan the interior points, keeping MM largest and MM smallest
c       values (and their indices) with the help of bubble();
c    4. zero z again, then store -1.0 / +1.0 at the recorded positions;
c    5. call comm3().
c
c  is1, ie1, is2, ie2, is3, ie3 are not declared here; they come from
c  the enclosing file.  The macros MM, A and X defined below are not
c  #undef'd and therefore remain defined after this function.
c-------------------------------------------------------------------*/
static void zran3(double *z, int n1, int n2, int n3, int nx, int ny, int k) {

#define MM 10
#define A pow(5.0,13)
#define X 314159265.e0

    int i0, m0, m1;
    int i1, i2, i3, d1, e1, e2, e3;
    double xx, x0, x1, a1, a2, ai;
    double ten[MM][2], best;
    int i, j1[MM][2], j2[MM][2], j3[MM][2];
    int jg[4][MM][2];
    double rdummy;

    /* multipliers that advance the seed by nx and by nx*ny positions */
    a1 = power( A, nx );
    a2 = power( A, nx*ny );

    /* step 1: clear z (the explicit loop nest is the active branch) */
#if 0
#pragma omp parallel
    {
        zero3(z,n1,n2,n3);
    }
#else
#pragma omp parallel for private(i2, i1)
    for (i3 = 0;i3 < n3; i3++) {
        for (i2 = 0; i2 < n2; i2++) {
            for (i1 = 0; i1 < n1; i1++) {
                int i123 = i1 + n1*(i2 + n2*i3);
                z[i123] = 0.0;
            }
        }
    }
#endif

    /* offset of this block's first element, and the matching multiplier */
    i = is1-1+nx*(is2-1+ny*(is3-1));
    ai = power( A, i );
    d1 = ie1 - is1 + 1;
    e1 = ie1 - is1 + 2;   /* assigned but not read again in this function */
    e2 = ie2 - is2 + 2;
    e3 = ie3 - is3 + 2;
    x0 = X;
    rdummy = randlc( &x0, ai );

    /* step 2: x0 is the seed at the start of a plane, x1 at the start of
       a row; xx is the working copy handed to vranlc() */
    for (i3 = 1; i3 < e3; i3++) {
        x1 = x0;
        for (i2 = 1; i2 < e2; i2++) {
            xx = x1;
            vranlc( d1, &xx, A, &(z[0+n1*(i2 + n2*i3)]));
            rdummy = randlc( &x1, a1 );
        }
        rdummy = randlc( &x0, a2 );
    }

/*--------------------------------------------------------------------
c       call comm3(z,n1,n2,n3)
c       call showall(z,n1,n2,n3)
c-------------------------------------------------------------------*/

/*--------------------------------------------------------------------
c     each processor looks for twenty candidates
c     (column 1 of ten/j1/j2/j3 tracks large values and starts at 0.0,
c      column 0 tracks small values and starts at 1.0)
c-------------------------------------------------------------------*/
    for (i = 0; i < MM; i++) {
        ten[i][1] = 0.0;
        j1[i][1] = 0;
        j2[i][1] = 0;
        j3[i][1] = 0;
        ten[i][0] = 1.0;
        j1[i][0] = 0;
        j2[i][0] = 0;
        j3[i][0] = 0;
    }

    /* step 3: interior points only; slot 0 is the entry replaced when a
       better candidate is found, after which bubble() is called */
    for (i3 = 1; i3 < n3-1; i3++) {
        for (i2 = 1; i2 < n2-1; i2++) {
            for (i1 = 1; i1 < n1-1; i1++) {
                int i123 = i1 + n1*(i2 + n2*i3);
                if ( z[i123] > ten[0][1] ) {
                    ten[0][1] = z[i123];
                    j1[0][1] = i1;
                    j2[0][1] = i2;
                    j3[0][1] = i3;
                    bubble( ten, j1, j2, j3, MM, 1 );
                }
                if ( z[i123] < ten[0][0] ) {
                    ten[0][0] = z[i123];
                    j1[0][0] = i1;
                    j2[0][0] = i2;
                    j3[0][0] = i3;
                    bubble( ten, j1, j2, j3, MM, 0 );
                }
            }
        }
    }

/*--------------------------------------------------------------------
c     Now which of these are globally best?
c
c     NOTE(review): `best` is compared with the same z element it was
c     just assigned from, so the "else" branches look unreachable for
c     ordinary values; presumably this is left over from a version with
c     a cross-process reduction -- confirm.  jg is filled here but is
c     only mentioned again in the copyin clause further down.
c-------------------------------------------------------------------*/
    i1 = MM - 1;
    i0 = MM - 1;
    for (i = MM - 1 ; i >= 0; i--) {
        int j123 = j1[i1][1] + n1*(j2[i1][1] + n2*j3[i1][1]);
        best = z[j123];
        if (best == z[j123]) {
            jg[0][i][1] = 0;
            jg[1][i][1] = is1 - 1 + j1[i1][1];
            jg[2][i][1] = is2 - 1 + j2[i1][1];
            jg[3][i][1] = is3 - 1 + j3[i1][1];
            i1 = i1-1;
        } else {
            jg[0][i][1] = 0;
            jg[1][i][1] = 0;
            jg[2][i][1] = 0;
            jg[3][i][1] = 0;
        }
        ten[i][1] = best;

        j123 = j1[i0][0] + n1*(j2[i0][0] + n2*j3[i0][0]);
        best = z[j123];
        if (best == z[j123]) {
            jg[0][i][0] = 0;
            jg[1][i][0] = is1 - 1 + j1[i0][0];
            jg[2][i][0] = is2 - 1 + j2[i0][0];
            jg[3][i][0] = is3 - 1 + j3[i0][0];
            i0 = i0-1;
        } else {
            jg[0][i][0] = 0;
            jg[1][i][0] = 0;
            jg[2][i][0] = 0;
            jg[3][i][0] = 0;
        }
        ten[i][0] = best;
    }
    /* first slot index to be written as +1 (m1) and as -1 (m0) below */
    m1 = i1+1;
    m0 = i0+1;

/*  Disabled diagnostic output, kept for reference:
    printf(" negative charges at");
    for (i = 0; i < MM; i++) {
        if (i%5 == 0) printf("\n");
        printf(" (%3d,%3d,%3d)", jg[1][i][0], jg[2][i][0], jg[3][i][0]);
    }
    printf("\n positive charges at");
    for (i = 0; i < MM; i++) {
        if (i%5 == 0) printf("\n");
        printf(" (%3d,%3d,%3d)", jg[1][i][1], jg[2][i][1], jg[3][i][1]);
    }
    printf("\n small random numbers were\n");
    for (i = MM-1; i >= 0; i--) {
        printf(" %15.8e", ten[i][0]);
    }
    printf("\n and they were found on processor number\n");
    for (i = MM-1; i >= 0; i--) {
        printf(" %4d", jg[0][i][0]);
    }
    printf("\n large random numbers were\n");
    for (i = MM-1; i >= 0; i--) {
        printf(" %15.8e", ten[i][1]);
    }
    printf("\n and they were found on processor number\n");
    for (i = MM-1; i >= 0; i--) {
        printf(" %4d", jg[0][i][1]);
    }
    printf("\n");*/

    /* step 4: clear z again (here the zero3() call is the active branch) */
#if 0
#pragma omp parallel for private(i2, i1)
    for (i3 = 0; i3 < n3; i3++) {
        for (i2 = 0; i2 < n2; i2++) {
            for (i1 = 0; i1 < n1; i1++) {
                int i123 = i1 + n1*(i2+n2*i3);
                z[i123] = 0.0;
            }
        }
    }
#else
#pragma omp parallel
    {
        zero3(z,n1,n2,n3);
    }
#endif

    /* plant the charges: column 0 positions get -1.0, column 1 get +1.0.
       The region indexes j1/j2/j3; jg appears only in the copyin clause. */
#pragma acc parallel present(z[0:n3*n2*n1]) copyin(jg)
    {
#pragma acc loop
        for (i = MM-1; i >= m0; i--) {
            int j123 = j1[i][0] + n1*(j2[i][0] + n2*j3[i][0]);
            z[j123] = -1.0;
        }
#pragma acc loop
        for (i = MM-1; i >= m1; i--) {
            int j123 = j1[i][1] + n1*(j2[i][1] + n2*j3[i][1]);
            z[j123] = 1.0;
        }
    } /* end acc parallel */

    /* step 5 */
#pragma omp parallel
    comm3(z,n1,n2,n3,k);

/*--------------------------------------------------------------------
c          call showall(z,n1,n2,n3)
c-------------------------------------------------------------------*/
}
int main(int argc, char *argv[]) { double Mops, t1, t2; double tsx, tsy, tm, an, tt, gc; double sx_verify_value, sy_verify_value, sx_err, sy_err; int i, nit; int k_offset, j; logical verified; char size[16]; FILE *fp; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } if ((fp = fopen("timer.flag", "r")) == NULL) { timers_enabled = false; } else { timers_enabled = true; fclose(fp); } //-------------------------------------------------------------------- // Because the size of the problem is too large to store in a 32-bit // integer for some classes, we put it into a string (for printing). // Have to strip off the decimal point put in there by the floating // point print statement (internal file) //-------------------------------------------------------------------- sprintf(size, "%15.0lf", pow(2.0, M+1)); j = 14; if (size[j] == '.') j--; size[j+1] = '\0'; printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n"); printf("\n Number of random numbers generated: %15s\n", size); verified = false; //-------------------------------------------------------------------- // Compute the number of "batches" of random number pairs generated // per processor. Adjust if the number of processors does not evenly // divide the total number //-------------------------------------------------------------------- np = NN; setup_opencl(argc, argv); timer_clear(0); timer_start(0); //-------------------------------------------------------------------- // Compute AN = A ^ (2 * NK) (mod 2^46). //-------------------------------------------------------------------- t1 = A; for (i = 0; i < MK + 1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; //-------------------------------------------------------------------- // Each instance of this loop may be performed independently. 
We compute // the k offsets separately to take into account the fact that some nodes // have more numbers to generate than others //-------------------------------------------------------------------- k_offset = -1; DTIMER_START(T_KERNEL_EMBAR); // Launch the kernel int q_size = GROUP_SIZE * NQ * sizeof(cl_double); int sx_size = GROUP_SIZE * sizeof(cl_double); int sy_size = GROUP_SIZE * sizeof(cl_double); err_code = clSetKernelArg(kernel, 0, q_size, NULL); err_code |= clSetKernelArg(kernel, 1, sx_size, NULL); err_code |= clSetKernelArg(kernel, 2, sy_size, NULL); err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq); err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx); err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy); err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset); err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an); clu_CheckError(err_code, "clSetKernelArg()"); size_t localWorkSize[] = { GROUP_SIZE }; size_t globalWorkSize[] = { np }; err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_EMBAR); double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size); double *gsx = (double*)malloc(gsx_size); double *gsy = (double*)malloc(gsy_size); gc = 0.0; tsx = 0.0; tsy = 0.0; for (i = 0; i < NQ; i++) { q[i] = 0.0; } // 9. 
Get the result DTIMER_START(T_BUFFER_READ); err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, gq, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, gsx, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, gsy, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); DTIMER_STOP(T_BUFFER_READ); for (i = 0; i < np/localWorkSize[0]; i++) { for (j = 0; j < NQ; j++ ){ q[j] = q[j] + gq[i][j]; } tsx = tsx + gsx[i]; tsy = tsy + gsy[i]; } for (i = 0; i < NQ; i++) { gc = gc + q[i]; } timer_stop(0); tm = timer_read(0); nit = 0; verified = true; if (M == 24) { sx_verify_value = -3.247834652034740e+3; sy_verify_value = -6.958407078382297e+3; } else if (M == 25) { sx_verify_value = -2.863319731645753e+3; sy_verify_value = -6.320053679109499e+3; } else if (M == 28) { sx_verify_value = -4.295875165629892e+3; sy_verify_value = -1.580732573678431e+4; } else if (M == 30) { sx_verify_value = 4.033815542441498e+4; sy_verify_value = -2.660669192809235e+4; } else if (M == 32) { sx_verify_value = 4.764367927995374e+4; sy_verify_value = -8.084072988043731e+4; } else if (M == 36) { sx_verify_value = 1.982481200946593e+5; sy_verify_value = -1.020596636361769e+5; } else if (M == 40) { sx_verify_value = -5.319717441530e+05; sy_verify_value = -3.688834557731e+05; } else { verified = false; } if (verified) { sx_err = fabs((tsx - sx_verify_value) / sx_verify_value); sy_err = fabs((tsy - sy_verify_value) / sy_verify_value); verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON)); } Mops = pow(2.0, M+1) / tm / 1000000.0; printf("\nEP Benchmark Results:\n\n"); printf("CPU Time =%10.4lf\n", tm); printf("N = 2^%5d\n", M); printf("No. 
Gaussian Pairs = %15.0lf\n", gc); printf("Sums = %25.15lE %25.15lE\n", tsx, tsy); printf("Counts: \n"); for (i = 0; i < NQ; i++) { printf("%3d%15.0lf\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7, clu_GetDeviceTypeName(device_type), device_name); if (timers_enabled) { if (tm <= 0.0) tm = 1.0; tt = timer_read(0); printf("\nTotal time: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); } free(gq); free(gsx); free(gsy); release_opencl(); fflush(stdout); return 0; }
/******************************************************************************/
/*
  Purpose:

    TEST02 tests RANDLC;

  Discussion:

    The first ten calls are shown together with the seed before and after
    each call.  Then 1000 further values are drawn, and their sample mean
    and sample variance are printed next to the values 0.5 and 1/12.

  Licensing:

    This code is distributed under the GNU LGPL license.

  Modified:

    08 March 2010

  Author:

    John Burkardt
*/
void test02 ( void )
{
  enum { SAMPLE_COUNT = 1000 };

  double sample[SAMPLE_COUNT];
  double seed = 123456789.0;
  double mean;
  double variance;
  int idx;

  printf ( "\n" );
  printf ( "TEST02\n" );
  printf ( " RANDLC computes a sequence of uniformly distributed\n" );
  printf ( " pseudorandom numbers.\n" );

  printf ( "\n" );
  printf ( " Initial SEED = %14.0f\n", seed );

  printf ( "\n" );
  printf ( " First 10 values:\n" );
  printf ( "\n" );
  printf ( " I Input Output RANDLC\n" );
  printf ( " SEED SEED\n" );
  printf ( "\n" );

  for ( idx = 1; idx <= 10; idx++ )
  {
    double seed_before = seed;
    double value = randlc ( &seed );

    printf ( " %6d %14.0f %14.0f %10f\n", idx, seed_before, seed, value );
  }

  printf ( "\n" );
  printf ( " Now call RANDLC %d times.\n", SAMPLE_COUNT );

  /* Draw the samples and accumulate their sum. */
  mean = 0.0;
  idx = 0;
  while ( idx < SAMPLE_COUNT )
  {
    sample[idx] = randlc ( &seed );
    mean = mean + sample[idx];
    idx++;
  }
  mean = mean / ( ( double ) SAMPLE_COUNT );

  /* Sum of squared deviations, divided by (count - 1). */
  variance = 0.0;
  idx = 0;
  while ( idx < SAMPLE_COUNT )
  {
    variance = variance + ( sample[idx] - mean ) * ( sample[idx] - mean );
    idx++;
  }
  variance = variance / ( ( double ) ( SAMPLE_COUNT - 1 ) );

  printf ( "\n" );
  printf ( " Average value = %f\n", mean );
  printf ( " Expecting %f\n", 0.5 );

  printf ( "\n" );
  printf ( " Variance = %f\n", variance );
  printf ( " Expecting %f\n", 1.0 / 12.0 );

  return;
}
/* c This is the serial version of the APP Benchmark 1, c the "embarassingly parallel" benchmark. c c M is the Log_2 of the number of complex pairs of uniform (0, 1) random c numbers. MK is the Log_2 of the size of each batch of uniform random c numbers. MK can be set for convenience on a given system, since it does c not affect the results. */ int main(int argc, char **argv) { double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc; double dum[3] = { 1.0, 1.0, 1.0 }; int np, ierr, node, no_nodes, i, ik, kk, l, k, nit, ierrcode, no_large_nodes, np_add, k_offset, j; int nthreads = 1; boolean verified; char size[13+1]; /* character*13 */ /* c Because the size of the problem is too large to store in a 32-bit c integer for some classes, we put it into a string (for printing). c Have to strip off the decimal point put in there by the floating c point print statement (internal file) */ printf("\n\n NAS Parallel Benchmarks 3.0 structured OpenMP C version" " - EP Benchmark\n"); sprintf(size, "%12.0f", pow(2.0, M+1)); for (j = 13; j >= 1; j--) { if (size[j] == '.') size[j] = ' '; } printf(" Number of random numbers generated: %13s\n", size); verified = FALSE; /* c Compute the number of "batches" of random number pairs generated c per processor. Adjust if the number of processors does not evenly c divide the total number */ np = NN; /* c Call the random number generator functions and initialize c the x-array to reduce the effects of paging on the timings. c Also, call all mathematical functions that are used. Make c sure these initializations cannot be eliminated as dead code. */ vranlc(0, &(dum[0]), dum[1], &(dum[2])); dum[0] = randlc(&(dum[1]), dum[2]); #pragma omp parallel for default(shared) private(i) for (i = 0; i < 2*NK; i++) x[i] = -1.0e99; Mops = log(sqrt(fabs(max(1.0, 1.0)))); timer_clear(1); timer_clear(2); timer_clear(3); timer_start(1); vranlc(0, &t1, A, x); /* Compute AN = A ^ (2 * NK) (mod 2^46). 
*/ t1 = A; for ( i = 1; i <= MK+1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; gc = 0.0; sx = 0.0; sy = 0.0; for ( i = 0; i <= NQ - 1; i++) { q[i] = 0.0; } /* c Each instance of this loop may be performed independently. We compute c the k offsets separately to take into account the fact that some nodes c have more numbers to generate than others */ k_offset = -1; #pragma omp parallel copyin(x) { double t1, t2, t3, t4, x1, x2; int kk, i, ik, l; double qq[NQ]; /* private copy of q[0:NQ-1] */ for (i = 0; i < NQ; i++) qq[i] = 0.0; #pragma omp for reduction(+:sx,sy) schedule(static) for (k = 1; k <= np; k++) { kk = k_offset + k; t1 = S; t2 = an; /* Find starting seed t1 for this kk. */ for (i = 1; i <= 100; i++) { ik = kk / 2; if (2 * ik != kk) t3 = randlc(&t1, t2); if (ik == 0) break; t3 = randlc(&t2, t2); kk = ik; } /* Compute uniform pseudorandom numbers. */ if (TIMERS_ENABLED == TRUE) timer_start(3); vranlc(2*NK, &t1, A, x-1); if (TIMERS_ENABLED == TRUE) timer_stop(3); /* c Compute Gaussian deviates by acceptance-rejection method and c tally counts in concentric square annuli. This loop is not c vectorizable. 
*/ if (TIMERS_ENABLED == TRUE) timer_start(2); for ( i = 0; i < NK; i++) { x1 = 2.0 * x[2*i] - 1.0; x2 = 2.0 * x[2*i+1] - 1.0; t1 = pow2(x1) + pow2(x2); if (t1 <= 1.0) { t2 = sqrt(-2.0 * log(t1) / t1); t3 = (x1 * t2); /* Xi */ t4 = (x2 * t2); /* Yi */ l = max(fabs(t3), fabs(t4)); qq[l] += 1.0; /* counts */ sx = sx + t3; /* sum of Xi */ sy = sy + t4; /* sum of Yi */ } } if (TIMERS_ENABLED == TRUE) timer_stop(2); } #pragma omp critical { for (i = 0; i <= NQ - 1; i++) q[i] += qq[i]; } #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* end of parallel region */ for (i = 0; i <= NQ-1; i++) { gc = gc + q[i]; } timer_stop(1); tm = timer_read(1); nit = 0; if (M == 24) { if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 25) { if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 28) { if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 30) { if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 32) { if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) && (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) { verified = TRUE; } } Mops = pow(2.0, M+1)/tm/1000000.0; printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. 
Gaussian Pairs = %15.0f\n" "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy); for (i = 0; i <= NQ-1; i++) { printf("%3d %15.0f\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, nthreads, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); if (TIMERS_ENABLED == TRUE) { printf("Total time: %f", timer_read(1)); printf("Gaussian pairs: %f", timer_read(2)); printf("Random numbers: %f", timer_read(3)); } }
/*
  test03_show_ten prints the table header used by TEST03 and then ten rows,
  each showing the step number, the seed before the call, the seed after
  the call, and the value RANDLC returned.

  If seed_at_step5 is not a null pointer, the seed held just before the
  fifth call is stored through it.
*/
static void test03_show_ten ( double *seed, double *seed_at_step5 )
{
  int step;

  printf ( " I Input Output RANDLC\n" );
  printf ( " SEED SEED\n" );
  printf ( "\n" );

  for ( step = 1; step <= 10; step++ )
  {
    double seed_before = *seed;
    double value;

    if ( seed_at_step5 && step == 5 )
    {
      *seed_at_step5 = seed_before;
    }
    value = randlc ( seed );

    printf ( " %6d %14.0f %14.0f %10f\n", step, seed_before, *seed, value );
  }
}

/******************************************************************************/
/*
  Purpose:

    TEST03 tests RANDLC.

  Discussion:

    Four tables of ten calls each are printed: starting from seed 1066,
    restarting from the seed saved before the fifth call of the first
    table, starting from a zero seed, and starting from a negative seed.

  Licensing:

    This code is distributed under the GNU LGPL license.

  Modified:

    08 March 2010

  Author:

    John Burkardt
*/
void test03 ( void )
{
  double seed;
  double seed_save;

  printf ( "\n" );
  printf ( "TEST03\n" );
  printf ( " RANDLC computes a sequence of pseudorandom numbers\n" );
  printf ( " but all computations depend on the seed value.\n" );
  printf ( " In this test, we show how a sequence of \"random\"\n" );
  printf ( " values can be manipulated by accessing the seed.\n" );

  /* Table 1: fixed positive seed; remember the seed before call 5. */
  seed = 1066.0;

  printf ( "\n" );
  printf ( " Set SEED to %14.0f\n", seed );
  printf ( "\n" );
  printf ( " Now call RANDLC 10 times, and watch SEED.\n" );
  printf ( "\n" );
  test03_show_ten ( &seed, &seed_save );

  /* Table 2: restart from the remembered seed. */
  seed = seed_save;

  printf ( "\n" );
  printf ( " Reset SEED to its value at step 5, = %14.0f\n", seed );
  printf ( "\n" );
  printf ( " Now call RANDLC 10 times, and watch how SEED\n" );
  printf ( " and RANDLC restart themselves.\n" );
  printf ( "\n" );
  test03_show_ten ( &seed, 0 );

  /* Table 3: zero seed. */
  seed = 0.0;

  printf ( "\n" );
  printf ( " What happens with an initial zero SEED?\n" );
  printf ( "\n" );
  test03_show_ten ( &seed, 0 );

  /* Table 4: negative seed. */
  seed = -123456789.0;

  printf ( "\n" );
  printf ( " What happens with an initial negative SEED?\n" );
  printf ( "\n" );
  test03_show_ten ( &seed, 0 );

  return;
}
//--------------------------------------------------------------------- // Fill in array u0 with initial conditions from // random number generator //--------------------------------------------------------------------- static void compute_initial_conditions(cl_mem *u0, int d1, int d2, int d3) { int k; double start, an, dummy, starts[NZ]; size_t local_ws, global_ws, temp; cl_mem m_starts; cl_int ecode; start = SEED; //--------------------------------------------------------------------- // Jump to the starting element for our first plane. //--------------------------------------------------------------------- an = ipow46(A, 0); dummy = randlc(&start, an); an = ipow46(A, 2*NX*NY); starts[0] = start; for (k = 1; k < dims[2]; k++) { dummy = randlc(&start, an); starts[k] = start; } if (device_type == CL_DEVICE_TYPE_CPU) { m_starts = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double) * NZ, starts, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_starts"); local_ws = 1; global_ws = clu_RoundWorkSize((size_t)d2, local_ws); } else { //GPU m_starts = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double) * NZ, starts, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_starts"); temp = d2 / max_compute_units; local_ws = temp == 0 ? 1 : ((temp > work_item_sizes[0]) ? work_item_sizes[0] : temp); global_ws = clu_RoundWorkSize((size_t)d2, local_ws); } ecode = clSetKernelArg(k_compute_ics, 0, sizeof(cl_mem), u0); ecode |= clSetKernelArg(k_compute_ics, 1, sizeof(cl_mem), &m_starts); clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions"); ecode = clEnqueueNDRangeKernel(cmd_queue, k_compute_ics, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); ecode = clFinish(cmd_queue); clu_CheckError(ecode, "clFinish()"); DTIMER_START(T_RELEASE); clReleaseMemObject(m_starts); DTIMER_STOP(T_RELEASE); }
int main (int argc, char **argv) { //auto double *_ppthd_x; auto double Mops; auto double t1; auto double t2; auto double t3; auto double t4; auto double x1; auto double x2; auto double sx; auto double sy; auto double tm; auto double an; auto double tt; auto double gc; auto double dum[3]; auto int np; auto int ierr; auto int node; auto int no_nodes; auto int i; auto int ik; auto int kk; auto int l; auto int k; auto int nit; auto int ierrcode; auto int no_large_nodes; auto int np_add; auto int k_offset; auto int j; auto int nthreads; auto int verified; auto char size[14]; int status = 0; _ompc_init(argc,argv); //(_ppthd_x) = (((double *) (_ompc_get_thdprv (&_thdprv_x, 1048576, x)))); (*(dum)) = (1.0); (*((dum) + (1))) = (1.0); (*((dum) + (2))) = (1.0); (nthreads) = (1); # 84 "ep.c" printf ("\012\012 NAS Parallel Benchmarks 2.3 OpenMP C version - EP Benchmark\012"); # 86 "ep.c" sprintf (size, "%12.0f", pow (2.0, (28) + (1))); # 87 "ep.c" for ((j) = (13); (j) >= (1); (j)--) { # 88 "ep.c" if ((((int) (*((size) + (j))))) == (46)) { (*((size) + (j))) = (((char) (32))); } } # 90 "ep.c" printf (" Number of random numbers generated: %13s\012", size); # 92 "ep.c" (verified) = (0); # 99 "ep.c" (np) = ((1) << ((28) - (16))); # 107 "ep.c" vranlc (0, (dum) + (0), *((dum) + (1)), (dum) + (2)); # 108 "ep.c" (*((dum) + (0))) = (randlc ((dum) + (1), *((dum) + (2)))); # 109 "ep.c" for ((i) = (0); (i) < ((2) * ((1) << (16))); (i)++) { x[i] = (-(1.0E99)); //(*((_ppthd_x) + (i))) = (-(1.0E99)); } # 110 "ep.c" (Mops) = (log (sqrt (fabs (((1.0) > (1.0)) ? 
(1.0) : (1.0))))); # 112 "ep.c" timer_clear (1); # 113 "ep.c" timer_clear (2); # 114 "ep.c" timer_clear (3); # 115 "ep.c" timer_start (1); # 117 "ep.c" vranlc (0, &(t1), 1.220703125E9, x); //vranlc (0, &(t1), 1.220703125E9, _ppthd_x); # 121 "ep.c" (t1) = (1.220703125E9); # 123 "ep.c" for ((i) = (1); (i) <= ((16) + (1)); (i)++) { # 124 "ep.c" (t2) = (randlc (&(t1), t1)); } # 127 "ep.c" (an) = (t1); # 128 "ep.c" (tt) = (2.71828183E8); # 129 "ep.c" (gc) = (0.0); # 130 "ep.c" (sx) = (0.0); # 131 "ep.c" (sy) = (0.0); # 133 "ep.c" for ((i) = (0); (i) <= ((10) - (1)); (i)++) { # 134 "ep.c" (*((q) + (i))) = (0.0); } # 142 "ep.c" (k_offset) = (-(1)); { auto void *__ompc_argv[6]; (*(__ompc_argv)) = (((void *) (&sx))); (*((__ompc_argv) + (1))) = (((void *) (&sy))); (*((__ompc_argv) + (2))) = (((void *) (&np))); (*((__ompc_argv) + (3))) = (((void *) (&k_offset))); (*((__ompc_argv) + (4))) = (((void *) (&an))); (*((__ompc_argv) + (5))) = (((void *) (&nthreads))); _ompc_do_parallel (__ompc_func_3, __ompc_argv); } # 207 "ep.c" for ((i) = (0); (i) <= ((10) - (1)); (i)++) { # 208 "ep.c" (gc) = ((gc) + (*((q) + (i)))); } # 211 "ep.c" timer_stop (1); # 212 "ep.c" (tm) = (timer_read (1)); # 214 "ep.c" (nit) = (0); # 215 "ep.c" if ((28) == (24)) { # 216 "ep.c" if (((fabs (((sx) - (-(3247.83465203474))) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(6958.407078382297))) / (sy))) <= (1.0E-8))) { # 218 "ep.c" (verified) = (1); } } else # 220 "ep.c" if ((28) == (25)) { # 221 "ep.c" if (((fabs (((sx) - (-(2863.319731645753))) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(6320.053679109499))) / (sy))) <= (1.0E-8))) { # 223 "ep.c" (verified) = (1); } } else # 225 "ep.c" if ((28) == (28)) { # 226 "ep.c" if (((fabs (((sx) - (-(4295.875165629892))) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(15807.32573678431))) / (sy))) <= (1.0E-8))) { # 228 "ep.c" (verified) = (1); printf("Debug:ompc_manual. 
359, sx is:%f, sy is:%f\n",sx,sy); } } else # 230 "ep.c" if ((28) == (30)) { # 231 "ep.c" if (((fabs (((sx) - (40338.15542441498)) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(26606.69192809235))) / (sy))) <= (1.0E-8))) { # 233 "ep.c" (verified) = (1); } } else # 235 "ep.c" if ((28) == (32)) { # 236 "ep.c" if (((fabs (((sx) - (47643.67927995374)) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(80840.72988043731))) / (sy))) <= (1.0E-8))) { # 238 "ep.c" (verified) = (1); } } # 242 "ep.c" (Mops) = (((pow (2.0, (28) + (1))) / (tm)) / (1000000.0)); # 244 "ep.c" printf ("EP Benchmark Results: \012CPU Time = %10.4f\012N = 2^%5d\012No. Gaussian Pairs = %15.0f\012Sums = %25.15e %25.15e\012Counts:\012", tm, 28, gc, sx, sy); # 251 "ep.c" for ((i) = (0); (i) <= ((10) - (1)); (i)++) { # 252 "ep.c" printf ("%3d %15.0f\012", i, *((q) + (i))); } # 255 "ep.c" c_print_results ("EP", 65, (28) + (1), 0, 0, nit, nthreads, tm, Mops, "Random numbers generated", verified, "2.3", "07 Aug 2006", "omcc", "$(CC)", "(none)", "-I../common", "-t", "-lm", "randdp"); # 261 "ep.c" if ((0) == (1)) { # 262 "ep.c" printf ("Total time: %f", timer_read (1)); # 263 "ep.c" printf ("Gaussian pairs: %f", timer_read (2)); # 264 "ep.c" printf ("Random numbers: %f", timer_read (3)); } }
static void __ompc_func_3 (void **__ompc_args) { auto double *_pp_sx; auto double *_pp_sy; auto int *_pp_np; auto int *_pp_k_offset; auto double *_pp_an; auto int *_pp_nthreads; auto double *_ppthd_x; (_ppthd_x) = (((double *) (_ompc_get_thdprv (&_thdprv_x, 1048576, x)))); (_pp_sx) = (((double *) (*__ompc_args))); (_pp_sy) = (((double *) (*((__ompc_args) + (1))))); (_pp_np) = (((int *) (*((__ompc_args) + (2))))); (_pp_k_offset) = (((int *) (*((__ompc_args) + (3))))); (_pp_an) = (((double *) (*((__ompc_args) + (4))))); (_pp_nthreads) = (((int *) (*((__ompc_args) + (5))))); _ompc_copyin_thdprv (_ppthd_x, x, 1048576); { auto double t1; auto double t2; auto double t3; auto double t4; auto double x1; auto double x2; auto int kk; auto int i; auto int ik; auto int l; auto double qq[10]; # 150 "ep.c" for ((i) = (0); (i) < (10); (i)++) { (*((qq) + (i))) = (0.0); } { auto double _p_sx; auto double _p_sy; auto int _p_k; auto int _p_k_0; auto int _p_k_1; auto int _p_k_2; (_p_sy) = (0.0); (_p_sx) = (0.0); (_p_k_0) = (1); (_p_k_1) = ((*_pp_np) + (1)); (_p_k_2) = (1); _ompc_static_bsched (&_p_k_0, &_p_k_1, &_p_k_2); # 153 "ep.c" for ((_p_k) = (_p_k_0); (_p_k) < (_p_k_1); (_p_k) += (_p_k_2)) { # 154 "ep.c" (kk) = ((*_pp_k_offset) + (_p_k)); # 155 "ep.c" (t1) = (2.71828183E8); # 156 "ep.c" (t2) = (*_pp_an); # 160 "ep.c" for ((i) = (1); (i) <= (100); (i)++) { # 161 "ep.c" (ik) = ((kk) / (2)); # 162 "ep.c" if (((2) * (ik)) != (kk)) { (t3) = (randlc (&(t1), t2)); } # 163 "ep.c" if ((ik) == (0)) # 163 "ep.c" break; # 164 "ep.c" (t3) = (randlc (&(t2), t2)); # 165 "ep.c" (kk) = (ik); } # 170 "ep.c" if ((0) == (1)) { timer_start (3); } # 171 "ep.c" vranlc ((2) * ((1) << (16)), &(t1), 1.220703125E9, (_ppthd_x) - (1)); # 172 "ep.c" if ((0) == (1)) { timer_stop (3); } # 179 "ep.c" if ((0) == (1)) { timer_start (2); } # 181 "ep.c" for ((i) = (0); (i) < ((1) << (16)); (i)++) { # 182 "ep.c" (x1) = (((2.0) * (*((_ppthd_x) + ((2) * (i))))) - (1.0)); # 183 "ep.c" (x2) = (((2.0) * (*((_ppthd_x) + 
(((2) * (i)) + (1))))) - (1.0)); # 184 "ep.c" (t1) = (((x1) * (x1)) + ((x2) * (x2))); # 185 "ep.c" if ((t1) <= (1.0)) { # 186 "ep.c" (t2) = (sqrt (((-(2.0)) * (log (t1))) / (t1))); # 187 "ep.c" (t3) = ((x1) * (t2)); # 188 "ep.c" (t4) = ((x2) * (t2)); # 189 "ep.c" (l) = (((int) (((fabs (t3)) > (fabs (t4))) ? (fabs (t3)) : (fabs (t4))))); # 190 "ep.c" (*((qq) + (l))) += (1.0); # 191 "ep.c" (_p_sx) = ((_p_sx) + (t3)); # 192 "ep.c" (_p_sy) = ((_p_sy) + (t4)); } } # 195 "ep.c" if ((0) == (1)) { timer_stop (2); } } _ompc_reduction (&_p_sy, _pp_sy, 14, 6); _ompc_reduction (&_p_sx, _pp_sx, 14, 6); _ompc_barrier (); } { _ompc_enter_critical (&__ompc_lock_critical); # 199 "ep.c" for ((i) = (0); (i) <= ((10) - (1)); (i)++) { (*((q) + (i))) += (*((qq) + (i))); } _ompc_exit_critical (&__ompc_lock_critical); } if (_ompc_is_master ()) { (*_pp_nthreads) = (omp_get_num_threads ()); } } }
/* c This is the serial version of the APP Benchmark 1, c the "embarassingly parallel" benchmark. c c M is the Log_2 of the number of complex pairs of uniform (0, 1) random c numbers. MK is the Log_2 of the size of each batch of uniform random c numbers. MK can be set for convenience on a given system, since it does c not affect the results. */ int main(int argc, char **argv) { double *x, **xx, *q, **qq; double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc; double dum[3] = { 1.0, 1.0, 1.0 }; const int TRANSFER_X = 1; int np, nn, ierr, node, no_nodes, i, l, k, nit, ierrcode, no_large_nodes, np_add, k_offset, j; double loc_x,loc_t1,loc_t2,loc_t3,loc_t4; double loc_a1,loc_a2,loc_x1,loc_x2,loc_z; boolean verified; char size[13+1]; /* character*13 */ /* Allocate working memory */ x = (double*) malloc(sizeof(double) * 2*NK); xx = (double**) malloc(sizeof(double*) * NN); xx[0] = (double*) malloc(sizeof(double) * NN * 2*NK); for (i = 1; i < NN; i++) xx[i] = xx[i-1] + (2*NK); q = (double*) malloc(sizeof(double) * NQ); qq = (double**) malloc(sizeof(double*) * NN); qq[0] = (double*) malloc(sizeof(double) * NN * NQ); for (i = 1; i < NN; i++) qq[i] = qq[i-1] + NQ; /* c Because the size of the problem is too large to store in a 32-bit c integer for some classes, we put it into a string (for printing). c Have to strip off the decimal point put in there by the floating c point print statement (internal file) */ printf("\n\n NAS Parallel Benchmarks 2.3 OpenACC C version" " - EP Benchmark\n"); sprintf(size, "%12.0f", pow(2.0, M+1)); for (j = 13; j >= 1; j--) { if (size[j] == '.') size[j] = ' '; } printf(" Number of random numbers generated: %13s\n", size); verified = FALSE; /* c Compute the number of "batches" of random number pairs generated c per processor. Adjust if the number of processors does not evenly c divide the total number */ np = NN; /* c Call the random number generator functions and initialize c the x-array to reduce the effects of paging on the timings. 
c Also, call all mathematical functions that are used. Make c sure these initializations cannot be eliminated as dead code. */ #pragma acc data create(qq[0:NN][0:NQ],x[0:2*NK],xx[0:NN][0:2*NK]) \ copyout(q[0:NQ]) { vranlc(0, &(dum[0]), dum[1], &(dum[2])); dum[0] = randlc(&(dum[1]), dum[2]); for (i = 0; i < 2*NK; i++) x[i] = -1.0e99; Mops = log(sqrt(fabs(max(1.0, 1.0)))); timer_clear(1); timer_clear(2); timer_clear(3); timer_start(1); vranlc(0, &t1, A, x); #pragma acc update device(x[0:2*NK]) /* Compute AN = A ^ (2 * NK) (mod 2^46). */ t1 = A; for ( i = 1; i <= MK+1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; gc = 0.0; sx = 0.0; sy = 0.0; #pragma acc parallel loop for (k = 0; k < np; k++) { /* Initialize private q (qq) */ #pragma acc loop for (i = 0; i < NQ; i++) qq[k][i] = 0.0; /* Initialize private x (xx) */ #pragma acc loop for (i = 0; i < 2*NK; i++) xx[k][i] = x[i]; } /* c Each instance of this loop may be performed independently. We compute c the k offsets separately to take into account the fact that some nodes c have more numbers to generate than others */ k_offset = -1; double t1, t2, t3, t4, x1, x2; int kk, i, ik, l; double psx, psy; #pragma acc parallel loop reduction(+:sx,sy) for (k = 1; k <= np; k++) { kk = k_offset + k; t1 = S; t2 = an; /* Find starting seed t1 for this kk. */ #pragma acc loop seq for (i = 1; i <= 100; i++) { ik = kk / 2; if (2 * ik != kk) t3 = RANDLC(&t1, t2); if (ik == 0) break; t3 = RANDLC(&t2, t2); kk = ik; } /* Compute uniform pseudorandom numbers. 
*/ loc_t1 = r23 * A; loc_a1 = (int)loc_t1; loc_a2 = A - t23 * loc_a1; loc_x = t1; #pragma acc loop seq for (i = 1; i <= 2*NK; i++) { loc_t1 = r23 * loc_x; loc_x1 = (int)loc_t1; loc_x2 = loc_x - t23 * loc_x1; loc_t1 = loc_a1 * loc_x2 + loc_a2 * loc_x1; loc_t2 = (int)(r23 * loc_t1); loc_z = loc_t1 - t23 * loc_t2; loc_t3 = t23 * loc_z + loc_a2 * loc_x2; loc_t4 = (int)(r46 * loc_t3); loc_x = loc_t3 - t46 * loc_t4; xx[k-1][i-1] = r46 * loc_x; } t1 = loc_x; /* c Compute Gaussian deviates by acceptance-rejection method and c tally counts in concentric square annuli. This loop is not c vectorizable. */ psx = psy = 0.0; #pragma acc loop reduction(+:psx,psy) for ( i = 0; i < NK; i++) { x1 = 2.0 * xx[k-1][2*i] - 1.0; x2 = 2.0 * xx[k-1][2*i+1] - 1.0; t1 = pow2(x1) + pow2(x2); if (t1 <= 1.0) { t2 = sqrt(-2.0 * log(t1) / t1); t3 = (x1 * t2); /* Xi */ t4 = (x2 * t2); /* Yi */ l = max(fabs(t3), fabs(t4)); qq[k-1][l] += 1.0; /* counts */ psx = psx + t3; /* sum of Xi */ psy = psy + t4; /* sum of Yi */ } } sx += psx; sy += psy; } /* Reduce private qq to q */ #pragma acc parallel loop reduction(+:gc) for ( i = 0; i < NQ; i++ ) { double sumq = 0.0; #pragma acc loop reduction(+:sumq) for (k = 0; k < np; k++) sumq = sumq + qq[k][i]; q[i] = sumq; gc += sumq; } } /* end acc data */ timer_stop(1); tm = timer_read(1); nit = 0; if (M == 24) { if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 25) { if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 28) { if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 30) { if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 32) { if ((fabs((sx- (4.764367927995374e4))/sx) <= 
EPSILON) && (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) { verified = TRUE; } } Mops = pow(2.0, M+1)/tm/1000000.0; printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n" "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy); for (i = 0; i <= NQ-1; i++) { printf("%3d %15.0f\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); return 0; }
int main() { double Mops, t1, t2, t3, t4, x1, x2; double sx, sy, tm, an, tt, gc; double sx_verify_value, sy_verify_value, sx_err, sy_err; int np; int i, ik, kk, l, k, nit; int k_offset, j; logical verified, timers_enabled; double dum[3] = {1.0, 1.0, 1.0}; char size[16]; FILE *fp; if ((fp = fopen("timer.flag", "r")) == NULL) { timers_enabled = false; } else { timers_enabled = true; fclose(fp); } //-------------------------------------------------------------------- // Because the size of the problem is too large to store in a 32-bit // integer for some classes, we put it into a string (for printing). // Have to strip off the decimal point put in there by the floating // point print statement (internal file) //-------------------------------------------------------------------- sprintf(size, "%15.0lf", pow(2.0, M+1)); j = 14; if (size[j] == '.') j--; size[j+1] = '\0'; printf("\n\n NAS Parallel Benchmarks (NPB3.3-SER-C) - EP Benchmark\n"); printf("\n Number of random numbers generated: %15s\n", size); verified = false; //-------------------------------------------------------------------- // Compute the number of "batches" of random number pairs generated // per processor. Adjust if the number of processors does not evenly // divide the total number //-------------------------------------------------------------------- np = NN; //-------------------------------------------------------------------- // Call the random number generator functions and initialize // the x-array to reduce the effects of paging on the timings. // Also, call all mathematical functions that are used. Make // sure these initializations cannot be eliminated as dead code. 
//-------------------------------------------------------------------- vranlc(0, &dum[0], dum[1], &dum[2]); dum[0] = randlc(&dum[1], dum[2]); for (i = 0; i < 2 * NK; i++) { x[i] = -1.0e99; } Mops = log(sqrt(fabs(MAX(1.0, 1.0)))); timer_clear(0); timer_clear(1); timer_clear(2); timer_start(0); t1 = A; vranlc(0, &t1, A, x); //-------------------------------------------------------------------- // Compute AN = A ^ (2 * NK) (mod 2^46). //-------------------------------------------------------------------- t1 = A; for (i = 0; i < MK + 1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; gc = 0.0; sx = 0.0; sy = 0.0; for (i = 0; i < NQ; i++) { q[i] = 0.0; } //-------------------------------------------------------------------- // Each instance of this loop may be performed independently. We compute // the k offsets separately to take into account the fact that some nodes // have more numbers to generate than others //-------------------------------------------------------------------- k_offset = -1; for (k = 1; k <= np; k++) { kk = k_offset + k; t1 = S; t2 = an; // Find starting seed t1 for this kk. for (i = 1; i <= 100; i++) { ik = kk / 2; if ((2 * ik) != kk) t3 = randlc(&t1, t2); if (ik == 0) break; t3 = randlc(&t2, t2); kk = ik; } //-------------------------------------------------------------------- // Compute uniform pseudorandom numbers. //-------------------------------------------------------------------- if (timers_enabled) timer_start(2); vranlc(2 * NK, &t1, A, x); if (timers_enabled) timer_stop(2); //-------------------------------------------------------------------- // Compute Gaussian deviates by acceptance-rejection method and // tally counts in concentri//square annuli. This loop is not // vectorizable. 
//-------------------------------------------------------------------- if (timers_enabled) timer_start(1); for (i = 0; i < NK; i++) { x1 = 2.0 * x[2*i] - 1.0; x2 = 2.0 * x[2*i+1] - 1.0; t1 = x1 * x1 + x2 * x2; if (t1 <= 1.0) { t2 = sqrt(-2.0 * log(t1) / t1); t3 = (x1 * t2); t4 = (x2 * t2); l = MAX(fabs(t3), fabs(t4)); q[l] = q[l] + 1.0; sx = sx + t3; sy = sy + t4; } } if (timers_enabled) timer_stop(1); } for (i = 0; i < NQ; i++) { gc = gc + q[i]; } timer_stop(0); tm = timer_read(0); nit = 0; verified = true; if (M == 24) { sx_verify_value = -3.247834652034740e+3; sy_verify_value = -6.958407078382297e+3; } else if (M == 25) { sx_verify_value = -2.863319731645753e+3; sy_verify_value = -6.320053679109499e+3; } else if (M == 28) { sx_verify_value = -4.295875165629892e+3; sy_verify_value = -1.580732573678431e+4; } else if (M == 30) { sx_verify_value = 4.033815542441498e+4; sy_verify_value = -2.660669192809235e+4; } else if (M == 32) { sx_verify_value = 4.764367927995374e+4; sy_verify_value = -8.084072988043731e+4; } else if (M == 36) { sx_verify_value = 1.982481200946593e+5; sy_verify_value = -1.020596636361769e+5; } else if (M == 40) { sx_verify_value = -5.319717441530e+05; sy_verify_value = -3.688834557731e+05; } else { verified = false; } if (verified) { sx_err = fabs((sx - sx_verify_value) / sx_verify_value); sy_err = fabs((sy - sy_verify_value) / sy_verify_value); verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON)); } Mops = pow(2.0, M+1) / tm / 1000000.0; printf("\nEP Benchmark Results:\n\n"); printf("CPU Time =%10.4lf\n", tm); printf("N = 2^%5d\n", M); printf("No. 
Gaussian Pairs = %15.0lf\n", gc); printf("Sums = %25.15lE %25.15lE\n", sx, sy); printf("Counts: \n"); for (i = 0; i < NQ; i++) { printf("%3d%15.0lf\n", i, q[i]); } print_results("EP", CLASS, M+1, 0, 0, nit, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); if (timers_enabled) { if (tm <= 0.0) tm = 1.0; tt = timer_read(0); printf("\nTotal time: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); tt = timer_read(1); printf("Gaussian pairs: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); tt = timer_read(2); printf("Random numbers: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); } return 0; }
int main(int argc, char *argv[]) { int i, j, k, it; double zeta; double rnorm; double norm_temp1, norm_temp2; double t, mflops, tmax; //char Class; logical verified; double zeta_verify_value, epsilon, err; char *t_names[T_last]; //openmp environment setting omp_set_dynamic(0); omp_set_num_threads(8); for (i = 0; i < T_last; i++) { timer_clear(i); } timer_start(T_init); firstrow = 0; lastrow = NA-1; firstcol = 0; lastcol = NA-1; zeta_verify_value = VALID_RESULT; printf("\nCG start...\n\n"); printf(" Size: %11d\n", NA); printf(" Iterations: %5d\n", NITER); printf("\n"); naa = NA; nzz = NZ; //--------------------------------------------------------------------- // Inialize random number generator //--------------------------------------------------------------------- tran = 314159265.0; amult = 1220703125.0; zeta = randlc(&tran, amult); //--------------------------------------------------------------------- // //--------------------------------------------------------------------- makea(naa, nzz, a, colidx, rowstr, firstrow, lastrow, firstcol, lastcol, arow, (int (*)[NONZER+1])(void*)acol, (double (*)[NONZER+1])(void*)aelt, iv); //--------------------------------------------------------------------- // Note: as a result of the above call to makea: // values of j used in indexing rowstr go from 0 --> lastrow-firstrow // values of colidx which are col indexes go from firstcol --> lastcol // So: // Shift the col index vals from actual (firstcol --> lastcol ) // to local, i.e., (0 --> lastcol-firstcol) //--------------------------------------------------------------------- #pragma omp parallel for collapse(2) for (j = 0; j < lastrow - firstrow + 1; j++) { for (k = rowstr[j]; k < rowstr[j+1]; k++) { colidx[k] = colidx[k] - firstcol; } } //--------------------------------------------------------------------- // set starting vector to (1, 1, .... 
1) //--------------------------------------------------------------------- #pragma omp parallel for for (i = 0; i < NA+1; i++) { x[i] = 1.0; } #pragma omp parallel for for (j = 0; j < lastcol - firstcol + 1; j++) { q[j] = 0.0; z[j] = 0.0; r[j] = 0.0; p[j] = 0.0; } zeta = 0.0; //--------------------------------------------------------------------- //----> // Do one iteration untimed to init all code and data page tables //----> (then reinit, start timing, to niter its) //--------------------------------------------------------------------- for (it = 1; it <= 1; it++) { //--------------------------------------------------------------------- // The call to the conjugate gradient routine: //--------------------------------------------------------------------- conj_grad(colidx, rowstr, x, z, a, p, q, r, &rnorm); //--------------------------------------------------------------------- // zeta = shift + 1/(x.z) // So, first: (x.z) // Also, find norm of z // So, first: (z.z) //--------------------------------------------------------------------- norm_temp1 = 0.0; norm_temp2 = 0.0; #pragma omp parallel for reduction(+:norm_temp1, norm_temp2) for (j = 0; j < lastcol - firstcol + 1; j++) { norm_temp1 = norm_temp1 + x[j] * z[j]; norm_temp2 = norm_temp2 + z[j] * z[j]; } norm_temp2 = 1.0 / sqrt(norm_temp2); //--------------------------------------------------------------------- // Normalize z to obtain x //--------------------------------------------------------------------- #pragma omp parallel for for (j = 0; j < lastcol - firstcol + 1; j++) { x[j] = norm_temp2 * z[j]; } } // end of do one iteration untimed //--------------------------------------------------------------------- // set starting vector to (1, 1, .... 
1) //--------------------------------------------------------------------- #pragma omp parallel for for (i = 0; i < NA+1; i++) { x[i] = 1.0; } zeta = 0.0; timer_stop(T_init); printf(" Initialization time = %15.3f seconds\n", timer_read(T_init)); timer_start(T_bench); //--------------------------------------------------------------------- //----> // Main Iteration for inverse power method //----> //--------------------------------------------------------------------- /* #pragma omp parallel for reduction(+:zeta) private(norm_temp1, norm_temp2) firstprivate(x, z, p, q) */ for (it = 1; it <= NITER; it++) { //--------------------------------------------------------------------- // The call to the conjugate gradient routine: //--------------------------------------------------------------------- if (timeron) timer_start(T_conj_grad); conj_grad(colidx, rowstr, x, z, a, p, q, r, &rnorm); if (timeron) timer_stop(T_conj_grad); //--------------------------------------------------------------------- // zeta = shift + 1/(x.z) // So, first: (x.z) // Also, find norm of z // So, first: (z.z) //--------------------------------------------------------------------- norm_temp1 = 0.0; norm_temp2 = 0.0; #pragma omp parallel for reduction(+:norm_temp1, norm_temp2) for (j = 0; j < lastcol - firstcol + 1; j++) { norm_temp1 = norm_temp1 + x[j]*z[j]; norm_temp2 = norm_temp2 + z[j]*z[j]; } norm_temp2 = 1.0 / sqrt(norm_temp2); zeta = SHIFT + 1.0 / norm_temp1; if (it == 1) printf("\n iteration ||r|| zeta\n"); printf(" %5d %20.14E%20.13f\n", it, rnorm, zeta); //--------------------------------------------------------------------- // Normalize z to obtain x //--------------------------------------------------------------------- #pragma omp parallel for for (j = 0; j < lastcol - firstcol + 1; j++) { x[j] = norm_temp2 * z[j]; } } // end of main iter inv pow meth timer_stop(T_bench); //--------------------------------------------------------------------- // End of timed section 
//--------------------------------------------------------------------- t = timer_read(T_bench); printf("\nComplete...\n"); epsilon = 1.0e-10; err = fabs(zeta - zeta_verify_value) / zeta_verify_value; if (err <= epsilon) { verified = true; printf(" VERIFICATION SUCCESSFUL\n"); printf(" Zeta is %20.13E\n", zeta); printf(" Error is %20.13E\n", err); } else { verified = false; printf(" VERIFICATION FAILED\n"); printf(" Zeta %20.13E\n", zeta); printf(" The correct zeta is %20.13E\n", zeta_verify_value); } printf("\n\nExecution time : %lf seconds\n\n", t); return 0; }
int main(int argc, char **argv) { int i, j, k, it; int nthreads = 1; double zeta; double rnorm; double norm_temp11; double norm_temp12; double t, mflops; char cclass; boolean verified; double zeta_verify_value, epsilon; firstrow = 1; lastrow = NA; firstcol = 1; lastcol = NA; if (NA == 1400 && NONZER == 7 && NITER == 15 && SHIFT == 10.0) { cclass = 'S'; zeta_verify_value = 8.5971775078648; } else if (NA == 7000 && NONZER == 8 && NITER == 15 && SHIFT == 12.0) { cclass = 'W'; zeta_verify_value = 10.362595087124; } else if (NA == 14000 && NONZER == 11 && NITER == 15 && SHIFT == 20.0) { cclass = 'A'; zeta_verify_value = 17.130235054029; } else if (NA == 75000 && NONZER == 13 && NITER == 75 && SHIFT == 60.0) { cclass = 'B'; zeta_verify_value = 22.712745482631; } else if (NA == 150000 && NONZER == 15 && NITER == 75 && SHIFT == 110.0) { cclass = 'C'; zeta_verify_value = 28.973605592845; } else { cclass = 'U'; } printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version" " - CG Benchmark\n"); printf(" Size: %10d\n", NA); printf(" Iterations: %5d\n", NITER); naa = NA; nzz = NZ; /*-------------------------------------------------------------------- c Initialize random number generator c-------------------------------------------------------------------*/ tran = 314159265.0; amult = 1220703125.0; zeta = randlc( &tran, amult ); /*-------------------------------------------------------------------- c c-------------------------------------------------------------------*/ makea(naa, nzz, a, colidx, rowstr, NONZER, firstrow, lastrow, firstcol, lastcol, RCOND, arow, acol, aelt, v, iv, SHIFT); /*--------------------------------------------------------------------- c Note: as a result of the above call to makea: c values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1 c values of colidx which are col indexes go from firstcol --> lastcol c So: c Shift the col index vals from actual (firstcol --> lastcol ) c to local, i.e., (1 --> lastcol-firstcol+1) 
c---------------------------------------------------------------------*/ #pragma omp parallel private(it,i,j,k) { #pragma omp for nowait for (j = 1; j <= lastrow - firstrow + 1; j++) { for (k = rowstr[j]; k < rowstr[j+1]; k++) { colidx[k] = colidx[k] - firstcol + 1; } } /*-------------------------------------------------------------------- c set starting vector to (1, 1, .... 1) c-------------------------------------------------------------------*/ #pragma omp for nowait for (i = 1; i <= NA+1; i++) { x[i] = 1.0; } #pragma omp single zeta = 0.0; /*------------------------------------------------------------------- c----> c Do one iteration untimed to init all code and data page tables c----> (then reinit, start timing, to niter its) c-------------------------------------------------------------------*/ for (it = 1; it <= 1; it++) { /*-------------------------------------------------------------------- c The call to the conjugate gradient routine: c-------------------------------------------------------------------*/ conj_grad (colidx, rowstr, x, z, a, p, q, r, w, &rnorm); /*-------------------------------------------------------------------- c zeta = shift + 1/(x.z) c So, first: (x.z) c Also, find norm of z c So, first: (z.z) c-------------------------------------------------------------------*/ #pragma omp single { norm_temp11 = 0.0; norm_temp12 = 0.0; } /* end single */ #pragma omp for reduction(+:norm_temp11,norm_temp12) for (j = 1; j <= lastcol-firstcol+1; j++) { norm_temp11 = norm_temp11 + x[j]*z[j]; norm_temp12 = norm_temp12 + z[j]*z[j]; } #pragma omp single norm_temp12 = 1.0 / sqrt( norm_temp12 ); /*-------------------------------------------------------------------- c Normalize z to obtain x c-------------------------------------------------------------------*/ #pragma omp for for (j = 1; j <= lastcol-firstcol+1; j++) { x[j] = norm_temp12*z[j]; } } /* end of do one iteration untimed */ /*-------------------------------------------------------------------- c 
set starting vector to (1, 1, .... 1) c-------------------------------------------------------------------*/ #pragma omp for nowait for (i = 1; i <= NA+1; i++) { x[i] = 1.0; } #pragma omp single zeta = 0.0; } /* end parallel */ timer_clear( 1 ); timer_start( 1 ); /*-------------------------------------------------------------------- c----> c Main Iteration for inverse power method c----> c-------------------------------------------------------------------*/ #pragma omp parallel private(it,i,j,k) { for (it = 1; it <= NITER; it++) { /*-------------------------------------------------------------------- c The call to the conjugate gradient routine: c-------------------------------------------------------------------*/ conj_grad(colidx, rowstr, x, z, a, p, q, r, w, &rnorm); /*-------------------------------------------------------------------- c zeta = shift + 1/(x.z) c So, first: (x.z) c Also, find norm of z c So, first: (z.z) c-------------------------------------------------------------------*/ #pragma omp single { norm_temp11 = 0.0; norm_temp12 = 0.0; } /* end single */ #pragma omp for reduction(+:norm_temp11,norm_temp12) for (j = 1; j <= lastcol-firstcol+1; j++) { norm_temp11 = norm_temp11 + x[j]*z[j]; norm_temp12 = norm_temp12 + z[j]*z[j]; } #pragma omp single { norm_temp12 = 1.0 / sqrt( norm_temp12 ); zeta = SHIFT + 1.0 / norm_temp11; } /* end single */ #pragma omp master { if( it == 1 ) { printf(" iteration ||r|| zeta\n"); } printf(" %5d %20.14e%20.13e\n", it, rnorm, zeta); } /* end master */ /*-------------------------------------------------------------------- c Normalize z to obtain x c-------------------------------------------------------------------*/ #pragma omp for for (j = 1; j <= lastcol-firstcol+1; j++) { x[j] = norm_temp12*z[j]; } } /* end of main iter inv pow meth */ #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* end parallel */ timer_stop( 1 ); 
/*-------------------------------------------------------------------- c End of timed section c-------------------------------------------------------------------*/ t = timer_read( 1 ); printf(" Benchmark completed\n"); epsilon = 1.0e-10; if (cclass != 'U') { if (fabs(zeta - zeta_verify_value) <= epsilon) { verified = TRUE; printf(" VERIFICATION SUCCESSFUL\n"); printf(" Zeta is %20.12e\n", zeta); printf(" Error is %20.12e\n", zeta - zeta_verify_value); } else { verified = FALSE; printf(" VERIFICATION FAILED\n"); printf(" Zeta %20.12e\n", zeta); printf(" The correct zeta is %20.12e\n", zeta_verify_value); } } else { verified = FALSE; printf(" Problem size unknown\n"); printf(" NO VERIFICATION PERFORMED\n"); } if ( t != 0.0 ) { mflops = (2.0*NITER*NA) * (3.0+(NONZER*(NONZER+1)) + 25.0*(5.0+(NONZER*(NONZER+1))) + 3.0 ) / t / 1000000.0; } else { mflops = 0.0; } c_print_results("CG", cclass, NA, 0, 0, NITER, nthreads, t, mflops, " floating point", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); }