Exemplo n.º 1
0
/*
 * Minimal driver: calls vranlc with the array pointer moved back by one
 * element, then checks that the first element of x equals 0.5.
 * NOTE(review): x and vranlc are declared outside this snippet.  Passing
 * x-1 presumably compensates for a callee that writes through 1-based
 * indices -- confirm against the vranlc definition.
 */
int main()
{
  /* Pointer is shifted back one slot before the call. */
  vranlc(x-1);
 /* 
   vranlc(x); // wrong translation!!
   */
  /* Exact floating-point comparison against 0.5. */
  assert (x[0]==0.5);
  return 0;
}
Exemplo n.º 2
0
/*--------------------------------------------------------------------
c Fill array u0 with initial conditions drawn from the random number
c generator, one z plane at a time.
c
c u0 : destination array; the .real and .imag members of every element
c      in planes 0 .. dims[0][2]-1 and rows 0 .. dims[0][1]-1 are set.
c d  : not referenced in this routine.
c-------------------------------------------------------------------*/
static void compute_initial_conditions(dcomplex u0[NZ][NY][NX], int d[3]) {

    static double plane_buf[NX*2*MAXDIM+1];
    double seed = SEED;
    double plane_seed, mult, discard;
    int plane, row, col, pos;

    /* Advance the seed to the starting element of the first plane. */
    ipow46(A, (zstart[0]-1)*2*NX*NY + (ystart[0]-1)*2*NX, &mult);
    discard = randlc(&seed, mult);

    /* Multiplier applied to the seed after each plane. */
    ipow46(A, 2*NX*NY, &mult);

    plane = 0;
    while (plane < dims[0][2]) {
        /* Generate the plane from a copy so that seed itself only moves
           through the randlc call below. */
        plane_seed = seed;
        vranlc(2*NX*dims[0][1], &plane_seed, A, plane_buf);

        /* Values are consumed in (real, imag) pairs starting at index 1;
           slot 0 of the buffer is not read here. */
        pos = 1;
        for (row = 0; row < dims[0][1]; row++) {
            for (col = 0; col < NX; col++) {
                u0[plane][row][col].real = plane_buf[pos];
                u0[plane][row][col].imag = plane_buf[pos + 1];
                pos += 2;
            }
        }

        /* Inside this loop plane is always below dims[0][2], so the test
           holds on every pass. */
        if (plane != dims[0][2]) {
            discard = randlc(&seed, mult);
        }
        plane++;
    }
}
Exemplo n.º 3
0
/*--------------------------------------------------------------------
c     zran3 fills the grid z with random values, records the MM largest
c     and MM smallest interior values, then clears the grid and writes
c     +1.0 / -1.0 at the recorded positions before calling comm3.
c
c     z          : grid stored flat, element (i1,i2,i3) lives at
c                  z[i1 + n1*(i2 + n2*i3)]
c     n1,n2,n3   : extents used for that indexing
c     nx,ny      : used to build the generator jump multipliers
c     k          : forwarded unchanged to comm3
c
c     is1..is3 and ie1..ie3 are read but not declared in this routine.
c-------------------------------------------------------------------*/
static void zran3(double *z, int n1, int n2, int n3, int nx, int ny, int k) {

/*--------------------------------------------------------------------
c-------------------------------------------------------------------*/

/*--------------------------------------------------------------------
c     zran3  loads +1 at ten randomly chosen points,
c     loads -1 at a different ten random points,
c     and zero elsewhere.
c-------------------------------------------------------------------*/

/* MM: number of candidates kept per sign; A: generator multiplier;
   X: initial seed.  These macros stay defined after this function. */
#define MM	10
#define	A	pow(5.0,13)
#define	X	314159265.e0    
    
    int i0, m0, m1;
    int i1, i2, i3, d1, e1, e2, e3;
    double xx, x0, x1, a1, a2, ai;

    /* Second index of ten/j1/j2/j3/jg: 1 = largest values, 0 = smallest. */
    double ten[MM][2], best;
    int i, j1[MM][2], j2[MM][2], j3[MM][2];
    int jg[4][MM][2];

    double rdummy;

    /* Multipliers that advance the generator by one row (nx values) and
       by one plane (nx*ny values). */
    a1 = power( A, nx );
    a2 = power( A, nx*ny );

    /* Zero the whole grid; the inline triple loop is the active branch. */
#if 0
#pragma omp parallel
  {
    zero3(z,n1,n2,n3);
  }
#else
#pragma omp parallel for private(i2, i1)    
  for (i3 = 0;i3 < n3; i3++) {
    for (i2 = 0; i2 < n2; i2++) {
      for (i1 = 0; i1 < n1; i1++) {
	int i123 = i1 + n1*(i2 + n2*i3);
	z[i123] = 0.0;
      }
    }
  }
#endif

    /* Offset of this block's first point in the nx-by-ny ordering. */
    i = is1-1+nx*(is2-1+ny*(is3-1));

    ai = power( A, i );
    d1 = ie1 - is1 + 1;
    e1 = ie1 - is1 + 2;
    e2 = ie2 - is2 + 2;
    e3 = ie3 - is3 + 2;
    x0 = X;
    /* Jump the seed forward by the offset computed above. */
    rdummy = randlc( &x0, ai );
    
    /* x0 tracks the seed per plane and x1 per row; xx is a scratch copy
       handed to vranlc so the row seed is advanced only by randlc. */
    for (i3 = 1; i3 < e3; i3++) {
	x1 = x0;
	for (i2 = 1; i2 < e2; i2++) {
            xx = x1;
            vranlc( d1, &xx, A, &(z[0+n1*(i2 + n2*i3)]));
            rdummy = randlc( &x1, a1 );
	}
	rdummy = randlc( &x0, a2 );
    }

/*--------------------------------------------------------------------
c       call comm3(z,n1,n2,n3)
c       call showall(z,n1,n2,n3)
c-------------------------------------------------------------------*/

/*--------------------------------------------------------------------
c     each processor looks for twenty candidates
c-------------------------------------------------------------------*/
    /* Seed the candidate tables: 0.0 for the "largest" column and 1.0
       for the "smallest" column, all coordinates zero. */
    for (i = 0; i < MM; i++) {
	ten[i][1] = 0.0;
	j1[i][1] = 0;
	j2[i][1] = 0;
	j3[i][1] = 0;
	ten[i][0] = 1.0;
	j1[i][0] = 0;
	j2[i][0] = 0;
	j3[i][0] = 0;
    }
    /* Scan interior points only; slot 0 of each table is overwritten by
       a better value and bubble() is then called on the tables. */
    for (i3 = 1; i3 < n3-1; i3++) {
	for (i2 = 1; i2 < n2-1; i2++) {
            for (i1 = 1; i1 < n1-1; i1++) {
	      int i123 = i1 + n1*(i2 + n2*i3);
		if ( z[i123] > ten[0][1] ) {
		    ten[0][1] = z[i123];
		    j1[0][1] = i1;
		    j2[0][1] = i2;
		    j3[0][1] = i3;
		    bubble( ten, j1, j2, j3, MM, 1 );
		}
		if ( z[i123] < ten[0][0] ) {
		    ten[0][0] = z[i123];
		    j1[0][0] = i1;
		    j2[0][0] = i2;
		    j3[0][0] = i3;
		    bubble( ten, j1, j2, j3, MM, 0 );
		}
	    }
	}
    }

/*--------------------------------------------------------------------
c     Now which of these are globally best?
c-------------------------------------------------------------------*/
    /* From here i1 and i0 are reused as cursors into the candidate
       tables rather than as grid indices. */
    i1 = MM - 1;
    i0 = MM - 1;
    for (i = MM - 1 ; i >= 0; i--) {
      int j123 = j1[i1][1] + n1*(j2[i1][1] + n2*j3[i1][1]);
	best = z[j123];
	/* NOTE(review): best was just assigned from z[j123], so this
	   comparison holds for any non-NaN value and the else branch is
	   not taken; presumably a global reduction once sat between the
	   two statements -- confirm. */
	if (best == z[j123]) {
            jg[0][i][1] = 0;
            jg[1][i][1] = is1 - 1 + j1[i1][1];
            jg[2][i][1] = is2 - 1 + j2[i1][1];
            jg[3][i][1] = is3 - 1 + j3[i1][1];
            i1 = i1-1;
	} else {
            jg[0][i][1] = 0;
            jg[1][i][1] = 0;
            jg[2][i][1] = 0;
            jg[3][i][1] = 0;
	}
	ten[i][1] = best;
      j123 = j1[i0][0] + n1*(j2[i0][0] + n2*j3[i0][0]);
	best = z[j123];
	/* Same self-comparison as above, for the "smallest" column. */
	if (best == z[j123]) {
            jg[0][i][0] = 0;
            jg[1][i][0] = is1 - 1 + j1[i0][0];
            jg[2][i][0] = is2 - 1 + j2[i0][0];
            jg[3][i][0] = is3 - 1 + j3[i0][0];
            i0 = i0-1;
	} else {
            jg[0][i][0] = 0;
            jg[1][i][0] = 0;
            jg[2][i][0] = 0;
            jg[3][i][0] = 0;
	}
	ten[i][0] = best;
    }
    /* Lower bounds for the two marking loops further down. */
    m1 = i1+1;
    m0 = i0+1;

/*    printf(" negative charges at");
    for (i = 0; i < MM; i++) {
	if (i%5 == 0) printf("\n");
	printf(" (%3d,%3d,%3d)", jg[1][i][0], jg[2][i][0], jg[3][i][0]);
    }
    printf("\n positive charges at");
    for (i = 0; i < MM; i++) {
	if (i%5 == 0) printf("\n");
	printf(" (%3d,%3d,%3d)", jg[1][i][1], jg[2][i][1], jg[3][i][1]);
    }
    printf("\n small random numbers were\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %15.8e", ten[i][0]);
    }
    printf("\n and they were found on processor number\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %4d", jg[0][i][0]);
    }
    printf("\n large random numbers were\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %15.8e", ten[i][1]);
    }
    printf("\n and they were found on processor number\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %4d", jg[0][i][1]);
    }
    printf("\n");*/

    /* Clear the grid again; here the zero3() call is the active branch. */
#if 0
#pragma omp parallel for private(i2, i1)    
for (i3 = 0; i3 < n3; i3++) {
  for (i2 = 0; i2 < n2; i2++) {
    for (i1 = 0; i1 < n1; i1++) {
      int i123 = i1 + n1*(i2+n2*i3);
      z[i123] = 0.0;
    }
  }
 }
#else
#pragma omp parallel
    {
      zero3(z,n1,n2,n3);
    }
#endif

    /* Write -1.0 at the recorded minima and +1.0 at the recorded maxima.
       NOTE(review): the data clause names jg, but the loops read j1, j2
       and j3 -- confirm the intended clause. */
#pragma acc parallel present(z[0:n3*n2*n1]) copyin(jg)
{
#pragma acc loop
    for (i = MM-1; i >= m0; i--) {
      int j123 = j1[i][0] + n1*(j2[i][0] + n2*j3[i][0]);
	z[j123] = -1.0;
    }
#pragma acc loop
    for (i = MM-1; i >= m1; i--) {
      int j123 = j1[i][1] + n1*(j2[i][1] + n2*j3[i][1]);
	z[j123] = 1.0;
    }
} // end acc parallel                                                         
#pragma omp parallel    
    comm3(z,n1,n2,n3,k);

/*--------------------------------------------------------------------
c          call showall(z,n1,n2,n3)
c-------------------------------------------------------------------*/
}
Exemplo n.º 4
0
/*
c   This is the serial version of the APP Benchmark 1,
c   the "embarrassingly parallel" benchmark.
c
c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
c   numbers.  MK is the Log_2 of the size of each batch of uniform random
c   numbers.  MK can be set for convenience on a given system, since it does
c   not affect the results.
*/
/*
 * EP benchmark driver using OpenMP.
 *
 * Generates NN batches of 2*NK uniform random numbers, turns accepted
 * pairs into Gaussian deviates, accumulates their sums (sx, sy) and a
 * histogram of max(|Xi|,|Yi|) in q[], then compares the sums against the
 * reference values for the supported problem sizes M and prints a report.
 *
 * argc/argv are not used.  Returns 0.
 *
 * x, q and the constants M, MK, NK, NN, NQ, A, S, EPSILON are declared
 * outside this function.
 */
int main(int argc, char **argv) {

    double Mops, t1, t2, sx, sy, tm, an, tt, gc;
    double dum[3] = { 1.0, 1.0, 1.0 };
    int np, i, k, nit, k_offset, j;
    int nthreads = 1;
    boolean verified;
    char size[13+1];	/* character*13 */

/*
c   Because the size of the problem is too large to store in a 32-bit
c   integer for some classes, we put it into a string (for printing).
c   Have to strip off the decimal point put in there by the floating
c   point print statement (internal file)
*/

    printf("\n\n NAS Parallel Benchmarks 3.0 structured OpenMP C version"
	   " - EP Benchmark\n");
    sprintf(size, "%12.0f", pow(2.0, M+1));
    /* Scan only the characters sprintf actually wrote; the previous loop
       started at index 13, beyond the terminator. */
    for (j = 0; size[j] != '\0'; j++) {
	if (size[j] == '.') size[j] = ' ';
    }
    printf(" Number of random numbers generated: %13s\n", size);

    verified = FALSE;

/*
c   Compute the number of "batches" of random number pairs generated 
c   per processor. Adjust if the number of processors does not evenly 
c   divide the total number
*/
    np = NN;

/*
c   Call the random number generator functions and initialize
c   the x-array to reduce the effects of paging on the timings.
c   Also, call all mathematical functions that are used. Make
c   sure these initializations cannot be eliminated as dead code.
*/
    vranlc(0, &(dum[0]), dum[1], &(dum[2]));
    dum[0] = randlc(&(dum[1]), dum[2]);
    
#pragma omp parallel for default(shared) private(i)
    for (i = 0; i < 2*NK; i++) x[i] = -1.0e99;
    
    Mops = log(sqrt(fabs(max(1.0, 1.0))));

    timer_clear(1);
    timer_clear(2);
    timer_clear(3);
    timer_start(1);

    /* Give the seed a defined value before its address is handed to the
       warm-up call; it was previously passed uninitialised. */
    t1 = A;
    vranlc(0, &t1, A, x);

/*   Compute AN = A ^ (2 * NK) (mod 2^46). */

    t1 = A;

    for ( i = 1; i <= MK+1; i++) {
	t2 = randlc(&t1, t1);
    }

    an = t1;
    tt = S;
    gc = 0.0;
    sx = 0.0;
    sy = 0.0;

    for ( i = 0; i <= NQ - 1; i++) {
	q[i] = 0.0;
    }
      
/*
c   Each instance of this loop may be performed independently. We compute
c   the k offsets separately to take into account the fact that some nodes
c   have more numbers to generate than others
*/
    k_offset = -1;

#pragma omp parallel copyin(x)
{
    /* Per-thread scratch scalars; these shadow the outer t1/t2/i. */
    double t1, t2, t3, t4, x1, x2;
    int kk, i, ik, l;
    double qq[NQ];		/* private copy of q[0:NQ-1] */

    for (i = 0; i < NQ; i++) qq[i] = 0.0;

#pragma omp for reduction(+:sx,sy) schedule(static)  
    for (k = 1; k <= np; k++) {
	kk = k_offset + k;
	t1 = S;
	t2 = an;

/*      Find starting seed t1 for this kk. */

	for (i = 1; i <= 100; i++) {
            ik = kk / 2;
            if (2 * ik != kk) t3 = randlc(&t1, t2);
            if (ik == 0) break;
            t3 = randlc(&t2, t2);
            kk = ik;
	}

/*      Compute uniform pseudorandom numbers. */

	if (TIMERS_ENABLED == TRUE) timer_start(3);
	/* NOTE(review): x-1 presumably matches a vranlc that stores
	   through 1-based indices -- confirm against its definition. */
	vranlc(2*NK, &t1, A, x-1);
	if (TIMERS_ENABLED == TRUE) timer_stop(3);

/*
c       Compute Gaussian deviates by acceptance-rejection method and 
c       tally counts in concentric square annuli.  This loop is not 
c       vectorizable.
*/
	if (TIMERS_ENABLED == TRUE) timer_start(2);

	for ( i = 0; i < NK; i++) {
            x1 = 2.0 * x[2*i] - 1.0;
            x2 = 2.0 * x[2*i+1] - 1.0;
            t1 = pow2(x1) + pow2(x2);
            if (t1 <= 1.0) {
		t2 = sqrt(-2.0 * log(t1) / t1);
		t3 = (x1 * t2);				/* Xi */
		t4 = (x2 * t2);				/* Yi */
		l = max(fabs(t3), fabs(t4));
		qq[l] += 1.0;				/* counts */
		sx = sx + t3;				/* sum of Xi */
		sy = sy + t4;				/* sum of Yi */
            }
	}
	if (TIMERS_ENABLED == TRUE) timer_stop(2);
    }
    /* Fold this thread's histogram into the shared one. */
#pragma omp critical
    {
      for (i = 0; i <= NQ - 1; i++) q[i] += qq[i];
    }
#if defined(_OPENMP)
#pragma omp master
    nthreads = omp_get_num_threads();
#endif /* _OPENMP */    
} /* end of parallel region */    

    for (i = 0; i <= NQ-1; i++) {
        gc = gc + q[i];
    }

    timer_stop(1);
    tm = timer_read(1);

    /* Compare the sums with the reference values for the known sizes;
       any other M leaves verified at FALSE. */
    nit = 0;
    if (M == 24) {
	if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) &&
	   (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 25) {
	if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 28) {
	if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 30) {
	if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 32) {
	if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    }

    Mops = pow(2.0, M+1)/tm/1000000.0;

    printf("EP Benchmark Results: \n"
	   "CPU Time = %10.4f\n"
	   "N = 2^%5d\n"
	   "No. Gaussian Pairs = %15.0f\n"
	   "Sums = %25.15e %25.15e\n"
	   "Counts:\n",
	   tm, M, gc, sx, sy);
    for (i = 0; i  <= NQ-1; i++) {
	printf("%3d %15.0f\n", i, q[i]);
    }
	  
    c_print_results("EP", CLASS, M+1, 0, 0, nit, nthreads,
		  tm, Mops, 	
		  "Random numbers generated",
		  verified, NPBVERSION, COMPILETIME,
		  CS1, CS2, CS3, CS4, CS5, CS6, CS7);

    if (TIMERS_ENABLED == TRUE) {
	/* One timer per line; the three values previously ran together. */
	printf("Total time:     %f\n", timer_read(1));
	printf("Gaussian pairs: %f\n", timer_read(2));
	printf("Random numbers: %f\n", timer_read(3));
    }

    return 0;
}
Exemplo n.º 5
0
/*
 * Serial EP benchmark driver.
 *
 * Generates NN batches of 2*NK uniform random numbers, converts accepted
 * pairs to Gaussian deviates, accumulates their sums and a histogram of
 * max(|Xi|,|Yi|) in q[], checks the sums against the reference values
 * for the known problem sizes M, and prints the report.  Section timers
 * are used only when a file named "timer.flag" can be opened.
 *
 * Returns 0.
 */
int main() 
{
  double Mops, t1, t2, t3, t4, x1, x2;
  double sx, sy, tm, an, tt, gc;
  double sx_verify_value, sy_verify_value, sx_err, sy_err;
  int    batches;
  int    i, half, rem, bin, batch, nit;
  int    k_offset, last;
  logical verified, timers_enabled;

  double dum[3] = {1.0, 1.0, 1.0};
  char   size_str[16];

  FILE *flag_file;

  // Timers are switched on by the presence of the flag file.
  flag_file = fopen("timer.flag", "r");
  if (flag_file != NULL) {
    timers_enabled = true;
    fclose(flag_file);
  } else {
    timers_enabled = false;
  }

  // The count is formatted as a floating-point value and a trailing
  // decimal point, if present at the last position, is cut off.
  sprintf(size_str, "%15.0lf", pow(2.0, M+1));
  last = (size_str[14] == '.') ? 13 : 14;
  size_str[last + 1] = '\0';
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-SER-C) - EP Benchmark\n");
  printf("\n Number of random numbers generated: %15s\n", size_str);

  verified = false;

  // Number of batches of random number pairs to generate.
  batches = NN; 

  // Warm-up: call the generators and the math routines once and touch
  // every element of x before the timed section starts.
  vranlc(0, &dum[0], dum[1], &dum[2]);
  dum[0] = randlc(&dum[1], dum[2]);
  i = 0;
  while (i < 2 * NK) {
    x[i] = -1.0e99;
    i++;
  }
  Mops = log(sqrt(fabs(MAX(1.0, 1.0))));   

  timer_clear(0);
  timer_clear(1);
  timer_clear(2);
  timer_start(0);

  t1 = A;
  vranlc(0, &t1, A, x);

  // Compute AN = A ^ (2 * NK) (mod 2^46) by repeated squaring.
  t1 = A;
  i = 0;
  while (i < MK + 1) {
    t2 = randlc(&t1, t1);
    i++;
  }

  an = t1;
  tt = S;
  gc = 0.0;
  sx = 0.0;
  sy = 0.0;

  for (i = 0; i < NQ; i++) {
    q[i] = 0.0;
  }

  // Each batch is independent of the others; batch numbers are shifted
  // by k_offset before the starting seed is derived.
  k_offset = -1;

  for (batch = 1; batch <= batches; batch++) {
    rem = k_offset + batch; 
    t1 = S;
    t2 = an;

    // Find the starting seed t1 for this batch by walking the bits of
    // rem from least significant upward.
    for (i = 1; i <= 100; i++) {
      half = rem / 2;
      if (rem % 2 != 0) {
        t3 = randlc(&t1, t2);
      }
      if (half == 0) {
        break;
      }
      t3 = randlc(&t2, t2);
      rem = half;
    }

    // Uniform pseudorandom numbers for this batch.
    if (timers_enabled) {
      timer_start(2);
    }
    vranlc(2 * NK, &t1, A, x);
    if (timers_enabled) {
      timer_stop(2);
    }

    // Gaussian deviates by acceptance-rejection, tallied by the integer
    // part of max(|Xi|, |Yi|).
    if (timers_enabled) {
      timer_start(1);
    }

    for (i = 0; i < NK; i++) {
      x1 = 2.0 * x[2*i] - 1.0;
      x2 = 2.0 * x[2*i+1] - 1.0;
      t1 = x1 * x1 + x2 * x2;
      if (!(t1 <= 1.0)) {
        continue;
      }
      t2  = sqrt(-2.0 * log(t1) / t1);
      t3  = (x1 * t2);
      t4  = (x2 * t2);
      bin = MAX(fabs(t3), fabs(t4));
      q[bin] += 1.0;
      sx += t3;
      sy += t4;
    }

    if (timers_enabled) {
      timer_stop(1);
    }
  }

  for (i = 0; i < NQ; i++) {
    gc += q[i];
  }

  timer_stop(0);
  tm = timer_read(0);

  // Pick the reference sums for this problem size; unknown sizes fail
  // verification outright.
  nit = 0;
  verified = true;
  switch (M) {
  case 24:
    sx_verify_value = -3.247834652034740e+3;
    sy_verify_value = -6.958407078382297e+3;
    break;
  case 25:
    sx_verify_value = -2.863319731645753e+3;
    sy_verify_value = -6.320053679109499e+3;
    break;
  case 28:
    sx_verify_value = -4.295875165629892e+3;
    sy_verify_value = -1.580732573678431e+4;
    break;
  case 30:
    sx_verify_value =  4.033815542441498e+4;
    sy_verify_value = -2.660669192809235e+4;
    break;
  case 32:
    sx_verify_value =  4.764367927995374e+4;
    sy_verify_value = -8.084072988043731e+4;
    break;
  case 36:
    sx_verify_value =  1.982481200946593e+5;
    sy_verify_value = -1.020596636361769e+5;
    break;
  case 40:
    sx_verify_value = -5.319717441530e+05;
    sy_verify_value = -3.688834557731e+05;
    break;
  default:
    verified = false;
    break;
  }

  if (verified) {
    // Relative error against the reference values.
    sx_err = fabs((sx - sx_verify_value) / sx_verify_value);
    sy_err = fabs((sy - sy_verify_value) / sy_verify_value);
    verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON));
  }

  Mops = pow(2.0, M+1) / tm / 1000000.0;

  printf("\nEP Benchmark Results:\n\n");
  printf("CPU Time =%10.4lf\n", tm);
  printf("N = 2^%5d\n", M);
  printf("No. Gaussian Pairs = %15.0lf\n", gc);
  printf("Sums = %25.15lE %25.15lE\n", sx, sy);
  printf("Counts: \n");
  i = 0;
  while (i < NQ) {
    printf("%3d%15.0lf\n", i, q[i]);
    i++;
  }

  print_results("EP", CLASS, M+1, 0, 0, nit,
      tm, Mops, 
      "Random numbers generated",
      verified, NPBVERSION, COMPILETIME, CS1,
      CS2, CS3, CS4, CS5, CS6, CS7);

  if (timers_enabled) {
    // Avoid dividing by a non-positive total when forming percentages.
    if (tm <= 0.0) {
      tm = 1.0;
    }
    tt = timer_read(0);
    printf("\nTotal time:     %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
    tt = timer_read(1);
    printf("Gaussian pairs: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
    tt = timer_read(2);
    printf("Random numbers: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
  }

  return 0;
}
Exemplo n.º 6
0
/*
c   This is the serial version of the APP Benchmark 1,
c   the "embarrassingly parallel" benchmark.
c
c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
c   numbers.  MK is the Log_2 of the size of each batch of uniform random
c   numbers.  MK can be set for convenience on a given system, since it does
c   not affect the results.
*/
int main(int argc, char **argv) {

    double *x, **xx, *q, **qq;

    double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc;
    double dum[3] = { 1.0, 1.0, 1.0 };
    const int TRANSFER_X = 1;
    int np, nn, ierr, node, no_nodes, i, l, k, nit, ierrcode,
    no_large_nodes, np_add, k_offset, j;
    double loc_x,loc_t1,loc_t2,loc_t3,loc_t4;
    double loc_a1,loc_a2,loc_x1,loc_x2,loc_z;
    boolean verified;
    char size[13+1];	/* character*13 */
    
/*     Allocate working memory       */

    x = (double*) malloc(sizeof(double) * 2*NK);
    xx = (double**) malloc(sizeof(double*) * NN);
    xx[0] = (double*) malloc(sizeof(double) * NN * 2*NK);
    for (i = 1; i < NN; i++) xx[i] = xx[i-1] + (2*NK);
    q = (double*) malloc(sizeof(double) * NQ);
    qq = (double**) malloc(sizeof(double*) * NN);
    qq[0] = (double*) malloc(sizeof(double) * NN * NQ);
    for (i = 1; i < NN; i++) qq[i] = qq[i-1] + NQ;

/*
c   Because the size of the problem is too large to store in a 32-bit
c   integer for some classes, we put it into a string (for printing).
c   Have to strip off the decimal point put in there by the floating
c   point print statement (internal file)
*/

    printf("\n\n NAS Parallel Benchmarks 2.3 OpenACC C version"
	   " - EP Benchmark\n");
    sprintf(size, "%12.0f", pow(2.0, M+1));
    for (j = 13; j >= 1; j--) {
	if (size[j] == '.') size[j] = ' ';
    }
    printf(" Number of random numbers generated: %13s\n", size);

    verified = FALSE;

/*
c   Compute the number of "batches" of random number pairs generated 
c   per processor. Adjust if the number of processors does not evenly 
c   divide the total number
*/
    np = NN;

/*
c   Call the random number generator functions and initialize
c   the x-array to reduce the effects of paging on the timings.
c   Also, call all mathematical functions that are used. Make
c   sure these initializations cannot be eliminated as dead code.
*/
#pragma acc data create(qq[0:NN][0:NQ],x[0:2*NK],xx[0:NN][0:2*NK]) \
    copyout(q[0:NQ])
{
    vranlc(0, &(dum[0]), dum[1], &(dum[2]));
    dum[0] = randlc(&(dum[1]), dum[2]);
    for (i = 0; i < 2*NK; i++) x[i] = -1.0e99;
    Mops = log(sqrt(fabs(max(1.0, 1.0))));

    timer_clear(1);
    timer_clear(2);
    timer_clear(3);
    timer_start(1);

    vranlc(0, &t1, A, x);
    #pragma acc update device(x[0:2*NK])

/*   Compute AN = A ^ (2 * NK) (mod 2^46). */

    t1 = A;

    for ( i = 1; i <= MK+1; i++) {
      t2 = randlc(&t1, t1);
    }

    an = t1;
    tt = S;
    gc = 0.0;
    sx = 0.0;
    sy = 0.0;
    
    #pragma acc parallel loop
    for (k = 0; k < np; k++) {
      /* Initialize private q (qq) */
      #pragma acc loop
      for (i = 0; i < NQ; i++)
          qq[k][i] = 0.0;
      /* Initialize private x (xx)  */
      #pragma acc loop
      for (i = 0; i < 2*NK; i++)
          xx[k][i] = x[i];
    }
      
/*
c   Each instance of this loop may be performed independently. We compute
c   the k offsets separately to take into account the fact that some nodes
c   have more numbers to generate than others
*/
    k_offset = -1;

    double t1, t2, t3, t4, x1, x2;
    int kk, i, ik, l;
    double psx, psy;

    #pragma acc parallel loop reduction(+:sx,sy)
    for (k = 1; k <= np; k++) {
      kk = k_offset + k;
      t1 = S;
      t2 = an;

/*      Find starting seed t1 for this kk. */

      #pragma acc loop seq
      for (i = 1; i <= 100; i++) {
          ik = kk / 2;
          if (2 * ik != kk) t3 = RANDLC(&t1, t2);
          if (ik == 0) break;
          t3 = RANDLC(&t2, t2);
          kk = ik;
      }

/*      Compute uniform pseudorandom numbers. */

      loc_t1 = r23 * A;
      loc_a1 = (int)loc_t1;
      loc_a2 = A - t23 * loc_a1;
      loc_x = t1;

      #pragma acc loop seq
      for (i = 1; i <= 2*NK; i++) {
          loc_t1 = r23 * loc_x;
          loc_x1 = (int)loc_t1;
          loc_x2 = loc_x - t23 * loc_x1;
          loc_t1 = loc_a1 * loc_x2 + loc_a2 * loc_x1;
          loc_t2 = (int)(r23 * loc_t1);
          loc_z = loc_t1 - t23 * loc_t2;
          loc_t3 = t23 * loc_z + loc_a2 * loc_x2;
          loc_t4 = (int)(r46 * loc_t3);
          loc_x = loc_t3 - t46 * loc_t4;
          xx[k-1][i-1] = r46 * loc_x;
      }
      t1 = loc_x;

/*
c       Compute Gaussian deviates by acceptance-rejection method and 
c       tally counts in concentric square annuli.  This loop is not 
c       vectorizable.
*/
 
      psx = psy = 0.0;

      #pragma acc loop reduction(+:psx,psy)
      for ( i = 0; i < NK; i++) {
          x1 = 2.0 * xx[k-1][2*i] - 1.0;
          x2 = 2.0 * xx[k-1][2*i+1] - 1.0;
          t1 = pow2(x1) + pow2(x2);
          if (t1 <= 1.0) {
            t2 = sqrt(-2.0 * log(t1) / t1);
            t3 = (x1 * t2);             /* Xi */
            t4 = (x2 * t2);             /* Yi */
            l = max(fabs(t3), fabs(t4));
            qq[k-1][l] += 1.0;                      /* counts */
            psx = psx + t3;  /* sum of Xi */
            psy = psy + t4;               /* sum of Yi */
          }
      }

      sx += psx;
      sy += psy;
      
    }
    
/*      Reduce private qq to q          */
    #pragma acc parallel loop reduction(+:gc)
    for ( i = 0; i < NQ; i++ ) {
      double sumq = 0.0;
      #pragma acc loop reduction(+:sumq)
      for (k = 0; k < np; k++)
          sumq = sumq + qq[k][i];
      q[i] = sumq;
      gc += sumq;
    }

} /* end acc data */

    timer_stop(1);
    tm = timer_read(1);

    nit = 0;
    if (M == 24) {
	if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) &&
	   (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 25) {
	if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 28) {
	if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 30) {
	if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 32) {
	if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    }

    Mops = pow(2.0, M+1)/tm/1000000.0;

    printf("EP Benchmark Results: \n"
	   "CPU Time = %10.4f\n"
	   "N = 2^%5d\n"
	   "No. Gaussian Pairs = %15.0f\n"
	   "Sums = %25.15e %25.15e\n"
	   "Counts:\n",
	   tm, M, gc, sx, sy);
    for (i = 0; i  <= NQ-1; i++) {
	printf("%3d %15.0f\n", i, q[i]);
    }
	  
    c_print_results("EP", CLASS, M+1, 0, 0, nit,
          tm, Mops, "Random numbers generated",
		  verified, NPBVERSION, COMPILETIME,
		  CS1, CS2, CS3, CS4, CS5, CS6, CS7);

    return 0;
}
Exemplo n.º 7
0
/*
 * Outlined body of a parallel region, in compiler-generated form (the
 * "# N "ep.c"" line markers refer back to the original source).
 *
 * __ompc_args holds six pointers, in this order: sx, sy, np, k_offset,
 * an, nthreads.  Each thread builds a private histogram qq[10] and
 * private partial sums over its share of the batch loop, hands the sums
 * to _ompc_reduction, adds its histogram into q inside a critical
 * section, and the master thread stores the thread count.
 */
static void
__ompc_func_3 (void **__ompc_args)
{
  auto double *_pp_sx;
  auto double *_pp_sy;
  auto int *_pp_np;
  auto int *_pp_k_offset;
  auto double *_pp_an;
  auto int *_pp_nthreads;
  auto double *_ppthd_x;
  /* Per-thread instance of x obtained from the runtime; 1048576 is the
     size passed for it (presumably bytes -- confirm with the runtime). */
  (_ppthd_x) = (((double *) (_ompc_get_thdprv (&_thdprv_x, 1048576, x))));
  /* Unpack the shared-variable pointers from the argument vector. */
  (_pp_sx) = (((double *) (*__ompc_args)));
  (_pp_sy) = (((double *) (*((__ompc_args) + (1)))));
  (_pp_np) = (((int *) (*((__ompc_args) + (2)))));
  (_pp_k_offset) = (((int *) (*((__ompc_args) + (3)))));
  (_pp_an) = (((double *) (*((__ompc_args) + (4)))));
  (_pp_nthreads) = (((int *) (*((__ompc_args) + (5)))));
  /* Copy x into this thread's instance. */
  _ompc_copyin_thdprv (_ppthd_x, x, 1048576);
  {
    auto double t1;
    auto double t2;
    auto double t3;
    auto double t4;
    auto double x1;
    auto double x2;
    auto int kk;
    auto int i;
    auto int ik;
    auto int l;
    /* Thread-private histogram, merged into q further down. */
    auto double qq[10];

# 150 "ep.c"
    for ((i) = (0); (i) < (10); (i)++)
      {
	(*((qq) + (i))) = (0.0);
      }
    {
      /* Private partial sums plus the loop bounds (_p_k_0 start,
         _p_k_1 end, _p_k_2 step) that _ompc_static_bsched adjusts for
         this thread. */
      auto double _p_sx;
      auto double _p_sy;
      auto int _p_k;
      auto int _p_k_0;
      auto int _p_k_1;
      auto int _p_k_2;
      (_p_sy) = (0.0);
      (_p_sx) = (0.0);
      (_p_k_0) = (1);
      (_p_k_1) = ((*_pp_np) + (1));
      (_p_k_2) = (1);
      _ompc_static_bsched (&_p_k_0, &_p_k_1, &_p_k_2);
# 153 "ep.c"
      for ((_p_k) = (_p_k_0); (_p_k) < (_p_k_1); (_p_k) += (_p_k_2))
	{

# 154 "ep.c"
	  (kk) = ((*_pp_k_offset) + (_p_k));
# 155 "ep.c"
	  (t1) = (2.71828183E8);
# 156 "ep.c"
	  (t2) = (*_pp_an);
	  /* Derive the starting seed t1 for this batch by walking the
	     binary digits of kk. */
# 160 "ep.c"
	  for ((i) = (1); (i) <= (100); (i)++)
	    {

# 161 "ep.c"
	      (ik) = ((kk) / (2));
# 162 "ep.c"
	      if (((2) * (ik)) != (kk))
		{
		  (t3) = (randlc (&(t1), t2));
		}
# 163 "ep.c"
	      if ((ik) == (0))
# 163 "ep.c"
		break;
# 164 "ep.c"
	      (t3) = (randlc (&(t2), t2));
# 165 "ep.c"
	      (kk) = (ik);
	    }
	  /* The (0) == (1) tests are constant false, so none of the
	     timer calls in this function execute. */
# 170 "ep.c"
	  if ((0) == (1))
	    {
	      timer_start (3);
	    }
	  /* Generate 2 * 65536 values; the destination pointer is moved
	     back one element before the call. */
# 171 "ep.c"
	  vranlc ((2) * ((1) << (16)), &(t1), 1.220703125E9,
		  (_ppthd_x) - (1));
# 172 "ep.c"
	  if ((0) == (1))
	    {
	      timer_stop (3);
	    }
# 179 "ep.c"
	  if ((0) == (1))
	    {
	      timer_start (2);
	    }
	  /* Acceptance-rejection over consecutive pairs of this thread's
	     x values. */
# 181 "ep.c"
	  for ((i) = (0); (i) < ((1) << (16)); (i)++)
	    {

# 182 "ep.c"
	      (x1) = (((2.0) * (*((_ppthd_x) + ((2) * (i))))) - (1.0));
# 183 "ep.c"
	      (x2) =
		(((2.0) * (*((_ppthd_x) + (((2) * (i)) + (1))))) - (1.0));
# 184 "ep.c"
	      (t1) = (((x1) * (x1)) + ((x2) * (x2)));
# 185 "ep.c"
	      if ((t1) <= (1.0))
		{

# 186 "ep.c"
		  (t2) = (sqrt (((-(2.0)) * (log (t1))) / (t1)));
# 187 "ep.c"
		  (t3) = ((x1) * (t2));
# 188 "ep.c"
		  (t4) = ((x2) * (t2));
		  /* Histogram bin: the larger of |t3| and |t4|,
		     truncated to int. */
# 189 "ep.c"
		  (l) =
		    (((int)
		      (((fabs (t3)) >
			(fabs (t4))) ? (fabs (t3)) : (fabs (t4)))));
# 190 "ep.c"
		  (*((qq) + (l))) += (1.0);
# 191 "ep.c"
		  (_p_sx) = ((_p_sx) + (t3));
# 192 "ep.c"
		  (_p_sy) = ((_p_sy) + (t4));
		}
	    }
# 195 "ep.c"
	  if ((0) == (1))
	    {
	      timer_stop (2);
	    }
	}
      /* Hand the private sums to the runtime reduction.
         NOTE(review): the 14 and 6 arguments are runtime codes not
         defined in this snippet -- confirm their meaning. */
      _ompc_reduction (&_p_sy, _pp_sy, 14, 6);
      _ompc_reduction (&_p_sx, _pp_sx, 14, 6);
      _ompc_barrier ();
    }
    {
      /* Add this thread's histogram into q between the runtime's
         enter/exit critical calls. */
      _ompc_enter_critical (&__ompc_lock_critical);
# 199 "ep.c"
      for ((i) = (0); (i) <= ((10) - (1)); (i)++)
	{
	  (*((q) + (i))) += (*((qq) + (i)));
	}
      _ompc_exit_critical (&__ompc_lock_critical);
    }
    /* Only the master thread records the thread count. */
    if (_ompc_is_master ())
      {
	(*_pp_nthreads) = (omp_get_num_threads ());
      }
  }
}
Exemplo n.º 8
0
/*
 * EP benchmark driver in compiler-generated form (the "# N "ep.c"" line
 * markers refer back to the original source; constants appear already
 * expanded, e.g. the problem-size exponent as the literal 28).
 *
 * Initialises the runtime, runs the warm-up calls, launches the outlined
 * parallel region __ompc_func_3, then sums the histogram q, checks sx/sy
 * against the reference values and prints the report.
 */
int
main (int argc, char **argv)
{
  //auto double *_ppthd_x;
  auto double Mops;
  auto double t1;
  auto double t2;
  auto double t3;
  auto double t4;
  auto double x1;
  auto double x2;
  auto double sx;
  auto double sy;
  auto double tm;
  auto double an;
  auto double tt;
  auto double gc;
  auto double dum[3];
  auto int np;
  auto int ierr;
  auto int node;
  auto int no_nodes;
  auto int i;
  auto int ik;
  auto int kk;
  auto int l;
  auto int k;
  auto int nit;
  auto int ierrcode;
  auto int no_large_nodes;
  auto int np_add;
  auto int k_offset;
  auto int j;
  auto int nthreads;
  auto int verified;
  auto char size[14];
 /* status is not referenced anywhere else in this function. */
 int status = 0;
  _ompc_init(argc,argv);

  //(_ppthd_x) = (((double *) (_ompc_get_thdprv (&_thdprv_x, 1048576, x))));
  (*(dum)) = (1.0);
  (*((dum) + (1))) = (1.0);
  (*((dum) + (2))) = (1.0);
  (nthreads) = (1);
# 84 "ep.c"
  printf
    ("\012\012 NAS Parallel Benchmarks 2.3 OpenMP C version - EP Benchmark\012");
# 86 "ep.c"
  sprintf (size, "%12.0f", pow (2.0, (28) + (1)));
  /* Replace any '.' (character code 46) with a space (32).
     NOTE(review): the scan starts at index 13, which sprintf does not
     appear to write for this value -- confirm. */
# 87 "ep.c"
  for ((j) = (13); (j) >= (1); (j)--)
    {

# 88 "ep.c"
      if ((((int) (*((size) + (j))))) == (46))
	{
	  (*((size) + (j))) = (((char) (32)));
	}
    }
# 90 "ep.c"
  printf (" Number of random numbers generated: %13s\012", size);
# 92 "ep.c"
  (verified) = (0);
  /* Number of batches: 2^(28-16). */
# 99 "ep.c"
  (np) = ((1) << ((28) - (16)));
  /* Warm-up calls to the generators and the math routines. */
# 107 "ep.c"
  vranlc (0, (dum) + (0), *((dum) + (1)), (dum) + (2));
# 108 "ep.c"
  (*((dum) + (0))) = (randlc ((dum) + (1), *((dum) + (2))));
# 109 "ep.c"
  for ((i) = (0); (i) < ((2) * ((1) << (16))); (i)++)
    {
      x[i] = (-(1.0E99));
      //(*((_ppthd_x) + (i))) = (-(1.0E99));
    }
# 110 "ep.c"
  (Mops) = (log (sqrt (fabs (((1.0) > (1.0)) ? (1.0) : (1.0)))));
# 112 "ep.c"
  timer_clear (1);
# 113 "ep.c"
  timer_clear (2);
# 114 "ep.c"
  timer_clear (3);
# 115 "ep.c"
  timer_start (1);
  /* NOTE(review): t1 has not been assigned when its address is passed
     here -- confirm vranlc does not read it when the count is 0. */
# 117 "ep.c"
  vranlc (0, &(t1), 1.220703125E9, x);
  //vranlc (0, &(t1), 1.220703125E9, _ppthd_x);
# 121 "ep.c"
  (t1) = (1.220703125E9);
  /* 17 calls of randlc(&t1, t1); the result left in t1 becomes an. */
# 123 "ep.c"
  for ((i) = (1); (i) <= ((16) + (1)); (i)++)
    {

# 124 "ep.c"
      (t2) = (randlc (&(t1), t1));
    }
# 127 "ep.c"
  (an) = (t1);
# 128 "ep.c"
  (tt) = (2.71828183E8);
# 129 "ep.c"
  (gc) = (0.0);
# 130 "ep.c"
  (sx) = (0.0);
# 131 "ep.c"
  (sy) = (0.0);
# 133 "ep.c"
  for ((i) = (0); (i) <= ((10) - (1)); (i)++)
    {

# 134 "ep.c"
      (*((q) + (i))) = (0.0);
    }
# 142 "ep.c"
  (k_offset) = (-(1));
  {
    /* Pack pointers to the shared variables in the order the outlined
       function unpacks them, then run it through the runtime. */
    auto void *__ompc_argv[6];
    (*(__ompc_argv)) = (((void *) (&sx)));
    (*((__ompc_argv) + (1))) = (((void *) (&sy)));
    (*((__ompc_argv) + (2))) = (((void *) (&np)));
    (*((__ompc_argv) + (3))) = (((void *) (&k_offset)));
    (*((__ompc_argv) + (4))) = (((void *) (&an)));
    (*((__ompc_argv) + (5))) = (((void *) (&nthreads)));
    _ompc_do_parallel (__ompc_func_3, __ompc_argv);
  }
  /* Total count over all histogram bins. */
# 207 "ep.c"
  for ((i) = (0); (i) <= ((10) - (1)); (i)++)
    {

# 208 "ep.c"
      (gc) = ((gc) + (*((q) + (i))));
    }
# 211 "ep.c"
  timer_stop (1);
# 212 "ep.c"
  (tm) = (timer_read (1));
# 214 "ep.c"
  (nit) = (0);
  /* Verification chain: the left operand of each test is the literal 28,
     so only the (28) == (28) branch can run. */
# 215 "ep.c"
  if ((28) == (24))
    {

# 216 "ep.c"
      if (((fabs (((sx) - (-(3247.83465203474))) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(6958.407078382297))) / (sy))) <= (1.0E-8)))
	{

# 218 "ep.c"
	  (verified) = (1);
	}
    }
  else
# 220 "ep.c"
  if ((28) == (25))
    {

# 221 "ep.c"
      if (((fabs (((sx) - (-(2863.319731645753))) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(6320.053679109499))) / (sy))) <= (1.0E-8)))
	{

# 223 "ep.c"
	  (verified) = (1);
	}
    }
  else
# 225 "ep.c"
  if ((28) == (28))
    {

# 226 "ep.c"
      if (((fabs (((sx) - (-(4295.875165629892))) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(15807.32573678431))) / (sy))) <= (1.0E-8)))
	{

# 228 "ep.c"
	  (verified) = (1);
          /* Hand-added debug output, printed only when verification
             succeeds. */
          printf("Debug:ompc_manual. 359, sx is:%f, sy is:%f\n",sx,sy);
       }
     }

  else
# 230 "ep.c"
  if ((28) == (30))
    {

# 231 "ep.c"
      if (((fabs (((sx) - (40338.15542441498)) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(26606.69192809235))) / (sy))) <= (1.0E-8)))
	{

# 233 "ep.c"
	  (verified) = (1);
	}
    }
  else
# 235 "ep.c"
  if ((28) == (32))
    {

# 236 "ep.c"
      if (((fabs (((sx) - (47643.67927995374)) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(80840.72988043731))) / (sy))) <= (1.0E-8)))
	{

# 238 "ep.c"
	  (verified) = (1);
	}
    }
# 242 "ep.c"
  (Mops) = (((pow (2.0, (28) + (1))) / (tm)) / (1000000.0));
# 244 "ep.c"
  printf
    ("EP Benchmark Results: \012CPU Time = %10.4f\012N = 2^%5d\012No. Gaussian Pairs = %15.0f\012Sums = %25.15e %25.15e\012Counts:\012",
     tm, 28, gc, sx, sy);
# 251 "ep.c"
  for ((i) = (0); (i) <= ((10) - (1)); (i)++)
    {

# 252 "ep.c"
      printf ("%3d %15.0f\012", i, *((q) + (i)));
    }
# 255 "ep.c"
  c_print_results ("EP", 65, (28) + (1), 0, 0, nit, nthreads, tm, Mops,
		   "Random numbers generated", verified, "2.3", "07 Aug 2006",
		   "omcc", "$(CC)", "(none)", "-I../common", "-t", "-lm",
		   "randdp");
  /* Constant-false test: the timer report below never prints. */
# 261 "ep.c"
  if ((0) == (1))
    {

# 262 "ep.c"
      printf ("Total time:     %f", timer_read (1));
# 263 "ep.c"
      printf ("Gaussian pairs: %f", timer_read (2));
# 264 "ep.c"
      printf ("Random numbers: %f", timer_read (3));
    }
}