Esempio n. 1
void	create_seq( int myId )
  double seed = 314159265.00;
  double a    = 1220703125.00;
	double x, mySeed;
	INT_TYPE i, k, chunk;
  int ini, fim;

  ini = chunk * myId;
  fim = ini + chunk;
  if ( fim > NUM_KEYS ) {
    fim = NUM_KEYS;

  mySeed = find_my_seed( myId, NUM_THREADS, (long)4*NUM_KEYS, seed, a );

  k = MAX_KEY/4;

	for (i = ini; i < fim; i++)
	    x = randlc(&mySeed, &a);
	    x += randlc(&mySeed, &a);
      x += randlc(&mySeed, &a);
	    x += randlc(&mySeed, &a);  
      key_array[i] = k*x;
Esempio n. 2
File: is.c Progetto: 8l/insieme
/*
 * Return the generator seed at which processor `kn` starts drawing.
 *
 * Each processor owns mq = ceil((nn/4) / np) groups of four random numbers,
 * so processor kn has to skip nq = 4 * mq * kn numbers.  The loop walks the
 * binary decomposition of nq: an even count squares the multiplier t2, an
 * odd count applies the current multiplier to the seed t1 once.
 * NOTE(review): this relies on randlc(&x, &a) replacing x by the next state
 * a*x of the generator; randlc is defined outside this section -- confirm.
 *
 * Rank 0 skips nothing and gets `s` back unchanged.
 */
double   find_my_seed( int kn,        /* my processor rank, 0<=kn<=num procs */
                       int np,        /* np = num procs                      */
                       long nn,       /* total num of ran numbers, all procs */
                       double s,      /* Ran num seed, for ex.: 314159265.00 */
                       double a )     /* Ran num gen mult, try 1220703125.00 */
{
      double t1,t2;
      long   mq,nq,kk,ik;

      if ( kn == 0 ) return s;

      mq = (nn/4 + np - 1) / np;
      nq = mq * 4 * kn;               /* number of rans to be skipped */

      t1 = s;
      t2 = a;
      kk = nq;
      while ( kk > 1 ) {
         ik = kk / 2;
         if( 2 * ik ==  kk ) {
            /* even: square the multiplier, halve the remaining count */
            (void)randlc( &t2, &t2 );
            kk = ik;
         }
         else {
            /* odd: advance the seed once with the current multiplier */
            (void)randlc( &t1, &t2 );
            kk = kk - 1;
         }
      }
      /* one multiplication is still pending when kk has reached 1 */
      (void)randlc( &t1, &t2 );

      return( t1 );
}

Esempio n. 3
// Compute a^exponent mod 2^46.
//
// The multiplications are delegated to randlc(&x, y); its return value is
// not needed here, only the updated first argument.
// NOTE(review): randlc is defined elsewhere; the "mod 2^46" part of the
// contract comes from it -- confirm.
//
// An exponent of 0 yields 1.
static double ipow46(double a, int exponent)
{
  double q, r;
  int n, n2;

  if (exponent == 0) return 1;

  // Use
  //   a^n = a^(n/2)*a^(n/2) if n even else
  //   a^n = a*a^(n-1)       if n odd
  q = a;
  r = 1;
  n = exponent;

  while (n > 1) {
    n2 = n / 2;
    if (n2 * 2 == n) {
      (void)randlc(&q, q);    // q <- q*q
      n = n2;
    } else {
      (void)randlc(&r, q);    // r <- r*q
      n = n-1;
    }
  }
  (void)randlc(&r, q);        // fold in the last remaining factor
  return r;
}
Esempio n. 4
// generate a sparse n-vector (v, iv)
// having nzv nonzeros
// mark(i) is set to 1 if position i is nonzero.
// mark is all zero on entry and is reset to all zero before exit
// this corrects a performance bug found by John G. Lewis, caused by
// reinitialization of mark on every one of the n calls to sprnvc
static void sprnvc(int n, int nz, int nn1, double v[], int iv[])
  int nzv, ii, i;
  double vecelt, vecloc;

  nzv = 0;

  while (nzv < nz) {
    vecelt = randlc(&tran, amult);

    // generate an integer between 1 and n in a portable manner
    vecloc = randlc(&tran, amult);
    i = icnvrt(vecloc, nn1) + 1;
    if (i > n) continue;

    // was this integer generated already?
    logical was_gen = false;
    for (ii = 0; ii < nzv; ii++) {
      if (iv[ii] == i) {
        was_gen = true;
    if (was_gen) continue;
    v[nzv] = vecelt;
    iv[nzv] = i;
    nzv = nzv + 1;
/*
 * power raises an integer, disguised as a double precision real, to an
 * integer power.
 *
 * The exponent is consumed bit by bit from the least significant end: when
 * the current bit is set the running result is multiplied by the current
 * power of `a`, and that power is then squared.  All multiplications go
 * through randlc(&x, y), whose return value is not needed here.
 *
 * Returns 1.0 for n == 0.
 */
static double power( double a, int n ) {
    double aj;
    int nj;
    double result;

    result = 1.0;
    nj = n;
    aj = a;

    while (nj != 0) {
	if( (nj%2) == 1 ) (void)randlc( &result, aj );
	(void)randlc( &aj, aj );
	nj = nj/2;
    }
    return (result);
}
Esempio n. 6
File: cg.c Progetto: 8l/rose
c       generate a sparse n-vector (v, iv)
c       having nzv nonzeros
c       mark(i) is set to 1 if position i is nonzero.
c       mark is all zero on entry and is reset to all zero before exit
c       this corrects a performance bug found by John G. Lewis, caused by
c       reinitialization of mark on every one of the n calls to sprnvc
static void sprnvc(
    int n,
    int nz,
    double v[],		/* v[1:*] */
    int iv[],		/* iv[1:*] */
    int nzloc[],	/* nzloc[1:n] */
    int mark[] ) 	/* mark[1:n] */
    int nn1;
    int nzrow, nzv, ii, i;
    double vecelt, vecloc;

    nzv = 0;
    nzrow = 0;
    nn1 = 1;
    do {
	nn1 = 2 * nn1;
    } while (nn1 < n);

c    nn1 is the smallest power of two not less than n

    while (nzv < nz) {
	vecelt = randlc(&tran, amult);

c   generate an integer between 1 and n in a portable manner
	vecloc = randlc(&tran, amult);
	i = icnvrt(vecloc, nn1) + 1;
	if (i > n) continue;

c  was this integer generated already?
	if (mark[i] == 0) {
	    mark[i] = 1;
	    nzrow = nzrow + 1;
	    nzloc[nzrow] = i;
	    nzv = nzv + 1;
	    v[nzv] = vecelt;
	    iv[nzv] = i;

    for (ii = 1; ii <= nzrow; ii++) {
	i = nzloc[ii];
	mark[i] = 0;
Esempio n. 7
double Xi1double(double x, int N = 100) // default argument: N = 100
  cout << setiosflags(ios::uppercase);

  //  int N = 100;
  double xi,xk,a;
  //  double b[100],c[100];
  static double pi = 4.0 * atan(1.0);
  double T = 30.0;
  double f = 2.0 * pi/T;
  double* const b = new double [N];
  double* const c = new double [N];
  int j; 

  xk = 1.0;
  double x1 = 3.0;
  a = 3.0;
  xi = 0.0;

  for(int i = 0; i < N; i++)
      j = N  - i - 1 ;
      b[j] = randlc(xk,a);
      c[j] = randlc(x1,a);
  for(i = 0; i < N; i++)
      xi += b[i] * cos(i*f*x) + c[i] * sin(i*f*x);
      if(fabs(xi) > 10.0)
	{ xi = xi/100.0;}
      else { xi = xi;} 

  if(fabs(xi) > 1.0)
    {xi = xi/10.0;}
    { xi = xi ;}
  delete [] b;
  delete [] c;
  return xi;
Esempio n. 8
void	create_seq( double seed, double a )
    double x;
    int    i, k;

    k = MAX_KEY/4;

    for (i=0; i<NUM_KEYS; i++)
        x = randlc(&seed, &a);
        x += randlc(&seed, &a);
        x += randlc(&seed, &a);
        x += randlc(&seed, &a);

        key_array[i] = k*x;
Esempio n. 9
/*
 * Return the generator seed at which processor `kn` starts drawing.
 *
 * Steps:
 *   1. mq = floor(log2(nn / np)), the log2 of the per-processor share.
 *   2. an = a squared mq times, i.e. the multiplier that jumps over one
 *      whole share at once.
 *   3. apply `an` kn times to the seed `s`, walking the bits of kn from the
 *      least significant end (multiply on a set bit, then square).
 *
 * NOTE(review): this relies on randlc(&x, &y) replacing x by the product
 * x*y of the generator; randlc is defined outside this section -- confirm.
 */
double   find_my_seed( int  kn,       /* my processor rank, 0<=kn<=num procs */
                       int  np,       /* np = num procs                      */
                       long nn,       /* total num of ran numbers, all procs */
                       double s,      /* Ran num seed, for ex.: 314159265.00 */
                       double a )     /* Ran num gen mult, try 1220703125.00 */
{
    long   i;

    double t1,t2,an;
    long   mq,nq,kk,ik;

    nq = nn / np;

    /* count the halvings only; the loop deliberately has an empty body */
    for( mq=0; nq>1; mq++,nq/=2 )
        ;

    t1 = a;

    for( i=1; i<=mq; i++ )
        (void)randlc( &t1, &t1 );

    an = t1;

    kk = kn;
    t1 = s;
    t2 = an;

    for( i=1; i<=100; i++ )
    {
        ik = kk / 2;
        if( 2 * ik !=  kk )
            (void)randlc( &t1, &t2 );
        if( ik == 0 )
            break;                      /* every bit of kn has been used */
        (void)randlc( &t2, &t2 );
        kk = ik;
    }

    return( t1 );
}

Esempio n. 10
void test04 ( void )

/******************************************************************************/
/*
  Purpose:

    TEST04 compares RANDLC_JUMP with repeated calls to RANDLC.

  Discussion:

    For K = 1, 2, 4, ..., 512 the K-th value is obtained once by calling
    RANDLC K times from a fixed seed and once by a single call to
    RANDLC_JUMP with the same seed; both values are printed side by side.

  Licensing:

    This code is distributed under the GNU LGPL license. 

  Modified:

    12 March 2010

  Author:

    John Burkardt
*/
{
  int i;
  int k;
  int klog;
  double seed;
  double x1;
  double x2;

  printf ( "\n" );
  printf ( "RANDLC_TEST04\n" );
  printf ( "  RANDLC_JUMP jumps directly to the K-th value\n" );
  printf ( "  returned by RANDLC.\n" );
  printf ( "\n" );
  printf ( "         K X(hard way)     X(jump)\n" );
  printf ( "\n" );

  k = 1;

  for ( klog = 1; klog <= 10; klog++ )
  {
    seed = 123456789.0;
    for ( i = 1; i <= k; i++ )
    {
      x1 = randlc ( &seed );
    }

    seed = 123456789.0;
    x2 = randlc_jump ( seed, k );

    printf ( "  %8d  %10f  %10f\n", k, x1, x2 );

    k = k * 2;
  }

  return;
}

Esempio n. 11
File: is.c Progetto: 8l/insieme
void	create_seq( double seed, double a )
  double x, s;
  INT_TYPE i, k;

#pragma omp parallel private(x,s,i,k)
    INT_TYPE k1, k2;
    double an = a;
    int myid, num_procs;
    INT_TYPE mq;

#ifdef _OPENMP
    myid = omp_get_thread_num();
    num_procs = omp_get_num_threads();
    myid = 0;
    num_procs = 1;

    mq = (NUM_KEYS + num_procs - 1) / num_procs;
    k1 = mq * myid;
    k2 = k1 + mq;
    if ( k2 > NUM_KEYS ) k2 = NUM_KEYS;

    KS = 0;
    s = find_my_seed( myid, num_procs,
        (long)4*NUM_KEYS, seed, an );

    k = MAX_KEY/4;

    for (i=k1; i<k2; i++)
      x = randlc(&s, &an);
      x += randlc(&s, &an);
      x += randlc(&s, &an);
      x += randlc(&s, &an);

      key_array[i] = k*x;
  } /*omp parallel*/
Esempio n. 12
/*
 * Fill in array u0 with initial conditions from the random number
 * generator.
 *
 * The array is filled one z-plane at a time.  `start` is the generator
 * state at the beginning of the current plane; each plane draws
 * 2*NX*dims[0][1] numbers into the scratch buffer `tmp` (used from index 1)
 * and stores them pairwise as (real, imag), after which `start` is advanced
 * by a whole plane (2*NX*NY numbers) using the precomputed multiplier `an`.
 *
 * The parameter `d` is not used; the extents are read from the file-level
 * `dims` array.
 */
static void compute_initial_conditions(dcomplex u0[NZ][NY][NX], int d[3]) {
    int k;
    double x0, start, an, dummy;
    static double tmp[NX*2*MAXDIM+1];
    int i,j,t;

    start = SEED;

    /* Jump to the starting element for our first plane. */
    ipow46(A, (zstart[0]-1)*2*NX*NY + (ystart[0]-1)*2*NX, &an);
    dummy = randlc(&start, an);
    ipow46(A, 2*NX*NY, &an);

    /* Go through by z planes filling in one square at a time. */
    for (k = 0; k < dims[0][2]; k++) {
	x0 = start;
        vranlc(2*NX*dims[0][1], &x0, A, tmp);
	t = 1;
	for (j = 0; j < dims[0][1]; j++) {
	  for (i = 0; i < NX; i++) {
	    u0[k][j][i].real = tmp[t++];
	    u0[k][j][i].imag = tmp[t++];
	  }
	}
	/* NOTE(review): this condition is always true inside the loop, so
	   the state is also advanced after the last plane; kept as is. */
        if (k != dims[0][2]) dummy = randlc(&start, an);
    }
}
Esempio n. 13
/*
 * Compute a feature count for node `id`.
 *
 * A random denominator is drawn from randlc() with the multiplier 2*id+1,
 * and ipowMod() turns it into the value `rtfs`, which is made non-negative.
 *
 * NOTE(review): as the code stands `len` is initialised to 0 and never
 * reassigned, so the function returns 0 for every input, and `rtfs`,
 * `mean` and `mbname` have no effect on the result.  This looks like lost
 * statements -- compare with the upstream source before relying on it.
 */
int GetFeatureNum(char *mbname,int id){
  double tran=314159265.0;
  double A=2*id+1;
  double denom=randlc(&tran,&A);
  char cval='S';
  int mean=NUM_SAMPLES,stdev=128;
  int rtfs=0,len=0;
  rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
  if(rtfs<0) rtfs=-rtfs;
  return len;
}
Esempio n. 14
/*
 * compute a^exponent mod 2^46
 *
 * The value is stored through `result`; an exponent of 0 stores 1.  The
 * multiplications are delegated to randlc(&x, y), whose return value is not
 * needed here.
 * NOTE(review): randlc is defined elsewhere; the "mod 2^46" part of the
 * contract comes from it -- confirm.
 */
static void ipow46(double a, int exponent, double *result) {
    double q, r;
    int n, n2;

    /*
     * Use
     *   a^n = a^(n/2)*a^(n/2) if n even else
     *   a^n = a*a^(n-1)       if n odd
     */
    *result = 1;
    if (exponent == 0) return;
    q = a;
    r = 1;
    n = exponent;

    while (n > 1) {
	n2 = n/2;
	if (n2 * 2 == n) {
            (void)randlc(&q, q);	/* q <- q*q */
            n = n2;
	} else {
            (void)randlc(&r, q);	/* r <- r*q */
            n = n-1;
	}
    }
    (void)randlc(&r, q);		/* fold in the last remaining factor */
    *result = r;
}
Esempio n. 15
void test01 ( void )

/******************************************************************************/
/*
  Purpose:

    TEST01 tests RANDLC.

  Discussion:

    Starting from a fixed seed, ten successive values returned by RANDLC
    are printed.

  Licensing:

    This code is distributed under the GNU LGPL license. 

  Modified:

    08 March 2010

  Author:

    John Burkardt
*/
{
  int i;
  double seed;
  double seed_init = 123456789.0;

  printf ( "\n" );
  printf ( "TEST01\n" );
  printf ( "  RANDLC computes pseudorandom values \n" );
  printf ( "  in the interval [0,1].\n" );

  seed = seed_init;

  printf ( "\n" );
  printf ( "  The initial seed is %14.0f\n", seed_init );
  printf ( "\n" );
  printf ( "         I          RANDLC\n" );
  printf ( "\n" );

  for ( i = 1; i <= 10; i++ )
  {
    printf ( "  %8d  %14f\n", i, randlc ( &seed ) );
  }

  return;
}

Esempio n. 16
// zran3: fill the grid z (n1 x n2 x n3, flattened as i1 + n1*(i2 + n2*i3))
// with random numbers, select MM extreme candidates of each sign, and mark
// the chosen positions with -1.0 and +1.0.
//
// NOTE(review): this listing has visibly lost lines -- closing braces, the
// #endif lines matching the "#if 0" directives, the delimiters of the
// Fortran-style "c ..." remarks and the end of the block comment opened in
// front of the debug printf section.  The comments added here describe only
// what the remaining statements show; restore the missing lines from the
// upstream source before building.
static void zran3(double *z, int n1, int n2, int n3, int nx, int ny, int k) {


c     zran3  loads +1 at ten randomly chosen points,
c     loads -1 at a different ten random points,
c     and zero elsewhere.

#define MM	10
#define	A	pow(5.0,13)
#define	X	314159265.e0    
    int i0, m0, m1;
    int i1, i2, i3, d1, e1, e2, e3;
    double xx, x0, x1, a1, a2, ai;

    double ten[MM][2], best;
    int i, j1[MM][2], j2[MM][2], j3[MM][2];
    int jg[4][MM][2];

    double rdummy;

    // multipliers used to advance the generator state by nx resp. nx*ny steps
    a1 = power( A, nx );
    a2 = power( A, nx*ny );

#if 0
#pragma omp parallel
#pragma omp parallel for private(i2, i1)    
  for (i3 = 0;i3 < n3; i3++) {
    for (i2 = 0; i2 < n2; i2++) {
      for (i1 = 0; i1 < n1; i1++) {
	int i123 = i1 + n1*(i2 + n2*i3);
	z[i123] = 0.0;

    // is1..is3 / ie1..ie3 are not declared in this function
    // (presumably file-level grid bounds -- TODO confirm)
    i = is1-1+nx*(is2-1+ny*(is3-1));

    ai = power( A, i );
    d1 = ie1 - is1 + 1;
    e1 = ie1 - is1 + 2;
    e2 = ie2 - is2 + 2;
    e3 = ie3 - is3 + 2;
    x0 = X;
    rdummy = randlc( &x0, ai );
    // fill d1 values per (i2, i3) row; x1 is stepped with a1 and x0 with a2
    for (i3 = 1; i3 < e3; i3++) {
	x1 = x0;
	for (i2 = 1; i2 < e2; i2++) {
            xx = x1;
            vranlc( d1, &xx, A, &(z[0+n1*(i2 + n2*i3)]));
            rdummy = randlc( &x1, a1 );
	rdummy = randlc( &x0, a2 );

c       call comm3(z,n1,n2,n3)
c       call showall(z,n1,n2,n3)

c     each processor looks for twenty candidates
    // column 1 of ten/j1/j2/j3 tracks large values, column 0 small values
    for (i = 0; i < MM; i++) {
	ten[i][1] = 0.0;
	j1[i][1] = 0;
	j2[i][1] = 0;
	j3[i][1] = 0;
	ten[i][0] = 1.0;
	j1[i][0] = 0;
	j2[i][0] = 0;
	j3[i][0] = 0;
    // scan the interior points; a value beating slot 0 replaces it and
    // bubble() is called on the candidate lists
    for (i3 = 1; i3 < n3-1; i3++) {
	for (i2 = 1; i2 < n2-1; i2++) {
            for (i1 = 1; i1 < n1-1; i1++) {
	      int i123 = i1 + n1*(i2 + n2*i3);
		if ( z[i123] > ten[0][1] ) {
		    ten[0][1] = z[i123];
		    j1[0][1] = i1;
		    j2[0][1] = i2;
		    j3[0][1] = i3;
		    bubble( ten, j1, j2, j3, MM, 1 );
		if ( z[i123] < ten[0][0] ) {
		    ten[0][0] = z[i123];
		    j1[0][0] = i1;
		    j2[0][0] = i2;
		    j3[0][0] = i3;
		    bubble( ten, j1, j2, j3, MM, 0 );

c     Now which of these are globally best?
    // NOTE(review): "best" is assigned from z[j123] immediately before being
    // compared with z[j123], so the test below can only fail for NaN
    i1 = MM - 1;
    i0 = MM - 1;
    for (i = MM - 1 ; i >= 0; i--) {
      int j123 = j1[i1][1] + n1*(j2[i1][1] + n2*j3[i1][1]);
	best = z[j123];
	if (best == z[j123]) {
            jg[0][i][1] = 0;
            jg[1][i][1] = is1 - 1 + j1[i1][1];
            jg[2][i][1] = is2 - 1 + j2[i1][1];
            jg[3][i][1] = is3 - 1 + j3[i1][1];
            i1 = i1-1;
	} else {
            jg[0][i][1] = 0;
            jg[1][i][1] = 0;
            jg[2][i][1] = 0;
            jg[3][i][1] = 0;
	ten[i][1] = best;
      j123 = j1[i0][0] + n1*(j2[i0][0] + n2*j3[i0][0]);
	best = z[j123];
	if (best == z[j123]) {
            jg[0][i][0] = 0;
            jg[1][i][0] = is1 - 1 + j1[i0][0];
            jg[2][i][0] = is2 - 1 + j2[i0][0];
            jg[3][i][0] = is3 - 1 + j3[i0][0];
            i0 = i0-1;
	} else {
            jg[0][i][0] = 0;
            jg[1][i][0] = 0;
            jg[2][i][0] = 0;
            jg[3][i][0] = 0;
	ten[i][0] = best;
    // m1 / m0 are the lower loop bounds of the two marking loops further down
    m1 = i1+1;
    m0 = i0+1;

/*    printf(" negative charges at");
    for (i = 0; i < MM; i++) {
	if (i%5 == 0) printf("\n");
	printf(" (%3d,%3d,%3d)", jg[1][i][0], jg[2][i][0], jg[3][i][0]);
    printf("\n positive charges at");
    for (i = 0; i < MM; i++) {
	if (i%5 == 0) printf("\n");
	printf(" (%3d,%3d,%3d)", jg[1][i][1], jg[2][i][1], jg[3][i][1]);
    printf("\n small random numbers were\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %15.8e", ten[i][0]);
    printf("\n and they were found on processor number\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %4d", jg[0][i][0]);
    printf("\n large random numbers were\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %15.8e", ten[i][1]);
    printf("\n and they were found on processor number\n");
    for (i = MM-1; i >= 0; i--) {
	printf(" %4d", jg[0][i][1]);

#if 0
#pragma omp parallel for private(i2, i1)    
for (i3 = 0; i3 < n3; i3++) {
  for (i2 = 0; i2 < n2; i2++) {
    for (i1 = 0; i1 < n1; i1++) {
      int i123 = i1 + n1*(i2+n2*i3);
      z[i123] = 0.0;
#pragma omp parallel

#pragma acc parallel present(z[0:n3*n2*n1]) copyin(jg)
#pragma acc loop
    for (i = MM-1; i >= m0; i--) {
      int j123 = j1[i][0] + n1*(j2[i][0] + n2*j3[i][0]);
	z[j123] = -1.0;
#pragma acc loop
    for (i = MM-1; i >= m1; i--) {
      int j123 = j1[i][1] + n1*(j2[i][1] + n2*j3[i][1]);
	z[j123] = 1.0;
} // end acc parallel                                                         
#pragma omp parallel    

c          call showall(z,n1,n2,n3)
Esempio n. 17
// Driver of the EP benchmark, OpenCL version: sets up OpenCL, computes the
// jump multiplier AN, launches the kernel, reads back the per-group partial
// results, reduces them on the host and compares the sums against the
// reference values for the problem size M.
//
// NOTE(review): this listing has visibly lost lines -- closing braces, the
// kernel and work-size arguments of clEnqueueNDRangeKernel, and the
// declarations of several names it uses (np, q, err_code, gq_size, ...;
// possibly file-level).  The comments added here describe only what the
// remaining statements show; restore the missing lines from the upstream
// source before building.
int main(int argc, char *argv[]) 
  double Mops, t1, t2;
  double tsx, tsy, tm, an, tt, gc;
  double sx_verify_value, sy_verify_value, sx_err, sy_err;
  int    i, nit;
  int    k_offset, j;
  logical verified;

  char   size[16];

  FILE *fp;

  // a command-line argument (the kernel directory) is expected
  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);

  // timers are enabled when a file named "timer.flag" can be opened
  if ((fp = fopen("timer.flag", "r")) == NULL) {
    timers_enabled = false;
  } else {
    timers_enabled = true;

  //  Because the size of the problem is too large to store in a 32-bit
  //  integer for some classes, we put it into a string (for printing).
  //  Have to strip off the decimal point put in there by the floating
  //  point print statement (internal file)

  sprintf(size, "%15.0lf", pow(2.0, M+1));
  j = 14;
  if (size[j] == '.') j--;
  size[j+1] = '\0';
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n");
  printf("\n Number of random numbers generated: %15s\n", size);

  verified = false;

  //  Compute the number of "batches" of random number pairs generated 
  //  per processor. Adjust if the number of processors does not evenly 
  //  divide the total number

  np = NN; 

  setup_opencl(argc, argv);


  //  Compute AN = A ^ (2 * NK) (mod 2^46).

  t1 = A;

  // MK+1 successive squarings of t1 through randlc
  for (i = 0; i < MK + 1; i++) {
    t2 = randlc(&t1, t1);

  an = t1;
  tt = S;

  //  Each instance of this loop may be performed independently. We compute
  //  the k offsets separately to take into account the fact that some nodes
  //  have more numbers to generate than others

  k_offset = -1;


  // Launch the kernel
  // arguments 0-2 are passed with a size and a NULL pointer, arguments 3-5
  // are buffer objects, 6 and 7 are scalars
  int q_size  = GROUP_SIZE * NQ * sizeof(cl_double);
  int sx_size = GROUP_SIZE * sizeof(cl_double);
  int sy_size = GROUP_SIZE * sizeof(cl_double);
  err_code  = clSetKernelArg(kernel, 0, q_size, NULL);
  err_code |= clSetKernelArg(kernel, 1, sx_size, NULL);
  err_code |= clSetKernelArg(kernel, 2, sy_size, NULL);
  err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq);
  err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx);
  err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy);
  err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset);
  err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an);
  clu_CheckError(err_code, "clSetKernelArg()");
  size_t localWorkSize[] = { GROUP_SIZE };
  size_t globalWorkSize[] = { np };
  // NOTE(review): only seven arguments are present in this call;
  // globalWorkSize / localWorkSize are never passed anywhere
  err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL,
                                    0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueNDRangeKernel()");

  // host-side buffers for the per-group results
  // NOTE(review): the malloc results are not checked, and no free() is
  // visible in this listing
  double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size);
  double *gsx = (double*)malloc(gsx_size);
  double *gsy = (double*)malloc(gsy_size);

  gc  = 0.0;
  tsx = 0.0;
  tsy = 0.0;

  for (i = 0; i < NQ; i++) {
    q[i] = 0.0;

  // 9. Get the result
  // the first two reads are non-blocking (CL_FALSE), the third one is
  // blocking (CL_TRUE)
  err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, 
                                 gq, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, 
                                 gsx, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, 
                                 gsy, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  // host-side reduction over np/localWorkSize[0] work-groups
  for (i = 0; i < np/localWorkSize[0]; i++) {
    for (j = 0; j < NQ; j++ ){
      q[j] = q[j] + gq[i][j];
    tsx = tsx + gsx[i];
    tsy = tsy + gsy[i];

  for (i = 0; i < NQ; i++) {
    gc = gc + q[i];

  tm = timer_read(0);

  // reference sums per problem size M; an unknown M cannot be verified
  nit = 0;
  verified = true;
  if (M == 24) {
    sx_verify_value = -3.247834652034740e+3;
    sy_verify_value = -6.958407078382297e+3;
  } else if (M == 25) {
    sx_verify_value = -2.863319731645753e+3;
    sy_verify_value = -6.320053679109499e+3;
  } else if (M == 28) {
    sx_verify_value = -4.295875165629892e+3;
    sy_verify_value = -1.580732573678431e+4;
  } else if (M == 30) {
    sx_verify_value =  4.033815542441498e+4;
    sy_verify_value = -2.660669192809235e+4;
  } else if (M == 32) {
    sx_verify_value =  4.764367927995374e+4;
    sy_verify_value = -8.084072988043731e+4;
  } else if (M == 36) {
    sx_verify_value =  1.982481200946593e+5;
    sy_verify_value = -1.020596636361769e+5;
  } else if (M == 40) {
    sx_verify_value = -5.319717441530e+05;
    sy_verify_value = -3.688834557731e+05;
  } else {
    verified = false;

  // relative error of both sums must not exceed EPSILON
  if (verified) {
    sx_err = fabs((tsx - sx_verify_value) / sx_verify_value);
    sy_err = fabs((tsy - sy_verify_value) / sy_verify_value);
    verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON));

  Mops = pow(2.0, M+1) / tm / 1000000.0;

  printf("\nEP Benchmark Results:\n\n");
  printf("CPU Time =%10.4lf\n", tm);
  printf("N = 2^%5d\n", M);
  printf("No. Gaussian Pairs = %15.0lf\n", gc);
  printf("Sums = %25.15lE %25.15lE\n", tsx, tsy);
  printf("Counts: \n");
  for (i = 0; i < NQ; i++) {
    printf("%3d%15.0lf\n", i, q[i]);

  c_print_results("EP", CLASS, M+1, 0, 0, nit,
      tm, Mops, 
      "Random numbers generated",
      CS1, CS2, CS3, CS4, CS5, CS6, CS7,
      clu_GetDeviceTypeName(device_type), device_name);

  if (timers_enabled) {
    if (tm <= 0.0) tm = 1.0;
    tt = timer_read(0);
    printf("\nTotal time:     %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);



  return 0;
Esempio n. 18
void test02 ( void )

/******************************************************************************/
/*
  Purpose:

    TEST02 tests RANDLC;

  Discussion:

    The first ten values are printed together with the seed before and
    after each call.  Then N further values are generated and their sample
    mean and sample variance are printed next to the values 1/2 and 1/12
    expected for a uniform distribution on [0,1].

  Licensing:

    This code is distributed under the GNU LGPL license. 

  Modified:

    08 March 2010

  Author:

    John Burkardt
*/
{
# define N 1000

  int i;
  double seed;
  double seed_in;
  double seed_out;
  double u[N];
  double u_avg;
  double u_var;

  printf ( "\n" );
  printf ( "TEST02\n" );
  printf ( "  RANDLC computes a sequence of uniformly distributed\n" );
  printf ( "  pseudorandom numbers.\n" );

  seed = 123456789.0;

  printf ( "\n" );
  printf ( "  Initial SEED = %14.0f\n", seed );

  printf ( "\n" );
  printf ( "  First 10 values:\n" );
  printf ( "\n" );
  printf ( "       I           Input          Output      RANDLC\n" );
  printf ( "                    SEED            SEED\n" );
  printf ( "\n" );

  for ( i = 0; i < 10; i++ )
  {
    seed_in = seed;
    u[i] = randlc ( &seed );
    seed_out = seed;
    printf ( "  %6d  %14.0f  %14.0f  %10f\n", i + 1, seed_in, seed_out, u[i] );
  }

  printf ( "\n" );
  printf ( "  Now call RANDLC %d times.\n", N );

  /* The generator simply continues from the state left by the loop above. */
  u_avg = 0.0;
  for ( i = 0; i < N; i++ )
  {
    u[i] = randlc ( &seed );
    u_avg = u_avg + u[i];
  }

  u_avg = u_avg / ( ( double ) N );

  u_var = 0.0;
  for ( i = 0; i < N; i++ )
  {
    u_var = u_var + ( u[i] - u_avg ) * ( u[i] - u_avg );
  }
  u_var = u_var / ( ( double ) ( N - 1 ) );

  printf ( "\n" );
  printf ( "  Average value = %f\n", u_avg );
  printf ( "  Expecting       %f\n", 0.5 );

  printf ( "\n" );
  printf ( "  Variance =      %f\n", u_var );
  printf ( "  Expecting       %f\n", 1.0 / 12.0 );

  return;
# undef N
}
Esempio n. 19
/*
 * This is the serial version of the APP Benchmark 1,
 * the "embarrassingly parallel" benchmark.
 *
 * M is the Log_2 of the number of complex pairs of uniform (0, 1) random
 * numbers.  MK is the Log_2 of the size of each batch of uniform random
 * numbers.  MK can be set for convenience on a given system, since it does
 * not affect the results.
 */
/*
 * Driver of the EP benchmark, OpenMP version: warms up the generator and
 * math routines, computes the jump multiplier AN, generates batches of
 * random pairs in a parallel loop, accumulates the Gaussian deviates and
 * their counts, and compares the sums against the reference values for the
 * problem size M.
 *
 * NOTE(review): this listing has visibly lost lines -- closing braces, the
 * delimiters of the Fortran-style "c ..." remarks, the opening brace of the
 * parallel region and the end of a printf format string.  The comments
 * added here describe only what the remaining statements show; restore the
 * missing lines from the upstream source before building.
 */
int main(int argc, char **argv) {

    double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc;
    double dum[3] = { 1.0, 1.0, 1.0 };
    int np, ierr, node, no_nodes, i, ik, kk, l, k, nit, ierrcode,
	no_large_nodes, np_add, k_offset, j;
    int nthreads = 1;
    boolean verified;
    char size[13+1];	/* character*13 */

c   Because the size of the problem is too large to store in a 32-bit
c   integer for some classes, we put it into a string (for printing).
c   Have to strip off the decimal point put in there by the floating
c   point print statement (internal file)

    printf("\n\n NAS Parallel Benchmarks 3.0 structured OpenMP C version"
	   " - EP Benchmark\n");
    sprintf(size, "%12.0f", pow(2.0, M+1));
    /* NOTE(review): the scan starts at index 13, which lies behind the
       string terminator when sprintf writes exactly 12 characters */
    for (j = 13; j >= 1; j--) {
	if (size[j] == '.') size[j] = ' ';
    printf(" Number of random numbers generated: %13s\n", size);

    verified = FALSE;

c   Compute the number of "batches" of random number pairs generated 
c   per processor. Adjust if the number of processors does not evenly 
c   divide the total number
    np = NN;

c   Call the random number generator functions and initialize
c   the x-array to reduce the effects of paging on the timings.
c   Also, call all mathematical functions that are used. Make
c   sure these initializations cannot be eliminated as dead code.
    vranlc(0, &(dum[0]), dum[1], &(dum[2]));
    dum[0] = randlc(&(dum[1]), dum[2]);
#pragma omp parallel for default(shared) private(i)
    for (i = 0; i < 2*NK; i++) x[i] = -1.0e99;
    Mops = log(sqrt(fabs(max(1.0, 1.0))));


    /* NOTE(review): t1 has not been assigned yet at this call; the count
       argument is 0 */
    vranlc(0, &t1, A, x);

/*   Compute AN = A ^ (2 * NK) (mod 2^46). */

    t1 = A;

    /* MK+1 successive squarings of t1 through randlc */
    for ( i = 1; i <= MK+1; i++) {
	t2 = randlc(&t1, t1);

    an = t1;
    tt = S;
    gc = 0.0;
    sx = 0.0;
    sy = 0.0;

    for ( i = 0; i <= NQ - 1; i++) {
	q[i] = 0.0;
c   Each instance of this loop may be performed independently. We compute
c   the k offsets separately to take into account the fact that some nodes
c   have more numbers to generate than others
    k_offset = -1;

#pragma omp parallel copyin(x)
    /* these declarations shadow the function-level variables of the same
       names inside the parallel region */
    double t1, t2, t3, t4, x1, x2;
    int kk, i, ik, l;
    double qq[NQ];		/* private copy of q[0:NQ-1] */

    for (i = 0; i < NQ; i++) qq[i] = 0.0;

#pragma omp for reduction(+:sx,sy) schedule(static)  
    for (k = 1; k <= np; k++) {
	kk = k_offset + k;
	t1 = S;
	t2 = an;

/*      Find starting seed t1 for this kk. */

	/* walks the bits of kk: multiply t1 by t2 on an odd value, then
	   square t2 and halve kk */
	for (i = 1; i <= 100; i++) {
            ik = kk / 2;
            if (2 * ik != kk) t3 = randlc(&t1, t2);
            if (ik == 0) break;
            t3 = randlc(&t2, t2);
            kk = ik;

/*      Compute uniform pseudorandom numbers. */

	if (TIMERS_ENABLED == TRUE) timer_start(3);
	/* NOTE(review): the output pointer is x-1 while x is read from
	   index 0 below; presumably vranlc writes from index 1 -- confirm */
	vranlc(2*NK, &t1, A, x-1);
	if (TIMERS_ENABLED == TRUE) timer_stop(3);

c       Compute Gaussian deviates by acceptance-rejection method and 
c       tally counts in concentric square annuli.  This loop is not 
c       vectorizable.
	if (TIMERS_ENABLED == TRUE) timer_start(2);

	for ( i = 0; i < NK; i++) {
            x1 = 2.0 * x[2*i] - 1.0;
            x2 = 2.0 * x[2*i+1] - 1.0;
            t1 = pow2(x1) + pow2(x2);
            if (t1 <= 1.0) {
		t2 = sqrt(-2.0 * log(t1) / t1);
		t3 = (x1 * t2);				/* Xi */
		t4 = (x2 * t2);				/* Yi */
		l = max(fabs(t3), fabs(t4));
		qq[l] += 1.0;				/* counts */
		sx = sx + t3;				/* sum of Xi */
		sy = sy + t4;				/* sum of Yi */
	if (TIMERS_ENABLED == TRUE) timer_stop(2);
    /* merge the per-thread counts into the shared q array */
#pragma omp critical
      for (i = 0; i <= NQ - 1; i++) q[i] += qq[i];
#if defined(_OPENMP)
#pragma omp master
    nthreads = omp_get_num_threads();
#endif /* _OPENMP */    
} /* end of parallel region */    

    for (i = 0; i <= NQ-1; i++) {
        gc = gc + q[i];

    tm = timer_read(1);

    /* verification: the relative deviation is taken with respect to the
       computed sums sx and sy */
    nit = 0;
    if (M == 24) {
	if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) &&
	   (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) {
	    verified = TRUE;
    } else if (M == 25) {
	if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) {
	    verified = TRUE;
    } else if (M == 28) {
	if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
	    verified = TRUE;
    } else if (M == 30) {
	if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) {
	    verified = TRUE;
    } else if (M == 32) {
	if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) {
	    verified = TRUE;

    Mops = pow(2.0, M+1)/tm/1000000.0;

    /* NOTE(review): the format string below is followed directly by the
       arguments, with no separating comma in this listing */
    printf("EP Benchmark Results: \n"
	   "CPU Time = %10.4f\n"
	   "N = 2^%5d\n"
	   "No. Gaussian Pairs = %15.0f\n"
	   "Sums = %25.15e %25.15e\n"
	   tm, M, gc, sx, sy);
    for (i = 0; i  <= NQ-1; i++) {
	printf("%3d %15.0f\n", i, q[i]);
    c_print_results("EP", CLASS, M+1, 0, 0, nit, nthreads,
		  tm, Mops, 	
		  "Random numbers generated",
		  CS1, CS2, CS3, CS4, CS5, CS6, CS7);

	printf("Total time:     %f", timer_read(1));
	printf("Gaussian pairs: %f", timer_read(2));
	printf("Random numbers: %f", timer_read(3));
Esempio n. 20
void test03 ( void )

/******************************************************************************/
/*
  Purpose:

    TEST03 tests RANDLC.

  Discussion:

    Four runs of ten calls each are printed, showing the seed before and
    after every call:
      1. starting from SEED = 1066, remembering the seed seen at step 5;
      2. restarting from that remembered seed;
      3. starting from SEED = 0;
      4. starting from a negative SEED.

  Licensing:

    This code is distributed under the GNU LGPL license. 

  Modified:

    08 March 2010

  Author:

    John Burkardt
*/
{
  int i;
  double seed;
  double seed_in;
  double seed_out;
  double seed_save;
  double x;

  printf ( "\n" );
  printf ( "TEST03\n" );
  printf ( "  RANDLC computes a sequence of pseudorandom numbers\n" );
  printf ( "  but all computations depend on the seed value.\n" );
  printf ( "  In this test, we show how a sequence of \"random\"\n" );
  printf ( "  values can be manipulated by accessing the seed.\n" );

  seed = 1066.0;

  printf ( "\n" );
  printf ( "  Set SEED to %14.0f\n", seed );
  printf ( "\n" );
  printf ( "  Now call RANDLC 10 times, and watch SEED.\n" );
  printf ( "\n" );
  printf ( "       I           Input          Output      RANDLC\n" );
  printf ( "                    SEED            SEED\n" );
  printf ( "\n" );

  for ( i = 1; i <= 10; i++ )
  {
    seed_in = seed;

    /* remember the seed that goes INTO the fifth call */
    if ( i == 5 )
    {
      seed_save = seed;
    }
    x = randlc ( &seed );
    seed_out = seed;
    printf ( "  %6d  %14.0f  %14.0f  %10f\n", i, seed_in, seed_out, x );
  }

  seed = seed_save;

  printf ( "\n" );
  printf ( "  Reset SEED to its value at step 5, = %14.0f\n", seed );
  printf ( "\n" );
  printf ( "  Now call RANDLC 10 times, and watch how SEED\n" );
  printf ( "  and RANDLC restart themselves.\n" );
  printf ( "\n" );
  printf ( "       I           Input          Output      RANDLC\n" );
  printf ( "                    SEED            SEED\n" );
  printf ( "\n" );

  for ( i = 1; i <= 10; i++ )
  {
    seed_in = seed;
    x = randlc ( &seed );
    seed_out = seed;
    printf ( "  %6d  %14.0f  %14.0f  %10f\n", i, seed_in, seed_out, x );
  }

  seed = 0.0;

  printf ( "\n" );
  printf ( "  What happens with an initial zero SEED?\n" );
  printf ( "\n" );
  printf ( "       I           Input          Output      RANDLC\n" );
  printf ( "                    SEED            SEED\n" );
  printf ( "\n" );

  for ( i = 1; i <= 10; i++ )
  {
    seed_in = seed;
    x = randlc ( &seed );
    seed_out = seed;
    printf ( "  %6d  %14.0f  %14.0f  %10f\n", i, seed_in, seed_out, x );
  }

  seed = -123456789.0;

  printf ( "\n" );
  printf ( "  What happens with an initial negative SEED?\n" );
  printf ( "\n" );
  printf ( "       I           Input          Output      RANDLC\n" );
  printf ( "                    SEED            SEED\n" );
  printf ( "\n" );

  for ( i = 1; i <= 10; i++ )
  {
    seed_in = seed;
    x = randlc ( &seed );
    seed_out = seed;
    printf ( "  %6d  %14.0f  %14.0f  %10f\n", i, seed_in, seed_out, x );
  }

  return;
}

Esempio n. 21
// Fill in array u0 with initial conditions from 
// random number generator 
//
// The host computes the generator state at the start of every z-plane
// (starts[k], one plane being 2*NX*NY numbers apart), uploads that table
// and launches the k_compute_ics kernel with the work sizes chosen below.
// The function returns only after the command queue has finished.
//
//   u0         - device buffer that receives the initial conditions
//   d1, d2, d3 - extents; only d2 is used here, to size the launch
static void compute_initial_conditions(cl_mem *u0, int d1, int d2, int d3)
{
  int k;
  double start, an, dummy, starts[NZ];
  size_t local_ws, global_ws, temp;
  cl_mem m_starts;
  cl_int ecode;

  start = SEED;
  // Jump to the starting element for our first plane.
  an = ipow46(A, 0);
  dummy = randlc(&start, an);
  an = ipow46(A, 2*NX*NY);

  starts[0] = start;
  for (k = 1; k < dims[2]; k++) {
    dummy = randlc(&start, an);
    starts[k] = start;
  }

  if (device_type == CL_DEVICE_TYPE_CPU) {
    m_starts = clCreateBuffer(context,
                              CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                              sizeof(double) * NZ,
                              starts, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_starts");

    local_ws  = 1;
    global_ws = clu_RoundWorkSize((size_t)d2, local_ws);
  } else { //GPU
    m_starts = clCreateBuffer(context,
                              CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                              sizeof(double) * NZ,
                              starts, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_starts");

    // at most work_item_sizes[0] items per group, at least one
    temp = d2 / max_compute_units;
    local_ws  = temp == 0 ? 
                1 : ((temp > work_item_sizes[0]) ? work_item_sizes[0] : temp);
    global_ws = clu_RoundWorkSize((size_t)d2, local_ws);
  }

  ecode  = clSetKernelArg(k_compute_ics, 0, sizeof(cl_mem), u0);
  ecode |= clSetKernelArg(k_compute_ics, 1, sizeof(cl_mem), &m_starts);
  clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions");

  ecode = clEnqueueNDRangeKernel(cmd_queue,
                                 k_compute_ics,
                                 1, NULL,
                                 &global_ws,
                                 &local_ws,
                                 0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueNDRangeKernel()");

  ecode = clFinish(cmd_queue);
  clu_CheckError(ecode, "clFinish()");

  // The buffer may alias the stack array `starts` (CL_MEM_USE_HOST_PTR), so
  // it has to be released before this function returns.
  ecode = clReleaseMemObject(m_starts);
  clu_CheckError(ecode, "clReleaseMemObject() for m_starts");

  (void)dummy;
}

Esempio n. 22
// EP benchmark driver.  The `# N "ep.c"` line markers and the _ompc_* calls
// suggest this text is translator output generated from ep.c (presumably an
// OpenMP source-to-source compiler -- confirm).  Symbolic constants appear
// already expanded: 28 is used wherever the problem-size exponent is needed
// and 16 wherever the batch-size exponent is needed.
//
// Visible flow:
//   1. Print the banner and the count of random numbers, 2^(28+1).
//   2. Exercise vranlc()/randlc()/log()/sqrt()/fabs() once and fill x[].
//   3. Derive `an` by 17 successive randlc(&t1, t1) calls starting from
//      1.220703125E9.
//   4. Zero the sums and q[], then run __ompc_func_3 through
//      _ompc_do_parallel() with the addresses of sx, sy, np, k_offset, an and
//      nthreads.
//   5. Sum q[] into gc, read timer 1, compare sx/sy with reference values and
//      report the results.
//
// NOTE(review): as listed, brace-only lines and a few other lines are
// missing (see the notes below); the comments describe only the statements
// that are visible.  No return type is spelled out for main.
main (int argc, char **argv)
  //auto double *_ppthd_x;
  auto double Mops;
  auto double t1;
  auto double t2;
  auto double t3;
  auto double t4;
  auto double x1;
  auto double x2;
  auto double sx;
  auto double sy;
  auto double tm;
  auto double an;
  auto double tt;
  auto double gc;
  auto double dum[3];
  auto int np;
  auto int ierr;
  auto int node;
  auto int no_nodes;
  auto int i;
  auto int ik;
  auto int kk;
  auto int l;
  auto int k;
  auto int nit;
  auto int ierrcode;
  auto int no_large_nodes;
  auto int np_add;
  auto int k_offset;
  auto int j;
  auto int nthreads;
  auto int verified;
  auto char size[14];
 int status = 0;

  //(_ppthd_x) = (((double *) (_ompc_get_thdprv (&_thdprv_x, 1048576, x))));
  // dum[0..2] = 1.0; nthreads defaults to 1.
  (*(dum)) = (1.0);
  (*((dum) + (1))) = (1.0);
  (*((dum) + (2))) = (1.0);
  (nthreads) = (1);
  // NOTE(review): the next statement is a bare parenthesised string; the
  // callee (presumably printf) is not present in this listing.
# 84 "ep.c"
    ("\012\012 NAS Parallel Benchmarks 2.3 OpenMP C version - EP Benchmark\012");
  // Format 2^29 as text, then blank out any '.' (character code 46) with a
  // space (character code 32).
  // NOTE(review): "%12.0f" writes 12 characters plus the terminator for this
  // value, so size[13] is examined at j == 13 without having been written.
# 86 "ep.c"
  sprintf (size, "%12.0f", pow (2.0, (28) + (1)));
# 87 "ep.c"
  for ((j) = (13); (j) >= (1); (j)--)

# 88 "ep.c"
      if ((((int) (*((size) + (j))))) == (46))
	  (*((size) + (j))) = (((char) (32)));
# 90 "ep.c"
  printf (" Number of random numbers generated: %13s\012", size);
# 92 "ep.c"
  (verified) = (0);
  // Number of batches: 1 << (28 - 16) = 4096.
# 99 "ep.c"
  (np) = ((1) << ((28) - (16)));
  // Call the generator routines once and fill all 2 * (1 << 16) entries of
  // x[] with -1.0E99 before timing starts.
# 107 "ep.c"
  vranlc (0, (dum) + (0), *((dum) + (1)), (dum) + (2));
# 108 "ep.c"
  (*((dum) + (0))) = (randlc ((dum) + (1), *((dum) + (2))));
# 109 "ep.c"
  for ((i) = (0); (i) < ((2) * ((1) << (16))); (i)++)
      x[i] = (-(1.0E99));
      //(*((_ppthd_x) + (i))) = (-(1.0E99));
  // Calls log/sqrt/fabs once; the argument reduces to 1.0.
# 110 "ep.c"
  (Mops) = (log (sqrt (fabs (((1.0) > (1.0)) ? (1.0) : (1.0)))));
  // Reset timers 1-3 and start timer 1, which is the one read into tm below.
# 112 "ep.c"
  timer_clear (1);
# 113 "ep.c"
  timer_clear (2);
# 114 "ep.c"
  timer_clear (3);
# 115 "ep.c"
  timer_start (1);
# 117 "ep.c"
  vranlc (0, &(t1), 1.220703125E9, x);
  //vranlc (0, &(t1), 1.220703125E9, _ppthd_x);
  // Derive `an`: 16 + 1 successive randlc(&t1, t1) calls starting from
  // 1.220703125E9.  The equivalent loop in the other EP listings of this file
  // is commented as computing AN = A ^ (2 * NK) (mod 2^46).
# 121 "ep.c"
  (t1) = (1.220703125E9);
# 123 "ep.c"
  for ((i) = (1); (i) <= ((16) + (1)); (i)++)

# 124 "ep.c"
      (t2) = (randlc (&(t1), t1));
# 127 "ep.c"
  (an) = (t1);
# 128 "ep.c"
  (tt) = (2.71828183E8);
# 129 "ep.c"
  (gc) = (0.0);
# 130 "ep.c"
  (sx) = (0.0);
# 131 "ep.c"
  (sy) = (0.0);
  // Clear the ten counters q[0..9].
# 133 "ep.c"
  for ((i) = (0); (i) <= ((10) - (1)); (i)++)

# 134 "ep.c"
      (*((q) + (i))) = (0.0);
  // Pack the addresses of the shared variables and hand them to the outlined
  // parallel-region body; __ompc_func_3 unpacks them in this same order.
# 142 "ep.c"
  (k_offset) = (-(1));
    auto void *__ompc_argv[6];
    (*(__ompc_argv)) = (((void *) (&sx)));
    (*((__ompc_argv) + (1))) = (((void *) (&sy)));
    (*((__ompc_argv) + (2))) = (((void *) (&np)));
    (*((__ompc_argv) + (3))) = (((void *) (&k_offset)));
    (*((__ompc_argv) + (4))) = (((void *) (&an)));
    (*((__ompc_argv) + (5))) = (((void *) (&nthreads)));
    _ompc_do_parallel (__ompc_func_3, __ompc_argv);
  // gc = sum of the ten counters.
# 207 "ep.c"
  for ((i) = (0); (i) <= ((10) - (1)); (i)++)

# 208 "ep.c"
      (gc) = ((gc) + (*((q) + (i))));
# 211 "ep.c"
  timer_stop (1);
# 212 "ep.c"
  (tm) = (timer_read (1));
# 214 "ep.c"
  (nit) = (0);
  // Verification: each test compares the constant 28 against one supported
  // exponent, so only the (28) == (28) case can be taken.  The relative error
  // is computed against the measured sx/sy, with tolerance 1.0E-8.
# 215 "ep.c"
  if ((28) == (24))

# 216 "ep.c"
      if (((fabs (((sx) - (-(3247.83465203474))) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(6958.407078382297))) / (sy))) <= (1.0E-8)))

# 218 "ep.c"
	  (verified) = (1);
# 220 "ep.c"
  if ((28) == (25))

# 221 "ep.c"
      if (((fabs (((sx) - (-(2863.319731645753))) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(6320.053679109499))) / (sy))) <= (1.0E-8)))

# 223 "ep.c"
	  (verified) = (1);
# 225 "ep.c"
  if ((28) == (28))

# 226 "ep.c"
      if (((fabs (((sx) - (-(4295.875165629892))) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(15807.32573678431))) / (sy))) <= (1.0E-8)))

# 228 "ep.c"
	  (verified) = (1);
          // Debug output; which `if` it belongs to cannot be determined from
          // this listing because the braces are missing.
          printf("Debug:ompc_manual. 359, sx is:%f, sy is:%f\n",sx,sy);

# 230 "ep.c"
  if ((28) == (30))

# 231 "ep.c"
      if (((fabs (((sx) - (40338.15542441498)) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(26606.69192809235))) / (sy))) <= (1.0E-8)))

# 233 "ep.c"
	  (verified) = (1);
# 235 "ep.c"
  if ((28) == (32))

# 236 "ep.c"
      if (((fabs (((sx) - (47643.67927995374)) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(80840.72988043731))) / (sy))) <= (1.0E-8)))

# 238 "ep.c"
	  (verified) = (1);
  // Rate in millions of random numbers per second: 2^29 / tm / 1e6.
# 242 "ep.c"
  (Mops) = (((pow (2.0, (28) + (1))) / (tm)) / (1000000.0));
  // NOTE(review): as with the banner, the callee of this format string
  // (presumably printf) is missing from the listing.
# 244 "ep.c"
    ("EP Benchmark Results: \012CPU Time = %10.4f\012N = 2^%5d\012No. Gaussian Pairs = %15.0f\012Sums = %25.15e %25.15e\012Counts:\012",
     tm, 28, gc, sx, sy);
  // One line per counter.
# 251 "ep.c"
  for ((i) = (0); (i) <= ((10) - (1)); (i)++)

# 252 "ep.c"
      printf ("%3d %15.0f\012", i, *((q) + (i)));
  // Final report.  65 is the character code of 'A'.
  // NOTE(review): the argument list is not closed in this listing.
# 255 "ep.c"
  c_print_results ("EP", 65, (28) + (1), 0, 0, nit, nthreads, tm, Mops,
		   "Random numbers generated", verified, "2.3", "07 Aug 2006",
		   "omcc", "$(CC)", "(none)", "-I../common", "-t", "-lm",
  // (0) == (1) is constant-false, so the timer breakdown below is compiled
  // but never printed.
# 261 "ep.c"
  if ((0) == (1))

# 262 "ep.c"
      printf ("Total time:     %f", timer_read (1));
# 263 "ep.c"
      printf ("Gaussian pairs: %f", timer_read (2));
# 264 "ep.c"
      printf ("Random numbers: %f", timer_read (3));
Esempio n. 23
// Outlined body of the EP parallel region, invoked through
// _ompc_do_parallel().
//
// __ompc_args holds six pointers, unpacked in this order: sx, sy, np,
// k_offset, an, nthreads.
//
// Visible flow per thread:
//   1. Obtain the thread's copy of x (_ppthd_x) and zero the private
//      counters qq[0..9].
//   2. For every batch index handed out by _ompc_static_bsched(): derive the
//      batch seed, fill _ppthd_x with vranlc(), then turn accepted pairs into
//      deviates, counting them in qq[] and summing them in _p_sx/_p_sy.
//   3. Combine _p_sx/_p_sy into *sx/*sy through _ompc_reduction(), add qq[]
//      to the shared q[] inside a critical section, and let the master
//      thread record omp_get_num_threads() in *nthreads.
//
// NOTE(review): brace-only lines and at least two other lines are missing
// from this listing (see the notes below); the comments describe only what is
// visible.
static void
__ompc_func_3 (void **__ompc_args)
  auto double *_pp_sx;
  auto double *_pp_sy;
  auto int *_pp_np;
  auto int *_pp_k_offset;
  auto double *_pp_an;
  auto int *_pp_nthreads;
  auto double *_ppthd_x;
  // 1048576 is presumably the byte size of x (2 * 65536 doubles of 8 bytes)
  // -- confirm.
  (_ppthd_x) = (((double *) (_ompc_get_thdprv (&_thdprv_x, 1048576, x))));
  (_pp_sx) = (((double *) (*__ompc_args)));
  (_pp_sy) = (((double *) (*((__ompc_args) + (1)))));
  (_pp_np) = (((int *) (*((__ompc_args) + (2)))));
  (_pp_k_offset) = (((int *) (*((__ompc_args) + (3)))));
  (_pp_an) = (((double *) (*((__ompc_args) + (4)))));
  (_pp_nthreads) = (((int *) (*((__ompc_args) + (5)))));
  _ompc_copyin_thdprv (_ppthd_x, x, 1048576);
    auto double t1;
    auto double t2;
    auto double t3;
    auto double t4;
    auto double x1;
    auto double x2;
    auto int kk;
    auto int i;
    auto int ik;
    auto int l;
    auto double qq[10];

    // Clear this thread's private counters.
# 150 "ep.c"
    for ((i) = (0); (i) < (10); (i)++)
	(*((qq) + (i))) = (0.0);
      auto double _p_sx;
      auto double _p_sy;
      auto int _p_k;
      auto int _p_k_0;
      auto int _p_k_1;
      auto int _p_k_2;
      (_p_sy) = (0.0);
      (_p_sx) = (0.0);
      // Loop bounds [1, np + 1) with step 1 are passed by address to
      // _ompc_static_bsched(), which presumably narrows them to this thread's
      // share -- confirm against the runtime.
      (_p_k_0) = (1);
      (_p_k_1) = ((*_pp_np) + (1));
      (_p_k_2) = (1);
      _ompc_static_bsched (&_p_k_0, &_p_k_1, &_p_k_2);
# 153 "ep.c"
      for ((_p_k) = (_p_k_0); (_p_k) < (_p_k_1); (_p_k) += (_p_k_2))

# 154 "ep.c"
	  (kk) = ((*_pp_k_offset) + (_p_k));
# 155 "ep.c"
	  (t1) = (2.71828183E8);
# 156 "ep.c"
	  (t2) = (*_pp_an);
	  // Derive the batch seed in t1 by walking the bits of kk: when kk is
	  // odd, advance t1 by the current multiplier t2; then advance t2 by
	  // itself and halve kk.
	  // NOTE(review): the statement controlled by `if ((ik) == (0))` is
	  // missing here; the same loop in the other EP listings of this file
	  // has `break` at that point.
# 160 "ep.c"
	  for ((i) = (1); (i) <= (100); (i)++)

# 161 "ep.c"
	      (ik) = ((kk) / (2));
# 162 "ep.c"
	      if (((2) * (ik)) != (kk))
		  (t3) = (randlc (&(t1), t2));
# 163 "ep.c"
	      if ((ik) == (0))
# 163 "ep.c"
# 164 "ep.c"
	      (t3) = (randlc (&(t2), t2));
# 165 "ep.c"
	      (kk) = (ik);
	  // Timer calls are guarded by the constant-false (0) == (1).
# 170 "ep.c"
	  if ((0) == (1))
	      timer_start (3);
	  // Generate 2 * (1 << 16) uniform numbers for this batch.
	  // NOTE(review): the destination is passed as _ppthd_x - 1, a pointer
	  // one element before the buffer; presumably vranlc indexes from 1 --
	  // confirm.
# 171 "ep.c"
	  vranlc ((2) * ((1) << (16)), &(t1), 1.220703125E9,
		  (_ppthd_x) - (1));
# 172 "ep.c"
	  if ((0) == (1))
	      timer_stop (3);
# 179 "ep.c"
	  if ((0) == (1))
	      timer_start (2);
	  // Acceptance-rejection: map each pair to (-1, 1)^2, keep it when
	  // x1^2 + x2^2 <= 1, and scale by sqrt(-2 * log(t1) / t1).
# 181 "ep.c"
	  for ((i) = (0); (i) < ((1) << (16)); (i)++)

# 182 "ep.c"
	      (x1) = (((2.0) * (*((_ppthd_x) + ((2) * (i))))) - (1.0));
# 183 "ep.c"
	      (x2) =
		(((2.0) * (*((_ppthd_x) + (((2) * (i)) + (1))))) - (1.0));
# 184 "ep.c"
	      (t1) = (((x1) * (x1)) + ((x2) * (x2)));
# 185 "ep.c"
	      if ((t1) <= (1.0))

# 186 "ep.c"
		  (t2) = (sqrt (((-(2.0)) * (log (t1))) / (t1)));
# 187 "ep.c"
		  (t3) = ((x1) * (t2));
# 188 "ep.c"
		  (t4) = ((x2) * (t2));
		  // l = max(|t3|, |t4|), used as the counter index below.
		  // NOTE(review): the parentheses of this statement are
		  // unbalanced as listed; a line (presumably holding an int
		  // cast) appears to be missing.
# 189 "ep.c"
		  (l) =
		      (((fabs (t3)) >
			(fabs (t4))) ? (fabs (t3)) : (fabs (t4)))));
# 190 "ep.c"
		  (*((qq) + (l))) += (1.0);
# 191 "ep.c"
		  (_p_sx) = ((_p_sx) + (t3));
# 192 "ep.c"
		  (_p_sy) = ((_p_sy) + (t4));
# 195 "ep.c"
	  if ((0) == (1))
	      timer_stop (2);
      // Fold the private sums into the shared ones.  The meaning of the
      // trailing arguments (14, 6) is defined by the runtime and not visible
      // here.
      _ompc_reduction (&_p_sy, _pp_sy, 14, 6);
      _ompc_reduction (&_p_sx, _pp_sx, 14, 6);
      _ompc_barrier ();
      // Add the private counters to the shared q[] under the critical lock.
      _ompc_enter_critical (&__ompc_lock_critical);
# 199 "ep.c"
      for ((i) = (0); (i) <= ((10) - (1)); (i)++)
	  (*((q) + (i))) += (*((qq) + (i)));
      _ompc_exit_critical (&__ompc_lock_critical);
    // Only the master thread reports the team size.
    if (_ompc_is_master ())
	(*_pp_nthreads) = (omp_get_num_threads ());
Esempio n. 24
c   This is the serial version of the APP Benchmark 1,
c   the "embarrassingly parallel" benchmark.
c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
c   numbers.  MK is the Log_2 of the size of each batch of uniform random
c   numbers.  MK can be set for convenience on a given system, since it does
c   not affect the results.
/*
 * EP benchmark driver, OpenACC variant.
 *
 * Visible flow:
 *   1. Allocate x (2*NK doubles), q (NQ doubles) and the row-pointer tables
 *      xx / qq, each backed by one contiguous allocation of NN rows.
 *   2. Print the banner and the count of random numbers, 2^(M+1).
 *   3. Exercise the generator and math routines once, fill x[], and derive
 *      `an` by MK+1 successive randlc(&t1, t1) calls starting from A.
 *   4. In a first parallel loop, clear qq[k][] and copy x[] into xx[k][].
 *   5. In a second parallel loop over the np batches: find the batch seed,
 *      generate 2*NK uniform numbers into xx[k-1][], convert accepted pairs
 *      into deviates, count them in qq[k-1][] and reduce their sums into
 *      sx / sy.
 *   6. Reduce qq into q and gc, verify sx / sy against reference values for
 *      the known M, and report.
 *
 * The lines starting with "c   " are comment text whose delimiters are not
 * present in this listing.
 *
 * NOTE(review): brace-only lines and several other lines appear to be
 * missing from this listing (details below); the comments describe only the
 * visible statements.
 */
int main(int argc, char **argv) {

    double *x, **xx, *q, **qq;

    double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc;
    double dum[3] = { 1.0, 1.0, 1.0 };
    const int TRANSFER_X = 1;
    int np, nn, ierr, node, no_nodes, i, l, k, nit, ierrcode,
    no_large_nodes, np_add, k_offset, j;
    double loc_x,loc_t1,loc_t2,loc_t3,loc_t4;
    double loc_a1,loc_a2,loc_x1,loc_x2,loc_z;
    boolean verified;
    char size[13+1];	/* character*13 */
/*     Allocate working memory       */

    /* xx[0] and qq[0] own the contiguous storage; the following rows are
       offsets into it.
       NOTE(review): the malloc results are not checked and no free() is
       visible in this listing. */
    x = (double*) malloc(sizeof(double) * 2*NK);
    xx = (double**) malloc(sizeof(double*) * NN);
    xx[0] = (double*) malloc(sizeof(double) * NN * 2*NK);
    for (i = 1; i < NN; i++) xx[i] = xx[i-1] + (2*NK);
    q = (double*) malloc(sizeof(double) * NQ);
    qq = (double**) malloc(sizeof(double*) * NN);
    qq[0] = (double*) malloc(sizeof(double) * NN * NQ);
    for (i = 1; i < NN; i++) qq[i] = qq[i-1] + NQ;

c   Because the size of the problem is too large to store in a 32-bit
c   integer for some classes, we put it into a string (for printing).
c   Have to strip off the decimal point put in there by the floating
c   point print statement (internal file)

    printf("\n\n NAS Parallel Benchmarks 2.3 OpenACC C version"
	   " - EP Benchmark\n");
    /* NOTE(review): when the formatted value occupies exactly 12 characters
       the terminator lands in size[12], and size[13] is examined by the loop
       below without having been written. */
    sprintf(size, "%12.0f", pow(2.0, M+1));
    for (j = 13; j >= 1; j--) {
	if (size[j] == '.') size[j] = ' ';
    printf(" Number of random numbers generated: %13s\n", size);

    verified = FALSE;

c   Compute the number of "batches" of random number pairs generated 
c   per processor. Adjust if the number of processors does not evenly 
c   divide the total number
    np = NN;

    /* NOTE(review): the data pragma further down ends in a line-continuation
       backslash but is followed directly by a statement, so the remainder of
       its clause list is apparently missing. */
c   Call the random number generator functions and initialize
c   the x-array to reduce the effects of paging on the timings.
c   Also, call all mathematical functions that are used. Make
c   sure these initializations cannot be eliminated as dead code.
#pragma acc data create(qq[0:NN][0:NQ],x[0:2*NK],xx[0:NN][0:2*NK]) \
    vranlc(0, &(dum[0]), dum[1], &(dum[2]));
    dum[0] = randlc(&(dum[1]), dum[2]);
    for (i = 0; i < 2*NK; i++) x[i] = -1.0e99;
    Mops = log(sqrt(fabs(max(1.0, 1.0))));


    /* NOTE(review): t1 has not been assigned yet when its address is passed
       here; the count argument is 0, so it is presumably never read --
       confirm. */
    vranlc(0, &t1, A, x);
    #pragma acc update device(x[0:2*NK])

/*   Compute AN = A ^ (2 * NK) (mod 2^46). */

    t1 = A;

    for ( i = 1; i <= MK+1; i++) {
      t2 = randlc(&t1, t1);

    an = t1;
    tt = S;
    gc = 0.0;
    sx = 0.0;
    sy = 0.0;
    /* Per-batch setup: zero the counters of batch k and give it its own copy
       of x. */
    #pragma acc parallel loop
    for (k = 0; k < np; k++) {
      /* Initialize private q (qq) */
      #pragma acc loop
      for (i = 0; i < NQ; i++)
          qq[k][i] = 0.0;
      /* Initialize private x (xx)  */
      #pragma acc loop
      for (i = 0; i < 2*NK; i++)
          xx[k][i] = x[i];
c   Each instance of this loop may be performed independently. We compute
c   the k offsets separately to take into account the fact that some nodes
c   have more numbers to generate than others
    k_offset = -1;

    /* These names repeat declarations made at the top of main; presumably
       they sat in a nested block whose braces are missing here. */
    double t1, t2, t3, t4, x1, x2;
    int kk, i, ik, l;
    double psx, psy;

    #pragma acc parallel loop reduction(+:sx,sy)
    for (k = 1; k <= np; k++) {
      kk = k_offset + k;
      t1 = S;
      t2 = an;

/*      Find starting seed t1 for this kk. */

      /* Walk the bits of kk: when kk is odd, advance t1 by the current
         multiplier t2; then advance t2 by itself and halve kk. */
      #pragma acc loop seq
      for (i = 1; i <= 100; i++) {
          ik = kk / 2;
          if (2 * ik != kk) t3 = RANDLC(&t1, t2);
          if (ik == 0) break;
          t3 = RANDLC(&t2, t2);
          kk = ik;

/*      Compute uniform pseudorandom numbers. */

      /* The generator is written out inline: A is split into loc_a1 and
         loc_a2 using r23 / t23, and every iteration updates loc_x and stores
         r46 * loc_x.  r23, t23, r46 and t46 are defined outside this
         function (presumably 2^-23, 2^23, 2^-46 and 2^46 -- confirm). */
      loc_t1 = r23 * A;
      loc_a1 = (int)loc_t1;
      loc_a2 = A - t23 * loc_a1;
      loc_x = t1;

      #pragma acc loop seq
      for (i = 1; i <= 2*NK; i++) {
          loc_t1 = r23 * loc_x;
          loc_x1 = (int)loc_t1;
          loc_x2 = loc_x - t23 * loc_x1;
          loc_t1 = loc_a1 * loc_x2 + loc_a2 * loc_x1;
          loc_t2 = (int)(r23 * loc_t1);
          loc_z = loc_t1 - t23 * loc_t2;
          loc_t3 = t23 * loc_z + loc_a2 * loc_x2;
          loc_t4 = (int)(r46 * loc_t3);
          loc_x = loc_t3 - t46 * loc_t4;
          xx[k-1][i-1] = r46 * loc_x;
      t1 = loc_x;

c       Compute Gaussian deviates by acceptance-rejection method and 
c       tally counts in concentric square annuli.  This loop is not 
c       vectorizable.
      psx = psy = 0.0;

      /* NOTE(review): qq[k-1][l] is incremented inside a loop that carries a
         reduction clause only for psx / psy; confirm that concurrent
         iterations cannot update the same counter. */
      #pragma acc loop reduction(+:psx,psy)
      for ( i = 0; i < NK; i++) {
          x1 = 2.0 * xx[k-1][2*i] - 1.0;
          x2 = 2.0 * xx[k-1][2*i+1] - 1.0;
          t1 = pow2(x1) + pow2(x2);
          if (t1 <= 1.0) {
            t2 = sqrt(-2.0 * log(t1) / t1);
            t3 = (x1 * t2);             /* Xi */
            t4 = (x2 * t2);             /* Yi */
            l = max(fabs(t3), fabs(t4));
            qq[k-1][l] += 1.0;                      /* counts */
            psx = psx + t3;  /* sum of Xi */
            psy = psy + t4;               /* sum of Yi */

      sx += psx;
      sy += psy;
/*      Reduce private qq to q          */
    #pragma acc parallel loop reduction(+:gc)
    for ( i = 0; i < NQ; i++ ) {
      double sumq = 0.0;
      #pragma acc loop reduction(+:sumq)
      for (k = 0; k < np; k++)
          sumq = sumq + qq[k][i];
      q[i] = sumq;
      gc += sumq;

} /* end acc data */

    /* NOTE(review): no timer_clear / timer_start / timer_stop call for timer
       1 is visible in this listing. */
    tm = timer_read(1);

    /* Verification: the relative error is taken against the computed sx / sy
       and compared with EPSILON; reference sums exist for M = 24, 25, 28, 30
       and 32. */
    nit = 0;
    if (M == 24) {
	if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) &&
	   (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) {
	    verified = TRUE;
    } else if (M == 25) {
	if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) {
	    verified = TRUE;
    } else if (M == 28) {
	if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
	    verified = TRUE;
    } else if (M == 30) {
	if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) {
	    verified = TRUE;
    } else if (M == 32) {
	if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) {
	    verified = TRUE;

    /* Rate in millions of random numbers per second. */
    Mops = pow(2.0, M+1)/tm/1000000.0;

    /* NOTE(review): in the call below no comma separates the last format
       fragment from the first argument; a line appears to be missing. */
    printf("EP Benchmark Results: \n"
	   "CPU Time = %10.4f\n"
	   "N = 2^%5d\n"
	   "No. Gaussian Pairs = %15.0f\n"
	   "Sums = %25.15e %25.15e\n"
	   tm, M, gc, sx, sy);
    for (i = 0; i  <= NQ-1; i++) {
	printf("%3d %15.0f\n", i, q[i]);
    c_print_results("EP", CLASS, M+1, 0, 0, nit,
          tm, Mops, "Random numbers generated",
		  CS1, CS2, CS3, CS4, CS5, CS6, CS7);

    return 0;
Esempio n. 25
// EP benchmark driver, serial variant.
//
// Visible flow:
//   1. Enable the per-phase timers when a file named "timer.flag" can be
//      opened.
//   2. Print the banner and the count of random numbers, 2^(M+1).
//   3. Exercise the generator and math routines once, fill x[], and derive
//      `an` by MK+1 successive randlc(&t1, t1) calls starting from A.
//   4. For each of the np batches: find the batch seed, generate 2*NK uniform
//      numbers with vranlc(), convert accepted pairs into deviates, count
//      them in q[] and accumulate their sums in sx / sy.
//   5. Sum q[] into gc, verify sx / sy against the reference values for the
//      known M, and report.
//
// Returns 0.
//
// NOTE(review): brace-only lines and several other lines appear to be
// missing from this listing (details below); the comments describe only the
// visible statements.
int main() 
  double Mops, t1, t2, t3, t4, x1, x2;
  double sx, sy, tm, an, tt, gc;
  double sx_verify_value, sy_verify_value, sx_err, sy_err;
  int    np;
  int    i, ik, kk, l, k, nit;
  int    k_offset, j;
  logical verified, timers_enabled;

  double dum[3] = {1.0, 1.0, 1.0};
  char   size[16];

  FILE *fp;

  // The existence of "timer.flag" switches the timers on.
  // NOTE(review): no fclose(fp) is visible in this listing.
  if ((fp = fopen("timer.flag", "r")) == NULL) {
    timers_enabled = false;
  } else {
    timers_enabled = true;

  //  Because the size of the problem is too large to store in a 32-bit
  //  integer for some classes, we put it into a string (for printing).
  //  Have to strip off the decimal point put in there by the floating
  //  point print statement (internal file)

  // "%15.0lf" yields at least 15 characters, so size[14] is the last one when
  // the value fits the field; a trailing '.' there is cut off.
  sprintf(size, "%15.0lf", pow(2.0, M+1));
  j = 14;
  if (size[j] == '.') j--;
  size[j+1] = '\0';
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-SER-C) - EP Benchmark\n");
  printf("\n Number of random numbers generated: %15s\n", size);

  verified = false;

  //  Compute the number of "batches" of random number pairs generated
  //  per processor. Adjust if the number of processors does not evenly
  //  divide the total number

  np = NN; 

  //  Call the random number generator functions and initialize
  //  the x-array to reduce the effects of paging on the timings.
  //  Also, call all mathematical functions that are used. Make
  //  sure these initializations cannot be eliminated as dead code.

  vranlc(0, &dum[0], dum[1], &dum[2]);
  dum[0] = randlc(&dum[1], dum[2]);
  for (i = 0; i < 2 * NK; i++) {
    x[i] = -1.0e99;
  Mops = log(sqrt(fabs(MAX(1.0, 1.0))));   


  t1 = A;
  vranlc(0, &t1, A, x);

  //  Compute AN = A ^ (2 * NK) (mod 2^46).

  t1 = A;

  for (i = 0; i < MK + 1; i++) {
    t2 = randlc(&t1, t1);

  an = t1;
  tt = S;
  gc = 0.0;
  sx = 0.0;
  sy = 0.0;

  // Clear the NQ counters.
  for (i = 0; i < NQ; i++) {
    q[i] = 0.0;

  //  Each instance of this loop may be performed independently. We compute
  //  the k offsets separately to take into account the fact that some nodes
  //  have more numbers to generate than others

  k_offset = -1;

  for (k = 1; k <= np; k++) {
    kk = k_offset + k; 
    t1 = S;
    t2 = an;

    // Find starting seed t1 for this kk.
    // Walk the bits of kk: when kk is odd, advance t1 by the current
    // multiplier t2; then advance t2 by itself and halve kk.

    for (i = 1; i <= 100; i++) {
      ik = kk / 2;
      if ((2 * ik) != kk) t3 = randlc(&t1, t2);
      if (ik == 0) break;
      t3 = randlc(&t2, t2);
      kk = ik;

    //  Compute uniform pseudorandom numbers.
    if (timers_enabled) timer_start(2);
    vranlc(2 * NK, &t1, A, x);
    if (timers_enabled) timer_stop(2);

    //  Compute Gaussian deviates by acceptance-rejection method and
    //  tally counts in concentric square annuli.  This loop is not
    //  vectorizable.
    if (timers_enabled) timer_start(1);

    // Map each pair to (-1, 1)^2 and keep it when x1^2 + x2^2 <= 1.
    for (i = 0; i < NK; i++) {
      x1 = 2.0 * x[2*i] - 1.0;
      x2 = 2.0 * x[2*i+1] - 1.0;
      t1 = x1 * x1 + x2 * x2;
      if (t1 <= 1.0) {
        t2   = sqrt(-2.0 * log(t1) / t1);
        t3   = (x1 * t2);
        t4   = (x2 * t2);
        // The larger magnitude, converted to int, selects the counter.
        l    = MAX(fabs(t3), fabs(t4));
        q[l] = q[l] + 1.0;
        sx   = sx + t3;
        sy   = sy + t4;

    if (timers_enabled) timer_stop(1);

  // gc = total number of accepted pairs.
  for (i = 0; i < NQ; i++) {
    gc = gc + q[i];

  // NOTE(review): no timer_clear / timer_start / timer_stop call for timer 0
  // is visible in this listing.
  tm = timer_read(0);

  // Select the reference sums for this M; an unknown M fails verification.
  nit = 0;
  verified = true;
  if (M == 24) {
    sx_verify_value = -3.247834652034740e+3;
    sy_verify_value = -6.958407078382297e+3;
  } else if (M == 25) {
    sx_verify_value = -2.863319731645753e+3;
    sy_verify_value = -6.320053679109499e+3;
  } else if (M == 28) {
    sx_verify_value = -4.295875165629892e+3;
    sy_verify_value = -1.580732573678431e+4;
  } else if (M == 30) {
    sx_verify_value =  4.033815542441498e+4;
    sy_verify_value = -2.660669192809235e+4;
  } else if (M == 32) {
    sx_verify_value =  4.764367927995374e+4;
    sy_verify_value = -8.084072988043731e+4;
  } else if (M == 36) {
    sx_verify_value =  1.982481200946593e+5;
    sy_verify_value = -1.020596636361769e+5;
  } else if (M == 40) {
    sx_verify_value = -5.319717441530e+05;
    sy_verify_value = -3.688834557731e+05;
  } else {
    verified = false;

  // The relative error is taken against the reference value, and both sums
  // must be within EPSILON.
  if (verified) {
    sx_err = fabs((sx - sx_verify_value) / sx_verify_value);
    sy_err = fabs((sy - sy_verify_value) / sy_verify_value);
    verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON));

  // Rate in millions of random numbers per second.
  Mops = pow(2.0, M+1) / tm / 1000000.0;

  printf("\nEP Benchmark Results:\n\n");
  printf("CPU Time =%10.4lf\n", tm);
  printf("N = 2^%5d\n", M);
  printf("No. Gaussian Pairs = %15.0lf\n", gc);
  printf("Sums = %25.15lE %25.15lE\n", sx, sy);
  printf("Counts: \n");
  for (i = 0; i < NQ; i++) {
    printf("%3d%15.0lf\n", i, q[i]);

  print_results("EP", CLASS, M+1, 0, 0, nit,
      tm, Mops, 
      "Random numbers generated",
      CS2, CS3, CS4, CS5, CS6, CS7);

  // Optional breakdown; tm is forced to 1.0 when non-positive so that the
  // percentages below never divide by zero.
  if (timers_enabled) {
    if (tm <= 0.0) tm = 1.0;
    tt = timer_read(0);
    printf("\nTotal time:     %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
    tt = timer_read(1);
    printf("Gaussian pairs: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
    tt = timer_read(2);
    printf("Random numbers: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);

  return 0;
Esempio n. 26
// CG benchmark driver (OpenMP pragmas on the vector loops).
//
// Visible flow:
//   1. Set the row/column range to [0, NA-1], print the problem size and
//      seed the generator (tran, amult).
//   2. Build the sparse matrix with makea() and shift colidx[] by firstcol.
//   3. Run one untimed conj_grad() iteration, then reset x to all ones.
//   4. Run NITER iterations: call conj_grad(), compute x.z and z.z,
//      set zeta = SHIFT + 1 / (x.z), print it, and replace x by z / ||z||.
//   5. Compare zeta with VALID_RESULT (relative error <= 1.0e-10) and print
//      the outcome and the execution time.
//
// Returns 0.
//
// NOTE(review): brace-only lines and several other lines (e.g. timer
// start/stop calls, the tail of the makea() argument list) appear to be
// missing from this listing; the comments describe only the visible
// statements.
int main(int argc, char *argv[])
  int i, j, k, it;

  double zeta;
  double rnorm;
  double norm_temp1, norm_temp2;

  double t, mflops, tmax;
  //char Class;
  logical verified;
  double zeta_verify_value, epsilon, err;

  char *t_names[T_last];

  //openmp environment setting

  // No loop body is visible for this loop in the listing.
  for (i = 0; i < T_last; i++) {

  firstrow = 0;
  lastrow  = NA-1;
  firstcol = 0;
  lastcol  = NA-1;

  zeta_verify_value = VALID_RESULT;
  printf("\nCG start...\n\n");
  printf(" Size: %11d\n", NA);
  printf(" Iterations: %5d\n", NITER);

  naa = NA;
  nzz = NZ;

  // Initialize random number generator
  tran    = 314159265.0;
  amult   = 1220703125.0;
  zeta    = randlc(&tran, amult);

  // NOTE(review): the argument list of this call is not closed in the
  // listing.
  makea(naa, nzz, a, colidx, rowstr, 
        firstrow, lastrow, firstcol, lastcol, 
        (int (*)[NONZER+1])(void*)acol, 
        (double (*)[NONZER+1])(void*)aelt,

  // Note: as a result of the above call to makea:
  //      values of j used in indexing rowstr go from 0 --> lastrow-firstrow
  //      values of colidx which are col indexes go from firstcol --> lastcol
  //      So:
  //      Shift the col index vals from actual (firstcol --> lastcol ) 
  //      to local, i.e., (0 --> lastcol-firstcol)
  // NOTE(review): the inner loop bounds depend on j (rowstr[j], rowstr[j+1]);
  // collapse(2) generally requires bounds that do not depend on the outer
  // index -- confirm against the OpenMP specification.
#pragma omp parallel for collapse(2)
  for (j = 0; j < lastrow - firstrow + 1; j++) {
    for (k = rowstr[j]; k < rowstr[j+1]; k++) {
      colidx[k] = colidx[k] - firstcol;

  // set starting vector to (1, 1, .... 1)
#pragma omp parallel for
  for (i = 0; i < NA+1; i++) {
    x[i] = 1.0;
  // Clear the work vectors over the local column range.
#pragma omp parallel for
  for (j = 0; j < lastcol - firstcol + 1; j++) {
    q[j] = 0.0;
    z[j] = 0.0;
    r[j] = 0.0;
    p[j] = 0.0;

  zeta = 0.0;

  // Do one iteration untimed to init all code and data page tables
  //---->                    (then reinit, start timing, to niter its)
  for (it = 1; it <= 1; it++) {
    // The call to the conjugate gradient routine:
    conj_grad(colidx, rowstr, x, z, a, p, q, r, &rnorm);

    // zeta = shift + 1/(x.z)
    // So, first: (x.z)
    // Also, find norm of z
    // So, first: (z.z)
    norm_temp1 = 0.0;
    norm_temp2 = 0.0;
#pragma omp parallel for reduction(+:norm_temp1, norm_temp2)
    for (j = 0; j < lastcol - firstcol + 1; j++) {
      norm_temp1 = norm_temp1 + x[j] * z[j];
      norm_temp2 = norm_temp2 + z[j] * z[j];

    norm_temp2 = 1.0 / sqrt(norm_temp2);

    // Normalize z to obtain x
#pragma omp parallel for
    for (j = 0; j < lastcol - firstcol + 1; j++) {     
      x[j] = norm_temp2 * z[j];
  } // end of do one iteration untimed

  // set starting vector to (1, 1, .... 1)
#pragma omp parallel for
  for (i = 0; i < NA+1; i++) {
    x[i] = 1.0;

  zeta = 0.0;


  // NOTE(review): no timer_start / timer_stop call for T_init is visible in
  // this listing.
  printf(" Initialization time = %15.3f seconds\n", timer_read(T_init));


  // Main Iteration for inverse power method
/* #pragma omp parallel for reduction(+:zeta) private(norm_temp1, norm_temp2) firstprivate(x, z, p, q) */
  for (it = 1; it <= NITER; it++) {
    // The call to the conjugate gradient routine:
    if (timeron) timer_start(T_conj_grad);
    conj_grad(colidx, rowstr, x, z, a, p, q, r, &rnorm);
    if (timeron) timer_stop(T_conj_grad);

    // zeta = shift + 1/(x.z)
    // So, first: (x.z)
    // Also, find norm of z
    // So, first: (z.z)
    norm_temp1 = 0.0;
    norm_temp2 = 0.0;
#pragma omp parallel for reduction(+:norm_temp1, norm_temp2)
    for (j = 0; j < lastcol - firstcol + 1; j++) {
      norm_temp1 = norm_temp1 + x[j]*z[j];
      norm_temp2 = norm_temp2 + z[j]*z[j];

    norm_temp2 = 1.0 / sqrt(norm_temp2);

    // The column header is printed once, before the first iteration's row.
    zeta = SHIFT + 1.0 / norm_temp1;
    if (it == 1) 
      printf("\n   iteration           ||r||                 zeta\n");
    printf("    %5d       %20.14E%20.13f\n", it, rnorm, zeta);

    // Normalize z to obtain x
#pragma omp parallel for
    for (j = 0; j < lastcol - firstcol + 1; j++) {
      x[j] = norm_temp2 * z[j];
  } // end of main iter inv pow meth


  // End of timed section

  t = timer_read(T_bench);


  // Verification: relative error of zeta against the reference value.
  epsilon = 1.0e-10;
  err = fabs(zeta - zeta_verify_value) / zeta_verify_value;
  if (err <= epsilon) {
    verified = true;
    printf(" Zeta is    %20.13E\n", zeta);
    printf(" Error is   %20.13E\n", err);
  } else {
    verified = false;
    printf(" VERIFICATION FAILED\n");
    printf(" Zeta                %20.13E\n", zeta);
    printf(" The correct zeta is %20.13E\n", zeta_verify_value);
  printf("\n\nExecution time : %lf seconds\n\n", t);
  return 0;
Esempio n. 27
File: cg.c Progetto: 8l/rose
int main(int argc, char **argv) {
    int	i, j, k, it;
    int nthreads = 1;
    double zeta;
    double rnorm;
    double norm_temp11;
    double norm_temp12;
    double t, mflops;
    char cclass;
    boolean verified;
    double zeta_verify_value, epsilon;

    firstrow = 1;
    lastrow  = NA;
    firstcol = 1;
    lastcol  = NA;

    if (NA == 1400 && NONZER == 7 && NITER == 15 && SHIFT == 10.0) {
	cclass = 'S';
	zeta_verify_value = 8.5971775078648;
    } else if (NA == 7000 && NONZER == 8 && NITER == 15 && SHIFT == 12.0) {
	cclass = 'W';
	zeta_verify_value = 10.362595087124;
    } else if (NA == 14000 && NONZER == 11 && NITER == 15 && SHIFT == 20.0) {
	cclass = 'A';
	zeta_verify_value = 17.130235054029;
    } else if (NA == 75000 && NONZER == 13 && NITER == 75 && SHIFT == 60.0) {
	cclass = 'B';
	zeta_verify_value = 22.712745482631;
    } else if (NA == 150000 && NONZER == 15 && NITER == 75 && SHIFT == 110.0) {
	cclass = 'C';
	zeta_verify_value = 28.973605592845;
    } else {
	cclass = 'U';

    printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version"
           " - CG Benchmark\n");
    printf(" Size: %10d\n", NA);
    printf(" Iterations: %5d\n", NITER);

    naa = NA;
    nzz = NZ;

c  Initialize random number generator
    tran    = 314159265.0;
    amult   = 1220703125.0;
    zeta    = randlc( &tran, amult );

    makea(naa, nzz, a, colidx, rowstr, NONZER,
          firstrow, lastrow, firstcol, lastcol, 
	  RCOND, arow, acol, aelt, v, iv, SHIFT);
c  Note: as a result of the above call to makea:
c        values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1
c        values of colidx which are col indexes go from firstcol --> lastcol
c        So:
c        Shift the col index vals from actual (firstcol --> lastcol ) 
c        to local, i.e., (1 --> lastcol-firstcol+1)
#pragma omp parallel private(it,i,j,k)
#pragma omp for nowait
    for (j = 1; j <= lastrow - firstrow + 1; j++) {
	for (k = rowstr[j]; k < rowstr[j+1]; k++) {
            colidx[k] = colidx[k] - firstcol + 1;

c  set starting vector to (1, 1, .... 1)
#pragma omp for nowait
    for (i = 1; i <= NA+1; i++) {
	x[i] = 1.0;
#pragma omp single
    zeta  = 0.0;

c  Do one iteration untimed to init all code and data page tables
c---->                    (then reinit, start timing, to niter its)

    for (it = 1; it <= 1; it++) {

c  The call to the conjugate gradient routine:
	conj_grad (colidx, rowstr, x, z, a, p, q, r, w, &rnorm);

c  zeta = shift + 1/(x.z)
c  So, first: (x.z)
c  Also, find norm of z
c  So, first: (z.z)
#pragma omp single
	norm_temp11 = 0.0;
	norm_temp12 = 0.0;
} /* end single */

#pragma omp for reduction(+:norm_temp11,norm_temp12)
	for (j = 1; j <= lastcol-firstcol+1; j++) {
            norm_temp11 = norm_temp11 + x[j]*z[j];
            norm_temp12 = norm_temp12 + z[j]*z[j];
#pragma omp single
	norm_temp12 = 1.0 / sqrt( norm_temp12 );

c  Normalize z to obtain x
#pragma omp for
	for (j = 1; j <= lastcol-firstcol+1; j++) {
            x[j] = norm_temp12*z[j];
    } /* end of do one iteration untimed */

c  set starting vector to (1, 1, .... 1)
#pragma omp for nowait
    for (i = 1; i <= NA+1; i++) {
         x[i] = 1.0;
#pragma omp single    
    zeta  = 0.0;

} /* end parallel */

    timer_clear( 1 );
    timer_start( 1 );

c  Main Iteration for inverse power method

#pragma omp parallel private(it,i,j,k)
    for (it = 1; it <= NITER; it++) {

c  The call to the conjugate gradient routine:
	conj_grad(colidx, rowstr, x, z, a, p, q, r, w, &rnorm);

c  zeta = shift + 1/(x.z)
c  So, first: (x.z)
c  Also, find norm of z
c  So, first: (z.z)
#pragma omp single
	norm_temp11 = 0.0;
	norm_temp12 = 0.0;
} /* end single */

#pragma omp for reduction(+:norm_temp11,norm_temp12)
	for (j = 1; j <= lastcol-firstcol+1; j++) {
            norm_temp11 = norm_temp11 + x[j]*z[j];
            norm_temp12 = norm_temp12 + z[j]*z[j];

#pragma omp single
	norm_temp12 = 1.0 / sqrt( norm_temp12 );

	zeta = SHIFT + 1.0 / norm_temp11;
} /* end single */

#pragma omp master
	if( it == 1 ) {
            printf("   iteration           ||r||                 zeta\n");
	printf("    %5d       %20.14e%20.13e\n", it, rnorm, zeta);
} /* end master */

c  Normalize z to obtain x
#pragma omp for 
	for (j = 1; j <= lastcol-firstcol+1; j++) {
            x[j] = norm_temp12*z[j];
    } /* end of main iter inv pow meth */

#if defined(_OPENMP)
#pragma omp master
    nthreads = omp_get_num_threads();
#endif /* _OPENMP */
} /* end parallel */

    timer_stop( 1 );

c  End of timed section

    t = timer_read( 1 );

    printf(" Benchmark completed\n");

    epsilon = 1.0e-10;
    if (cclass != 'U') {
	if (fabs(zeta - zeta_verify_value) <= epsilon) {
            verified = TRUE;
	    printf(" Zeta is    %20.12e\n", zeta);
	    printf(" Error is   %20.12e\n", zeta - zeta_verify_value);
	} else {
            verified = FALSE;
	    printf(" VERIFICATION FAILED\n");
	    printf(" Zeta                %20.12e\n", zeta);
	    printf(" The correct zeta is %20.12e\n", zeta_verify_value);
    } else {
	verified = FALSE;
	printf(" Problem size unknown\n");

    if ( t != 0.0 ) {
	mflops = (2.0*NITER*NA)
	    * (3.0+(NONZER*(NONZER+1)) + 25.0*(5.0+(NONZER*(NONZER+1))) + 3.0 )
	    / t / 1000000.0;
    } else {
	mflops = 0.0;

    c_print_results("CG", cclass, NA, 0, 0, NITER, nthreads, t, 
		    mflops, "          floating point", 
		    CS1, CS2, CS3, CS4, CS5, CS6, CS7);