Exemple #1
0
Fichier : is.c Projet : 8l/insieme
/*
 *  Driver for the IS benchmark (the banner printed below identifies the
 *  NPB3.3-OMP build).
 *
 *  Steps, in the order they appear in the body: clear/start the timers, copy
 *  the class-specific verification tables, print the banner, generate the
 *  keys (create_seq + alloc_key_buff), run one untimed rank() pass, time
 *  MAX_ITERATIONS rank() passes on timer 0, run full_verify() and report
 *  through c_print_results().
 *
 *  Timers 1, 2 and 3 are only used when a file named "timer.flag" can be
 *  opened for reading in the current directory; they are reported at the end
 *  as "Initialization", "Sorting" and "Total execution".
 *
 *  argc and argv are not used.  Always returns 0.
 */
int main( int argc, char **argv )
{

    int             i, iteration, timer_on;

    double          timecounter;

    FILE            *fp;


/*  Initialize timers  */
    timer_on = 0;
    if ((fp = fopen("timer.flag", "r")) != NULL) {
        fclose(fp);
        timer_on = 1;
    }
    timer_clear( 0 );
    if (timer_on) {
        timer_clear( 1 );
        timer_clear( 2 );
        timer_clear( 3 );
    }

    if (timer_on) timer_start( 3 );


/*  Initialize the verification arrays if a valid class.  For any other
    class the arrays are left untouched.  */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
    {
        switch( CLASS )
        {
            case 'S':
                test_index_array[i] = S_test_index_array[i];
                test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                test_index_array[i] = A_test_index_array[i];
                test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                test_index_array[i] = W_test_index_array[i];
                test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                test_index_array[i] = B_test_index_array[i];
                test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                test_index_array[i] = C_test_index_array[i];
                test_rank_array[i]  = C_test_rank_array[i];
                break;
            case 'D':
                test_index_array[i] = D_test_index_array[i];
                test_rank_array[i]  = D_test_rank_array[i];
                break;
            default:
                break;
        }
    }



/*  Printout initial NPB info */
    printf
      ( "\n\n NAS Parallel Benchmarks (NPB3.3-OMP) - IS Benchmark\n\n" );
    printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS, CLASS );
    printf( " Iterations:  %d\n", MAX_ITERATIONS );
#ifdef _OPENMP
    printf( " Number of available threads:  %d\n", omp_get_max_threads() );
#endif
    printf( "\n" );

    if (timer_on) timer_start( 1 );

/*  Generate random number sequence and subsequent keys on all procs */
    create_seq( 314159265.00,                    /* Random number gen seed */
                1220703125.00 );                 /* Random number gen mult */

    alloc_key_buff();
    if (timer_on) timer_stop( 1 );


/*  Do one iteration for free (i.e., untimed) to guarantee initialization of
    all data and code pages and respective tables */
    rank( 1 );

/*  Start verification counter (reset after the untimed pass above) */
    passed_verification = 0;

    if( CLASS != 'S' ) printf( "\n   iteration\n" );

/*  Start timer  */
    timer_start( 0 );


/*  This is the main iteration */
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
    {
        if( CLASS != 'S' ) printf( "        %d\n", iteration );
        rank( iteration );
    }


/*  End of timing  */
    timer_stop( 0 );
    timecounter = timer_read( 0 );


/*  This tests that keys are in sequence: sorting of last ranked key seq
    occurs here, but is an untimed operation                             */
    if (timer_on) timer_start( 2 );
    full_verify();
    if (timer_on) timer_stop( 2 );

    if (timer_on) timer_stop( 3 );


/*  The final printout.  Any count other than 5*MAX_ITERATIONS + 1 is
    reported as a failed verification (0).  */
    if( passed_verification != 5*MAX_ITERATIONS + 1 )
        passed_verification = 0;
    c_print_results( "IS",
                     CLASS,
                     (int)(TOTAL_KEYS/64),
                     64,
                     0,
                     MAX_ITERATIONS,
                     timecounter,
                     /* Convert to double BEFORE multiplying so the product
                        cannot overflow the integer type of the operands.  */
                     ((double) MAX_ITERATIONS * (double) TOTAL_KEYS)
                                                  /timecounter/1000000.,
                     "keys ranked",
                     passed_verification,
                     NPBVERSION,
                     COMPILETIME,
                     CC,
                     CLINK,
                     C_LIB,
                     C_INC,
                     CFLAGS,
                     CLINKFLAGS );


/*  Print additional timers  */
    if (timer_on) {
       double t_total, t_percent;

       t_total = timer_read( 3 );
       printf("\nAdditional timers -\n");
       printf(" Total execution: %8.3f\n", t_total);
       /* Avoid dividing by zero in the percentages below. */
       if (t_total == 0.0) t_total = 1.0;
       timecounter = timer_read(1);
       t_percent = timecounter/t_total * 100.;
       printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent);
       timecounter = timer_read(0);
       t_percent = timecounter/t_total * 100.;
       printf(" Benchmarking   : %8.3f (%5.2f%%)\n", timecounter, t_percent);
       timecounter = timer_read(2);
       t_percent = timecounter/t_total * 100.;
       printf(" Sorting        : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    }

    return 0;
         /**************************/
}        /*  E N D  P R O G R A M  */
/*
 * Discard the rest of the current input line.  Reading stops after the first
 * '\n' or as soon as fgetc() reports EOF (end of file or read error), so a
 * file that lacks a trailing newline cannot hang the caller.
 */
static void mg_skip_to_eol(FILE *fp)
{
    int ch;

    do {
        ch = fgetc(fp);
    } while (ch != EOF && ch != '\n');
}

/*
 * Driver for the MG benchmark.
 *
 * Reads the problem definition from "mg.input" when that file can be opened
 * (otherwise the compiled-in *_DEFAULT values are used), derives the problem
 * class from the grid size and iteration count, allocates the u/v/r grids,
 * runs one untimed start-up cycle followed by `nit` timed mg3P/resid
 * iterations, compares the final residual norm with the reference value of
 * the class and reports through c_print_results().
 *
 * argc and argv are not used.  Returns 0 after a completed run and 1 when the
 * input file is malformed or a grid cannot be allocated.
 */
int main(int argc, char *argv[]) {

    int it;
    double t, tinit, mflops;
    int nthreads = 1;

/*-------------------------------------------------------------------------
c These arrays are allocated on the heap because they are quite large
c and probably shouldn't be allocated on the stack. They
c are always passed as subroutine args.
c------------------------------------------------------------------------*/

    double **u, *v, **r;
    double a[4], c[4];

    double rnm2, rnmu;
    double epsilon = 1.0e-8;
    int n1, n2, n3, nit;
    double verify_value;
    boolean verified;

    int i, l;
    int input_ok, alloc_ok;
    FILE *fp;

    timer_clear(T_BENCH);
    timer_clear(T_INIT);

    timer_start(T_INIT);

/*----------------------------------------------------------------------
c Read in input data
c---------------------------------------------------------------------*/

    printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version"
           " - MG Benchmark\n\n");

    fp = fopen("mg.input", "r");
    if (fp != NULL) {
        printf(" Reading from input file mg.input\n");

        /* lt, the grid sizes and nit are mandatory: every later step uses
         * them, so a failed conversion must not be ignored.  lt is also used
         * as an index and the level loops below run down to 1, hence lt >= 1.
         * NOTE(review): the upper bound of lt (size of nx/ny/nz) is not
         * visible here and is therefore not checked. */
        input_ok = (fscanf(fp, "%d", &lt) == 1) && (lt >= 1);
        if (input_ok) {
            mg_skip_to_eol(fp);
            input_ok = (fscanf(fp, "%d%d%d", &nx[lt], &ny[lt], &nz[lt]) == 3);
        }
        if (input_ok) {
            mg_skip_to_eol(fp);
            input_ok = (fscanf(fp, "%d", &nit) == 1);
        }
        if (input_ok) {
            mg_skip_to_eol(fp);
            /* Missing debug entries leave the remaining debug_vec slots
             * unchanged, as before. */
            for (i = 0; i <= 7; i++) {
                if (fscanf(fp, "%d", &debug_vec[i]) != 1) {
                    break;
                }
            }
        }
        fclose(fp);

        if (!input_ok) {
            printf(" Error: input file mg.input is malformed\n");
            return 1;
        }
    } else {
        printf(" No input file. Using compiled defaults\n");

        lt = LT_DEFAULT;
        nit = NIT_DEFAULT;
        nx[lt] = NX_DEFAULT;
        ny[lt] = NY_DEFAULT;
        nz[lt] = NZ_DEFAULT;

        for (i = 0; i <= 7; i++) {
            debug_vec[i] = DEBUG_DEFAULT;
        }
    }

    /* Derive the class from (grid size, iterations); 'U' means unknown. */
    if ( (nx[lt] != ny[lt]) || (nx[lt] != nz[lt]) ) {
        Class = 'U';
    } else if( nx[lt] == 32 && nit == 4 ) {
        Class = 'S';
    } else if( nx[lt] == 64 && nit == 40 ) {
        Class = 'W';
    } else if( nx[lt] == 256 && nit == 20 ) {
        Class = 'B';
    } else if( nx[lt] == 512 && nit == 20 ) {
        Class = 'C';
    } else if( nx[lt] == 256 && nit == 4 ) {
        Class = 'A';
    } else {
        Class = 'U';
    }

/*--------------------------------------------------------------------
c  Use these for debug info:
c---------------------------------------------------------------------
c     debug_vec(0) = 1 !=> report all norms
c     debug_vec(1) = 1 !=> some setup information
c     debug_vec(1) = 2 !=> more setup information
c     debug_vec(2) = k => at level k or below, show result of resid
c     debug_vec(3) = k => at level k or below, show result of psinv
c     debug_vec(4) = k => at level k or below, show result of rprj
c     debug_vec(5) = k => at level k or below, show result of interp
c     debug_vec(6) = 1 => (unused)
c     debug_vec(7) = 1 => (unused)
c-------------------------------------------------------------------*/

    a[0] = -8.0/3.0;
    a[1] =  0.0;
    a[2] =  1.0/6.0;
    a[3] =  1.0/12.0;

    if (Class == 'A' || Class == 'S' || Class =='W') {
/*--------------------------------------------------------------------
c     Coefficients for the S(a) smoother
c-------------------------------------------------------------------*/
        c[0] =  -3.0/8.0;
        c[1] =  1.0/32.0;
        c[2] =  -1.0/64.0;
        c[3] =   0.0;
    } else {
/*--------------------------------------------------------------------
c     Coefficients for the S(b) smoother
c-------------------------------------------------------------------*/
        c[0] =  -3.0/17.0;
        c[1] =  1.0/33.0;
        c[2] =  -1.0/61.0;
        c[3] =   0.0;
    }

    lb = 1;

    setup(&n1,&n2,&n3,lt);

    /* Allocate the data arrays
     * 3d arrays are flattened and allocated as a contiguous block
     * 4d arrays are allocated as separate 3d blocks
     *
     * The element counts are multiplied in size_t so the product cannot
     * overflow int, and every allocation is checked before use.
     */
    u = (double **)malloc((size_t)(lt+1)*sizeof(double *));
    v = (double *)malloc((size_t)m3[lt]*m2[lt]*m1[lt]*sizeof(double));
    r = (double **)malloc((size_t)(lt+1)*sizeof(double *));

    alloc_ok = (u != NULL && v != NULL && r != NULL);
    for (l = lt; alloc_ok && l >= 1; l--) {
        u[l] = (double *)malloc((size_t)m3[l]*m2[l]*m1[l]*sizeof(double));
        r[l] = (double *)malloc((size_t)m3[l]*m2[l]*m1[l]*sizeof(double));
        alloc_ok = (u[l] != NULL && r[l] != NULL);
    }
    if (!alloc_ok) {
        printf(" Error: unable to allocate the grid arrays\n");
        return 1;
    }

    // Array v can be treated using a standard OpenACC data region
#pragma acc data create(v[0:m3[lt]*m2[lt]*m1[lt]]) copyin(a[0:4],c[0:4])
    {

#ifdef _OPENACC
      //****************************************************************
      /* Now manually deep-create arrays u,r on the GPU using the Cray extended
       * runtime API, instead of using a data region
       */
      double **acc_u = (double **)cray_acc_create(u,(lt+1)*sizeof(double *));
      for (l=lt; l >=1; l--) {
        double *acc_ul = (double *)cray_acc_create(u[l],m3[l]*m2[l]*m1[l]*sizeof(double));
        SET_ACC_PTR(acc_u[l], acc_ul);
      }
      double **acc_r = (double **)cray_acc_create(r,(lt+1)*sizeof(double *));
      for (l=lt; l >=1; l--) {
        double *acc_rl = (double *)cray_acc_create(r[l],m3[l]*m2[l]*m1[l]*sizeof(double));
        SET_ACC_PTR(acc_r[l], acc_rl);
      }
    //****************************************************************
#endif /* _OPENACC */

#pragma omp parallel
{
    zero3(u[lt],n1,n2,n3);
}
    zran3(v,n1,n2,n3,nx[lt],ny[lt],lt);

#pragma omp parallel
{
    norm2u3(v,n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);

#pragma omp single
{
    printf(" Size: %3dx%3dx%3d (class %1c)\n",
           nx[lt], ny[lt], nz[lt], Class);
    printf(" Iterations: %3d\n", nit);
}

    resid(u[lt],v,r[lt],n1,n2,n3,a,lt);
    norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);

/*c---------------------------------------------------------------------
c     One iteration for startup
c---------------------------------------------------------------------*/
    mg3P(u,v,r,a,c,n1,n2,n3,lt);
    resid(u[lt],v,r[lt],n1,n2,n3,a,lt);

#pragma omp single
    setup(&n1,&n2,&n3,lt);

    zero3(u[lt],n1,n2,n3);
  } /* pragma omp parallel */

    zran3(v,n1,n2,n3,nx[lt],ny[lt],lt);

    timer_stop(T_INIT);
    timer_start(T_BENCH);

    /* Timed section: initial residual, nit multigrid cycles, final norm. */
#pragma omp parallel firstprivate(nit) private(it)
  {
    resid(u[lt],v,r[lt],n1,n2,n3,a,lt);
    norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);

    for ( it = 1; it <= nit; it++) {
        mg3P(u,v,r,a,c,n1,n2,n3,lt);
        resid(u[lt],v,r[lt],n1,n2,n3,a,lt);
    }
    norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]);

#if defined(_OPENMP)
#pragma omp master
    nthreads = omp_get_num_threads();
#endif
  } /* pragma omp parallel */

    timer_stop(T_BENCH);
    t = timer_read(T_BENCH);
    tinit = timer_read(T_INIT);

    verified = FALSE;
    verify_value = 0.0;

    printf(" Initialization time: %15.3f seconds\n", tinit);
    printf(" Benchmark completed\n");

    if (Class != 'U') {
        if (Class == 'S') {
            verify_value = 0.530770700573e-04;
        } else if (Class == 'W') {
            verify_value = 0.250391406439e-17;  /* 40 iterations */
            /* presumably 0.183103168997e-04 for 4 iterations; the original
             * note is run together - TODO confirm */
        } else if (Class == 'A') {
            verify_value = 0.2433365309e-5;
        } else if (Class == 'B') {
            verify_value = 0.180056440132e-5;
        } else if (Class == 'C') {
            verify_value = 0.570674826298e-06;
        }

        /* NOTE(review): epsilon (1e-8) is an ABSOLUTE tolerance while the
         * reference values above range from about 5e-5 down to 2.5e-18, so
         * for class W any norm below 1e-8 passes.  Left unchanged to keep
         * the reported verification result as it was - confirm whether a
         * relative error is intended. */
        if ( fabs( rnm2 - verify_value ) <= epsilon ) {
            verified = TRUE;
            printf(" VERIFICATION SUCCESSFUL\n");
            printf(" L2 Norm is %20.12e\n", rnm2);
            printf(" Error is   %20.12e\n", rnm2 - verify_value);
        } else {
            verified = FALSE;
            printf(" VERIFICATION FAILED\n");
            printf(" L2 Norm is             %20.12e\n", rnm2);
            printf(" The correct L2 Norm is %20.12e\n", verify_value);
        }
    } else {
        verified = FALSE;
        printf(" Problem size unknown\n");
        printf(" NO VERIFICATION PERFORMED\n");
    }

    if ( t != 0.0 ) {
        /* Grid point count computed in double so it cannot overflow int. */
        double nn = (double)nx[lt]*ny[lt]*nz[lt];
        mflops = 58.*nit*nn*1.0e-6 / t;
    } else {
        mflops = 0.0;
    }

    c_print_results("MG", Class, nx[lt], ny[lt], nz[lt],
                    nit, nthreads, t, mflops, "          floating point",
                    verified, NPBVERSION, COMPILETIME,
                    CS1, CS2, CS3, CS4, CS5, CS6, CS7);
// I should probably deep-free the manually deep-created accelerator data here
} //acc end data

    return 0;
}
Exemple #3
0
int main(int argc, char *argv[])
{
  int i;
  int iter;
  double total_time, mflops;
  logical verified;
  char Class;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  //---------------------------------------------------------------------
  // Run the entire problem once to make sure all data is touched. 
  // This reduces variable startup costs, which is important for such a 
  // short benchmark. The other NPB 2 implementations are similar. 
  //---------------------------------------------------------------------
  for (i = 1; i <= T_max; i++) {
    timer_clear(i);
  }
  setup();
  setup_opencl(argc, argv);
  init_ui(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]);
  compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]);
  compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]);
  fft_init(dims[0]);
  fft(1, &m_u1, &m_u0);

  //---------------------------------------------------------------------
  // Start over from the beginning. Note that all operations must
  // be timed, in contrast to other benchmarks. 
  //---------------------------------------------------------------------
  for (i = 1; i <= T_max; i++) {
    timer_clear(i);
  }

  timer_start(T_total);
  if (timers_enabled) timer_start(T_setup);

  DTIMER_START(T_compute_im);
  compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]);
  DTIMER_STOP(T_compute_im);

  DTIMER_START(T_compute_ics);
  compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]);
  DTIMER_STOP(T_compute_ics);

  DTIMER_START(T_fft_init);
  fft_init(dims[0]);
  DTIMER_STOP(T_fft_init);

  if (timers_enabled) timer_stop(T_setup);
  if (timers_enabled) timer_start(T_fft);
  fft(1, &m_u1, &m_u0);
  if (timers_enabled) timer_stop(T_fft);

  for (iter = 1; iter <= niter; iter++) {
    if (timers_enabled) timer_start(T_evolve);
    evolve(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]);
    if (timers_enabled) timer_stop(T_evolve);
    if (timers_enabled) timer_start(T_fft);
    fft(-1, &m_u1, &m_u1);
    if (timers_enabled) timer_stop(T_fft);
    if (timers_enabled) timer_start(T_checksum);
    checksum(iter, &m_u1, dims[0], dims[1], dims[2]);
    if (timers_enabled) timer_stop(T_checksum);
  }

  verify(NX, NY, NZ, niter, &verified, &Class);

  timer_stop(T_total);
  total_time = timer_read(T_total);

  if (total_time != 0.0) {
    mflops = 1.0e-6 * (double)NTOTAL *
            (14.8157 + 7.19641 * log((double)NTOTAL)
            + (5.23518 + 7.21113 * log((double)NTOTAL)) * niter)
            / total_time;
  } else {
    mflops = 0.0;
  }
  c_print_results("FT", Class, NX, NY, NZ, niter,
                  total_time, mflops, "          floating point", verified, 
                  NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7,
                  clu_GetDeviceTypeName(device_type),
                  device_name);
  if (timers_enabled) print_timers();

  release_opencl();

  fflush(stdout);

  return 0;
}
Exemple #4
0
/*
c   This is the serial version of the APP Benchmark 1,
c   the "embarrassingly parallel" benchmark.
c
c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
c   numbers.  MK is the Log_2 of the size of each batch of uniform random
c   numbers.  MK can be set for convenience on a given system, since it does
c   not affect the results.
*/
/*
 * Driver for the EP benchmark.
 *
 * Prints the banner, warms up the random-number and math routines, then
 * generates NN batches of 2*NK uniform random numbers, turns the accepted
 * pairs into Gaussian deviates, tallies them per annulus in q[] and sums
 * them in sx/sy.  The sums are compared with the reference values for
 * M = 24, 25, 28, 30 and 32 and the result is reported through
 * c_print_results().
 *
 * argc and argv are not used.  Always returns 0.
 */
int main(int argc, char **argv) {

    double Mops, t2, sx, sy, tm, an, gc;
    /* Given a defined value because its address is handed to vranlc()
     * before the first assignment (presumably vranlc reads the seed even
     * when asked for 0 numbers - confirm against its definition). */
    double t1 = A;
    double dum[3] = { 1.0, 1.0, 1.0 };
    int np, i, k, nit, k_offset, j;
    int nthreads = 1;
    boolean verified;
    char size[13+1];	/* character*13 */

/*
c   Because the size of the problem is too large to store in a 32-bit
c   integer for some classes, we put it into a string (for printing).
c   Any decimal point produced by the floating point conversion is
c   replaced by a blank.
*/

    printf("\n\n NAS Parallel Benchmarks 3.0 structured OpenMP C version"
	   " - EP Benchmark\n");
    snprintf(size, sizeof(size), "%12.0f", pow(2.0, M+1));
    /* Only inspect the characters that were actually written: the old loop
     * started at size[13], which the conversion above never initialises. */
    for (j = 0; size[j] != '\0'; j++) {
	if (size[j] == '.') size[j] = ' ';
    }
    printf(" Number of random numbers generated: %13s\n", size);

    verified = FALSE;

/*
c   Number of "batches" of random number pairs to generate.
*/
    np = NN;

/*
c   Call the random number generator functions and initialize
c   the x-array to reduce the effects of paging on the timings.
c   Also, call all mathematical functions that are used. Make
c   sure these initializations cannot be eliminated as dead code.
*/
    vranlc(0, &(dum[0]), dum[1], &(dum[2]));
    dum[0] = randlc(&(dum[1]), dum[2]);

#pragma omp parallel for default(shared) private(i)
    for (i = 0; i < 2*NK; i++) x[i] = -1.0e99;

    Mops = log(sqrt(fabs(max(1.0, 1.0))));

    timer_clear(1);
    timer_clear(2);
    timer_clear(3);
    timer_start(1);

    vranlc(0, &t1, A, x);

/*   Compute AN = A ^ (2 * NK) (mod 2^46). */

    t1 = A;

    for ( i = 1; i <= MK+1; i++) {
	t2 = randlc(&t1, t1);
    }

    an = t1;
    gc = 0.0;
    sx = 0.0;
    sy = 0.0;

    for ( i = 0; i <= NQ - 1; i++) {
	q[i] = 0.0;
    }

/*
c   Each instance of this loop may be performed independently. We compute
c   the k offsets separately to take into account the fact that some nodes
c   have more numbers to generate than others
*/
    k_offset = -1;

#pragma omp parallel copyin(x)
{
    /* Per-thread work variables; they shadow the outer t1/t2/i on purpose. */
    double t1, t2, t3, t4, x1, x2;
    int kk, i, ik, l;
    double qq[NQ];		/* private copy of q[0:NQ-1] */

    for (i = 0; i < NQ; i++) qq[i] = 0.0;

#pragma omp for reduction(+:sx,sy) schedule(static)
    for (k = 1; k <= np; k++) {
	kk = k_offset + k;
	t1 = S;
	t2 = an;

/*      Find starting seed t1 for this kk. */

	for (i = 1; i <= 100; i++) {
            ik = kk / 2;
            if (2 * ik != kk) t3 = randlc(&t1, t2);
            if (ik == 0) break;
            t3 = randlc(&t2, t2);
            kk = ik;
	}

/*      Compute uniform pseudorandom numbers. */

	if (TIMERS_ENABLED == TRUE) timer_start(3);
	/* NOTE(review): x-1 is passed, presumably because vranlc indexes its
	 * output from 1 - confirm against its definition. */
	vranlc(2*NK, &t1, A, x-1);
	if (TIMERS_ENABLED == TRUE) timer_stop(3);

/*
c       Compute Gaussian deviates by acceptance-rejection method and
c       tally counts in concentric square annuli.  This loop is not
c       vectorizable.
*/
	if (TIMERS_ENABLED == TRUE) timer_start(2);

	for ( i = 0; i < NK; i++) {
            x1 = 2.0 * x[2*i] - 1.0;
            x2 = 2.0 * x[2*i+1] - 1.0;
            t1 = pow2(x1) + pow2(x2);
            if (t1 <= 1.0) {
		t2 = sqrt(-2.0 * log(t1) / t1);
		t3 = (x1 * t2);				/* Xi */
		t4 = (x2 * t2);				/* Yi */
		l = max(fabs(t3), fabs(t4));
		qq[l] += 1.0;				/* counts */
		sx = sx + t3;				/* sum of Xi */
		sy = sy + t4;				/* sum of Yi */
            }
	}
	if (TIMERS_ENABLED == TRUE) timer_stop(2);
    }
    /* Merge this thread's private counts into the shared totals. */
#pragma omp critical
    {
      for (i = 0; i <= NQ - 1; i++) q[i] += qq[i];
    }
#if defined(_OPENMP)
#pragma omp master
    nthreads = omp_get_num_threads();
#endif /* _OPENMP */
} /* end of parallel region */

    for (i = 0; i <= NQ-1; i++) {
        gc = gc + q[i];
    }

    timer_stop(1);
    tm = timer_read(1);

    /* Relative error of both sums against the reference values per M. */
    nit = 0;
    if (M == 24) {
	if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) &&
	   (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 25) {
	if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 28) {
	if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 30) {
	if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 32) {
	if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    }

    Mops = pow(2.0, M+1)/tm/1000000.0;

    printf("EP Benchmark Results: \n"
	   "CPU Time = %10.4f\n"
	   "N = 2^%5d\n"
	   "No. Gaussian Pairs = %15.0f\n"
	   "Sums = %25.15e %25.15e\n"
	   "Counts:\n",
	   tm, M, gc, sx, sy);
    for (i = 0; i  <= NQ-1; i++) {
	printf("%3d %15.0f\n", i, q[i]);
    }

    c_print_results("EP", CLASS, M+1, 0, 0, nit, nthreads,
		  tm, Mops,
		  "Random numbers generated",
		  verified, NPBVERSION, COMPILETIME,
		  CS1, CS2, CS3, CS4, CS5, CS6, CS7);

    if (TIMERS_ENABLED == TRUE) {
	/* Each value on its own line (the newlines were missing). */
	printf("Total time:     %f\n", timer_read(1));
	printf("Gaussian pairs: %f\n", timer_read(2));
	printf("Random numbers: %f\n", timer_read(3));
    }

    return 0;
}
Exemple #5
0
/*
 *  Driver for an MPI variant of the IS benchmark in which every process
 *  generates one slice of the key sequence and rank 0 gathers the slices,
 *  then ranks, verifies and reports on its own.
 *
 *  Steps: MPI start-up; on rank 0 timer set-up, verification tables and
 *  banner; create_seq(myrank) on every rank; gather of the slices into
 *  key_array on rank 0; on rank 0 one untimed rank() pass, MAX_ITERATIONS
 *  timed passes, full_verify() and the final report; MPI shutdown.
 *
 *  Timers 1-3 are only used when a file named "timer.flag" can be opened for
 *  reading.  Always returns 0.
 */
int main( int argc, char **argv )
{
    MPI_Init(&argc,&argv);

    INT_TYPE chunk;
    int ini, fim;
    int             i, j, iteration;
    int             timer_on = 0;   /* only consulted on rank 0 */
    double          timecounter;
    FILE            *fp;
    int myrank;
    MPI_Status      st;

    MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
    MPI_Comm_size(MPI_COMM_WORLD,&NUM_THREADS);

    if (myrank == 0) {
/*  Initialize timers  */
        if ((fp = fopen("timer.flag", "r")) != NULL) {
            fclose(fp);
            timer_on = 1;
        }
        timer_clear( 0 );
        if (timer_on) {
            timer_clear( 1 );
            timer_clear( 2 );
            timer_clear( 3 );
        }

        if (timer_on) timer_start( 3 );


/*  Initialize the verification arrays if a valid class.  For any other
    class the arrays are left untouched.  */
        for( i=0; i<TEST_ARRAY_SIZE; i++ )
        {
            switch( CLASS )
            {
                case 'S':
                    test_index_array[i] = S_test_index_array[i];
                    test_rank_array[i]  = S_test_rank_array[i];
                    break;
                case 'A':
                    test_index_array[i] = A_test_index_array[i];
                    test_rank_array[i]  = A_test_rank_array[i];
                    break;
                case 'W':
                    test_index_array[i] = W_test_index_array[i];
                    test_rank_array[i]  = W_test_rank_array[i];
                    break;
                case 'B':
                    test_index_array[i] = B_test_index_array[i];
                    test_rank_array[i]  = B_test_rank_array[i];
                    break;
                case 'C':
                    test_index_array[i] = C_test_index_array[i];
                    test_rank_array[i]  = C_test_rank_array[i];
                    break;
                case 'D':
                    test_index_array[i] = D_test_index_array[i];
                    test_rank_array[i]  = D_test_rank_array[i];
                    break;
                default:
                    break;
            }
        }



/*  Printout initial NPB info */
        printf
          ( "\n\n NAS Parallel Benchmarks (NPB3.3-SER) - IS Benchmark\n\n" );
        printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS, CLASS );
        printf( " Number of available threads:  %d\n", NUM_THREADS );
        printf( " Iterations:   %d\n", MAX_ITERATIONS );

        if (timer_on) timer_start( 1 );
    }

    R23 = pow(2, -23);
    T23 = pow(2,  23);
    R46 = pow(2, -46);
    T46 = pow(2,  46);

/*  Generate random number sequence and subsequent keys on all procs */
    create_seq(myrank);

/*  Gather the per-process slices on rank 0.  Each process owns the keys
    [chunk*rank, chunk*(rank+1)), clipped to NUM_KEYS.  Both ends are clipped
    so that the element count can never become negative when there are more
    processes than chunks.
    NOTE(review): assumes create_seq() partitions the keys the same way -
    confirm against its definition.  */
    chunk = (NUM_KEYS + NUM_THREADS - 1) / NUM_THREADS;

    if (myrank == 0) {
        /* collect the results of the other ranks */
        for (i = 1; i < NUM_THREADS; i++) {
            ini = chunk * i;
            if ( ini > NUM_KEYS ) {
                ini = NUM_KEYS;
            }
            fim = ini + chunk;
            if ( fim > NUM_KEYS ) {
                fim = NUM_KEYS;
            }
            MPI_Recv( &aux_key_array[ini], (fim - ini), MPI_INT, i, 0, MPI_COMM_WORLD, &st );
            for (j = ini; j < fim; j++) {
                key_array[j] = aux_key_array[j];
            }
        }
    } else {
        ini = chunk * myrank;
        if ( ini > NUM_KEYS ) {
            ini = NUM_KEYS;
        }
        fim = ini + chunk;
        if ( fim > NUM_KEYS ) {
            fim = NUM_KEYS;
        }
        /* send this rank's slice to rank 0 */
        MPI_Send( &key_array[ini], (fim - ini), MPI_INT, 0, 0, MPI_COMM_WORLD );
    }


    if (myrank == 0) {
        if (timer_on) {
            timer_stop( 1 );
        }


/*  Do one iteration for free (i.e., untimed) to guarantee initialization of
    all data and code pages and respective tables */
        rank( 1 );

/*  Start verification counter */
        passed_verification = 0;

        if( CLASS != 'S' ) printf( "\n   iteration\n" );

/*  Start timer  */
        timer_start( 0 );


/*  This is the main iteration */
        for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
        {
            if( CLASS != 'S' ) printf( "        %d\n", iteration );
            rank( iteration );
        }


/*  End of timing  */
        timer_stop( 0 );
        timecounter = timer_read( 0 );


/*  This tests that keys are in sequence: sorting of last ranked key seq
    occurs here, but is an untimed operation                             */
        if (timer_on) timer_start( 2 );
        full_verify();
        if (timer_on) timer_stop( 2 );

        if (timer_on) timer_stop( 3 );


/*  The final printout.  Any count other than 5*MAX_ITERATIONS + 1 is
    reported as a failed verification (0).  */
        if( passed_verification != 5*MAX_ITERATIONS + 1 )
            passed_verification = 0;
        c_print_results( "IS",
                         CLASS,
                         (int)(TOTAL_KEYS/64),
                         64,
                         0,
                         MAX_ITERATIONS,
                         timecounter,
                         /* Convert to double BEFORE multiplying so the
                            product cannot overflow the integer type.  */
                         ((double) MAX_ITERATIONS * (double) TOTAL_KEYS)
                                                  /timecounter/1000000.,
                         "keys ranked",
                         passed_verification,
                         NPBVERSION,
                         COMPILETIME,
                         CC,
                         CLINK,
                         C_LIB,
                         C_INC,
                         CFLAGS,
                         CLINKFLAGS );


/*  Print additional timers  */
        if (timer_on) {
           double t_total, t_percent;

           t_total = timer_read( 3 );
           printf("\nAdditional timers -\n");
           printf(" Total execution: %8.3f\n", t_total);
           /* Avoid dividing by zero in the percentages below. */
           if (t_total == 0.0) t_total = 1.0;
           timecounter = timer_read(1);
           t_percent = timecounter/t_total * 100.;
           printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent);
           timecounter = timer_read(0);
           t_percent = timecounter/t_total * 100.;
           printf(" Benchmarking   : %8.3f (%5.2f%%)\n", timecounter, t_percent);
           timecounter = timer_read(2);
           t_percent = timecounter/t_total * 100.;
           printf(" Sorting        : %8.3f (%5.2f%%)\n", timecounter, t_percent);
        }
    }

    MPI_Finalize();
    return 0;
         /**************************/
}        /*  E N D  P R O G R A M  */
Exemple #6
0
int main( int argc, char **argv )
{

    int             i, iteration, itemp;

    double          timecounter, maxtime;


    /*  Initialize MPI */
    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
    MPI_Comm_size( MPI_COMM_WORLD, &comm_size );


    /*  Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( CLASS )
        {
        case 'S':
            test_index_array[i] = S_test_index_array[i];
            test_rank_array[i]  = S_test_rank_array[i];
            break;
        case 'A':
            test_index_array[i] = A_test_index_array[i];
            test_rank_array[i]  = A_test_rank_array[i];
            break;
        case 'W':
            test_index_array[i] = W_test_index_array[i];
            test_rank_array[i]  = W_test_rank_array[i];
            break;
        case 'B':
            test_index_array[i] = B_test_index_array[i];
            test_rank_array[i]  = B_test_rank_array[i];
            break;
        case 'C':
            test_index_array[i] = C_test_index_array[i];
            test_rank_array[i]  = C_test_rank_array[i];
            break;
        case 'D':
            test_index_array[i] = D_test_index_array[i];
            test_rank_array[i]  = D_test_rank_array[i];
            break;
        };



    /*  Printout initial NPB info */
    if( my_rank == 0 )
    {
        FILE *fp;
        printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
        printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
        printf( " Iterations:   %d\n", MAX_ITERATIONS );
        printf( " Number of processes:     %d\n", comm_size );

        fp = fopen("timer.flag", "r");
        timeron = 0;
        if (fp) {
            timeron = 1;
            fclose(fp);
        }
    }

    /*  Check that actual and compiled number of processors agree */
    if( comm_size != NUM_PROCS )
    {
        if( my_rank == 0 )
            printf( "\n ERROR: compiled for %d processes\n"
                    " Number of active processes: %d\n"
                    " Exiting program!\n\n", NUM_PROCS, comm_size );
        MPI_Finalize();
        exit( 1 );
    }

    /*  Check to see whether total number of processes is within bounds.
        This could in principle be checked in setparams.c, but it is more
        convenient to do it here                                               */
    if( comm_size < MIN_PROCS || comm_size > MAX_PROCS)
    {
        if( my_rank == 0 )
            printf( "\n ERROR: number of processes %d not within range %d-%d"
                    "\n Exiting program!\n\n", comm_size, MIN_PROCS, MAX_PROCS);
        MPI_Finalize();
        exit( 1 );
    }

    MPI_Bcast(&timeron, 1, MPI_INT, 0, MPI_COMM_WORLD);

#ifdef  TIMING_ENABLED
    for( i=1; i<=T_LAST; i++ ) timer_clear( i );
#endif

    /*  Generate random number sequence and subsequent keys on all procs */
    create_seq( find_my_seed( my_rank,
                              comm_size,
                              4*(long)TOTAL_KEYS*MIN_PROCS,
                              314159265.00,      /* Random number gen seed */
                              1220703125.00 ),   /* Random number gen mult */
                1220703125.00 );                 /* Random number gen mult */


    /*  Do one interation for free (i.e., untimed) to guarantee initialization of
        all data and code pages and respective tables */
    rank( 1 );

    /*  Start verification counter */
    passed_verification = 0;

    if( my_rank == 0 && CLASS != 'S' ) printf( "\n   iteration\n" );

    /*  Initialize timer  */
    timer_clear( 0 );

    /*  Initialize separate communication, computation timing */
#ifdef  TIMING_ENABLED
    for( i=1; i<=T_LAST; i++ ) timer_clear( i );
#endif

    /*  Start timer  */
    timer_start( 0 );


    /*  This is the main iteration */
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
    {
        if( my_rank == 0 && CLASS != 'S' ) printf( "        %d\n", iteration );
        rank( iteration );
    }


    /*  Stop timer, obtain time for processors */
    timer_stop( 0 );

    timecounter = timer_read( 0 );

    /*  End of timing, obtain maximum time of all processors */
    MPI_Reduce( &timecounter,
                &maxtime,
                1,
                MPI_DOUBLE,
                MPI_MAX,
                0,
                MPI_COMM_WORLD );


    /*  This tests that keys are in sequence: sorting of last ranked key seq
        occurs here, but is an untimed operation                             */
    full_verify();


    /*  Obtain verification counter sum */
    itemp = passed_verification;
    MPI_Reduce( &itemp,
                &passed_verification,
                1,
                MPI_INT,
                MPI_SUM,
                0,
                MPI_COMM_WORLD );



    /*  The final printout  */
    if( my_rank == 0 )
    {
        if( passed_verification != 5*MAX_ITERATIONS + comm_size )
            passed_verification = 0;
        c_print_results( "IS",
                         CLASS,
                         (int)(TOTAL_KEYS),
                         MIN_PROCS,
                         0,
                         MAX_ITERATIONS,
                         NUM_PROCS,
                         comm_size,
                         maxtime,
                         ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
                         /maxtime/1000000.,
                         "keys ranked",
                         passed_verification,
                         NPBVERSION,
                         COMPILETIME,
                         MPICC,
                         CLINK,
                         CMPI_LIB,
                         CMPI_INC,
                         CFLAGS,
                         CLINKFLAGS );
    }


#ifdef  TIMING_ENABLED
    if (timeron)
    {
        double    t1[T_LAST+1], tmin[T_LAST+1], tsum[T_LAST+1], tmax[T_LAST+1];
        char      t_recs[T_LAST+1][9];

        for( i=0; i<=T_LAST; i++ )
            t1[i] = timer_read( i );

        MPI_Reduce( t1,
                    tmin,
                    T_LAST+1,
                    MPI_DOUBLE,
                    MPI_MIN,
                    0,
                    MPI_COMM_WORLD );
        MPI_Reduce( t1,
                    tsum,
                    T_LAST+1,
                    MPI_DOUBLE,
                    MPI_SUM,
                    0,
                    MPI_COMM_WORLD );
        MPI_Reduce( t1,
                    tmax,
                    T_LAST+1,
                    MPI_DOUBLE,
                    MPI_MAX,
                    0,
                    MPI_COMM_WORLD );

        if( my_rank == 0 )
        {
            strcpy( t_recs[T_TOTAL],  "total" );
            strcpy( t_recs[T_RANK],   "rcomp" );
            strcpy( t_recs[T_RCOMM],  "rcomm" );
            strcpy( t_recs[T_VERIFY], "verify");
            printf( " nprocs = %6d     ", comm_size);
            printf( "     minimum     maximum     average\n" );
            for( i=0; i<=T_LAST; i++ )
            {
                printf( " timer %2d (%-8s):  %10.4f  %10.4f  %10.4f\n",
                        i+1, t_recs[i], tmin[i], tmax[i],
                        tsum[i]/((double) comm_size) );
            }
            printf( "\n" );
        }
    }
#endif

    MPI_Finalize();


    return 0;
    /**************************/
}        /*  E N D  P R O G R A M  */
/*
 * Driver of the OpenMP C version of the NPB 2.3 IS (integer sort) benchmark.
 *
 * Steps, in order: copy the verification tables selected by the compile-time
 * CLASS, generate the key sequence, run one untimed rank() call, time
 * MAX_ITERATIONS rank() calls, run full_verify() and pass the figures to
 * c_print_results().
 *
 * argc and argv are not used.  Always returns 0.
 */
int main(int argc, char** argv )
{

    int             i, iteration;
    int             nthreads = 1;   /* stays 1 when built without OpenMP */
    double          timecounter;


/*  Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
    {
        switch( CLASS )
        {
            case 'S':
                test_index_array[i] = S_test_index_array[i];
                test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                test_index_array[i] = A_test_index_array[i];
                test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                test_index_array[i] = W_test_index_array[i];
                test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                test_index_array[i] = B_test_index_array[i];
                test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                test_index_array[i] = C_test_index_array[i];
                test_rank_array[i]  = C_test_rank_array[i];
                break;
            default:
                /* any other class: the verification arrays are not filled */
                break;
        }
    }


/*  Printout initial NPB info */
    printf( "\n\n NAS Parallel Benchmarks 2.3 OpenMP C version"
            " - IS Benchmark\n\n" );
    printf( " Size:  %d  (class %c)\n", TOTAL_KEYS, CLASS );
    printf( " Iterations:   %d\n", MAX_ITERATIONS );

/*  Initialize timer  */
    timer_clear( 0 );

/*  Generate random number sequence and subsequent keys on all procs */
    create_seq( 314159265.00,                    /* Random number gen seed */
                1220703125.00 );                 /* Random number gen mult */


/*  Do one iteration for free (i.e., untimed) to guarantee initialization of
    all data and code pages and respective tables */
#pragma omp parallel
    rank( 1 );

/*  Start verification counter */
    passed_verification = 0;

    if( CLASS != 'S' ) printf( "\n   iteration\n" );

/*  Start timer  */
    timer_start( 0 );


/*  This is the main iteration.  The region is "parallel", not "parallel for":
    every thread walks the whole loop with its own private counter and calls
    rank() for each iteration. */
#pragma omp parallel private(iteration)
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
    {
#pragma omp master
        if( CLASS != 'S' ) printf( "        %d\n", iteration );

        rank( iteration );

#if defined(_OPENMP)
        /* only the master thread records the team size for the report */
#pragma omp master
        nthreads = omp_get_num_threads();
#endif /* _OPENMP */
    }

/*  End of timing */
    timer_stop( 0 );
    timecounter = timer_read( 0 );


/*  This tests that keys are in sequence: sorting of last ranked key seq
    occurs here, but is an untimed operation                             */
    full_verify();


/*  The final printout  */
    if( passed_verification != 5*MAX_ITERATIONS + 1 )
        passed_verification = 0;
    c_print_results( "IS",
                     CLASS,
                     TOTAL_KEYS,
                     0,
                     0,
                     MAX_ITERATIONS,
                     nthreads,
                     timecounter,
                     /* promote to double BEFORE multiplying so the product
                        cannot overflow integer arithmetic */
                     ((double) (MAX_ITERATIONS)*TOTAL_KEYS)
                                                  /timecounter/1000000.,
                     "keys ranked",
                     passed_verification,
                     NPBVERSION,
                     COMPILETIME,
                     CC,
                     CLINK,
                     C_LIB,
                     C_INC,
                     CFLAGS,
                     CLINKFLAGS,
                     "randlc2");


  return 0;
         /**************************/
}        /*  E N D  P R O G R A M  */
Exemple #8
0
/*
 * Driver of the NPB 3.3 DT benchmark (MPI).
 *
 * argv[1] selects the communication graph: "BH", "WH" or "SH".  The graph is
 * built for the compile-time CLASS, processed with ProcessNodes(), and rank 0
 * prints the report through c_print_results().
 *
 * Returns 0 after a completed run.  The usage and "too few processes" paths
 * call MPI_Finalize() and then exit(0), as before.
 */
int main(int argc,char **argv ){
  int my_rank,comm_size;
  int i;
  DGraph *dg=NULL;
  int verified=0, featnum=0;
  double bytes_sent=0.0,tot_time=0.0;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
    MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
    TRACE_smpi_set_category ("begin");

    /* exactly one argument is required and it must start with BH, WH or SH */
    if(argc!=2||
                (  strncmp(argv[1],"BH",2)!=0
                 &&strncmp(argv[1],"WH",2)!=0
                 &&strncmp(argv[1],"SH",2)!=0
                )
      ){
      if(my_rank==0){
        fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
        fprintf(stderr,"** Where \n   - N is integer number of MPI processes\n");
        fprintf(stderr,"   - S is the class S, W, or A \n");
        fprintf(stderr,"   - GraphName is the communication graph name BH, WH, or SH.\n");
        fprintf(stderr,"   - the number of MPI processes N should not be be less than \n");
        fprintf(stderr,"     the number of nodes in the graph\n");
      }
      MPI_Finalize();
      exit(0);
    }

    if(strncmp(argv[1],"BH",2)==0){
      dg=buildBH(CLASS);
    }else if(strncmp(argv[1],"WH",2)==0){
      dg=buildWH(CLASS);
    }else if(strncmp(argv[1],"SH",2)==0){
      dg=buildSH(CLASS);
    }

    /* switch node timing off when numNodes+1 exceeds the timers available */
    if(timer_on&&dg->numNodes+1>timers_tot){
      timer_on=0;
      if(my_rank==0)
        fprintf(stderr,"Not enough timers. Node timeing is off. \n");
    }
    if(dg->numNodes>comm_size){
      if(my_rank==0){
        fprintf(stderr,"**  The number of MPI processes should not be less than \n");
        fprintf(stderr,"**  the number of nodes in the graph\n");
        fprintf(stderr,"**  Number of MPI processes = %d\n",comm_size);
        fprintf(stderr,"**  Number nodes in the graph = %d\n",dg->numNodes);
      }
      MPI_Finalize();
      exit(0);
    }

    /* node i gets address i */
    for(i=0;i<dg->numNodes;i++){
      dg->node[i]->address=i;
    }
    if( my_rank == 0 ){
      printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" );
      graphShow(dg,0);
      timer_clear(0);
      timer_start(0);
    }

    verified=ProcessNodes(dg,my_rank);
    TRACE_smpi_set_category ("end");

    featnum=NUM_SAMPLES*fielddim;
    /* promote before multiplying so the product is formed in double rather
       than in int, then scale by 2^20 */
    bytes_sent=(double)featnum*dg->numArcs;
    bytes_sent/=1048576;
    if(my_rank==0){
      timer_stop(0);
      tot_time=timer_read(0);
      c_print_results( dg->name,
                 CLASS,
                 featnum,
                 0,
                 0,
                 dg->numNodes,
                 0,
                 comm_size,
                 tot_time,
                 bytes_sent/tot_time,
                 "bytes transmitted",
                 verified,
                 NPBVERSION,
                 COMPILETIME,
                 MPICC,
                 CLINK,
                 CMPI_LIB,
                 CMPI_INC,
                 CFLAGS,
                 CLINKFLAGS );
    }
    MPI_Finalize();
  /* a completed run reports success, like the other drivers in this file */
  return 0;
}
Exemple #9
0
int main(int argc, char *argv[])
{
  char Class;
  logical verified;
  double mflops;

  double t, tmax, trecs[t_last+1];
  int i;
  char *t_names[t_last+1];

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  //---------------------------------------------------------------------
  // Setup info for timers
  //---------------------------------------------------------------------
  FILE *fp;
  if ((fp = fopen("timer.flag", "r")) != NULL) {
    timeron = true;
    t_names[t_total] = "total";
    t_names[t_rhsx] = "rhsx";
    t_names[t_rhsy] = "rhsy";
    t_names[t_rhsz] = "rhsz";
    t_names[t_rhs] = "rhs";
    t_names[t_jacld] = "jacld";
    t_names[t_blts] = "blts";
    t_names[t_jacu] = "jacu";
    t_names[t_buts] = "buts";
    t_names[t_add] = "add";
    t_names[t_l2norm] = "l2norm";

    t_names[t_setbv] = "setbv";
    t_names[t_setiv] = "setiv";
    t_names[t_erhs] = "erhs";
    t_names[t_error] = "error";
    t_names[t_pintgr] = "pintgr";
    t_names[t_blts1] = "blts1";
    t_names[t_buts1] = "buts1";
    fclose(fp);
  } else {
    timeron = false;
  }

  //---------------------------------------------------------------------
  // read input data
  //---------------------------------------------------------------------
  read_input();

  //---------------------------------------------------------------------
  // set up domain sizes
  //---------------------------------------------------------------------
  domain();

  //---------------------------------------------------------------------
  // set up OpenCL environment
  //---------------------------------------------------------------------
  setup_opencl(argc, argv);

  //---------------------------------------------------------------------
  // set up coefficients
  //---------------------------------------------------------------------
  setcoeff();

  //---------------------------------------------------------------------
  // set the boundary values for dependent variables
  //---------------------------------------------------------------------
  setbv();

  //---------------------------------------------------------------------
  // set the initial values for dependent variables
  //---------------------------------------------------------------------
  setiv();

  //---------------------------------------------------------------------
  // compute the forcing term based on prescribed exact solution
  //---------------------------------------------------------------------
  erhs();

  //---------------------------------------------------------------------
  // perform one SSOR iteration to touch all data pages
  //---------------------------------------------------------------------
  ssor(1);

  //---------------------------------------------------------------------
  // reset the boundary and initial values
  //---------------------------------------------------------------------
  setbv();
  setiv();

  //---------------------------------------------------------------------
  // perform the SSOR iterations
  //---------------------------------------------------------------------
  ssor(itmax);

  //---------------------------------------------------------------------
  // compute the solution error
  //---------------------------------------------------------------------
  error();

  //---------------------------------------------------------------------
  // compute the surface integral
  //---------------------------------------------------------------------
  pintgr();

  //---------------------------------------------------------------------
  // verification test
  //---------------------------------------------------------------------
  verify ( rsdnm, errnm, frc, &Class, &verified );
  mflops = (double)itmax * (1984.77 * (double)nx0
      * (double)ny0
      * (double)nz0
      - 10923.3 * pow(((double)(nx0+ny0+nz0)/3.0), 2.0) 
      + 27770.9 * (double)(nx0+ny0+nz0)/3.0
      - 144010.0)
    / (maxtime*1000000.0);

  c_print_results("LU", Class, nx0,
                  ny0, nz0, itmax,
                  maxtime, mflops, "          floating point", verified, 
                  NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, 
                  "(none)",
                  clu_GetDeviceTypeName(device_type),
                  device_name);

  //---------------------------------------------------------------------
  // More timers
  //---------------------------------------------------------------------
  if (timeron) {
    for (i = 1; i <= t_last; i++) {
      trecs[i] = timer_read(i);
    }
    tmax = maxtime;
    if (tmax == 0.0) tmax = 1.0;

    printf("  SECTION     Time (secs)\n");
    for (i = 1; i <= t_last; i++) {
      printf("  %-8s:%9.4f  (%6.2f%%)\n",
          t_names[i], trecs[i], trecs[i]*100./tmax);
      if (i == t_rhs) {
        t = trecs[t_rhsx] + trecs[t_rhsy] + trecs[t_rhsz];
        printf("     --> %8s:%9.3f  (%6.2f%%)\n", "sub-rhs", t, t*100./tmax);
        t = trecs[i] - t;
        printf("     --> %8s:%9.3f  (%6.2f%%)\n", "rest-rhs", t, t*100./tmax);
      }
    }
  }

  release_opencl();

  fflush(stdout);

  return 0;
}
Exemple #10
0
/*
c   This is the serial version of the APP Benchmark 1,
c   the "embarrassingly parallel" benchmark.
c
c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
c   numbers.  MK is the Log_2 of the size of each batch of uniform random
c   numbers.  MK can be set for convenience on a given system, since it does
c   not affect the results.
*/
/*
 * Driver of the OpenACC C version of the NPB 2.3 EP benchmark.
 *
 * Allocates the work arrays, generates the random numbers and tallies the
 * Gaussian deviates inside one "acc data" region, compares the sums sx/sy
 * with the reference values for the compile-time M, and prints the report.
 *
 * argc and argv are not used.  Returns 0; exits with status 1 when the
 * working memory cannot be allocated.
 */
int main(int argc, char **argv) {

    double *x, **xx, *q, **qq;

    double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc;
    double dum[3] = { 1.0, 1.0, 1.0 };
    const int TRANSFER_X = 1;
    int np, nn, ierr, node, no_nodes, i, l, k, nit, ierrcode,
    no_large_nodes, np_add, k_offset, j;
    double loc_x,loc_t1,loc_t2,loc_t3,loc_t4;
    double loc_a1,loc_a2,loc_x1,loc_x2,loc_z;
    boolean verified;
    /* character*13; zero-filled so that the scan for '.' below never reads
       a byte that sprintf() did not write */
    char size[13+1] = { 0 };

/*     Allocate working memory       */

    x = (double*) malloc(sizeof(double) * 2*NK);
    xx = (double**) malloc(sizeof(double*) * NN);
    q = (double*) malloc(sizeof(double) * NQ);
    qq = (double**) malloc(sizeof(double*) * NN);
    if (x == NULL || xx == NULL || q == NULL || qq == NULL) {
	fprintf(stderr, " EP: unable to allocate working memory\n");
	exit(1);
    }

    /* xx and qq are row tables pointing into one contiguous slab each */
    xx[0] = (double*) malloc(sizeof(double) * NN * 2*NK);
    qq[0] = (double*) malloc(sizeof(double) * NN * NQ);
    if (xx[0] == NULL || qq[0] == NULL) {
	fprintf(stderr, " EP: unable to allocate working memory\n");
	exit(1);
    }
    for (i = 1; i < NN; i++) xx[i] = xx[i-1] + (2*NK);
    for (i = 1; i < NN; i++) qq[i] = qq[i-1] + NQ;

/*
c   Because the size of the problem is too large to store in a 32-bit
c   integer for some classes, we put it into a string (for printing).
c   Have to strip off the decimal point put in there by the floating
c   point print statement (internal file)
*/

    printf("\n\n NAS Parallel Benchmarks 2.3 OpenACC C version"
	   " - EP Benchmark\n");
    sprintf(size, "%12.0f", pow(2.0, M+1));
    for (j = 13; j >= 1; j--) {
	if (size[j] == '.') size[j] = ' ';
    }
    printf(" Number of random numbers generated: %13s\n", size);

    verified = FALSE;

/*
c   Compute the number of "batches" of random number pairs generated 
c   per processor. Adjust if the number of processors does not evenly 
c   divide the total number
*/
    np = NN;

/*
c   Call the random number generator functions and initialize
c   the x-array to reduce the effects of paging on the timings.
c   Also, call all mathematical functions that are used. Make
c   sure these initializations cannot be eliminated as dead code.
*/
#pragma acc data create(qq[0:NN][0:NQ],x[0:2*NK],xx[0:NN][0:2*NK]) \
    copyout(q[0:NQ])
{
    vranlc(0, &(dum[0]), dum[1], &(dum[2]));
    dum[0] = randlc(&(dum[1]), dum[2]);
    for (i = 0; i < 2*NK; i++) x[i] = -1.0e99;
    Mops = log(sqrt(fabs(max(1.0, 1.0))));

    timer_clear(1);
    timer_clear(2);
    timer_clear(3);
    timer_start(1);

    vranlc(0, &t1, A, x);
    #pragma acc update device(x[0:2*NK])

/*   Compute AN = A ^ (2 * NK) (mod 2^46). */

    t1 = A;

    for ( i = 1; i <= MK+1; i++) {
      t2 = randlc(&t1, t1);
    }

    an = t1;
    tt = S;
    gc = 0.0;
    sx = 0.0;
    sy = 0.0;
    
    #pragma acc parallel loop
    for (k = 0; k < np; k++) {
      /* Initialize private q (qq) */
      #pragma acc loop
      for (i = 0; i < NQ; i++)
          qq[k][i] = 0.0;
      /* Initialize private x (xx)  */
      #pragma acc loop
      for (i = 0; i < 2*NK; i++)
          xx[k][i] = x[i];
    }
      
/*
c   Each instance of this loop may be performed independently. We compute
c   the k offsets separately to take into account the fact that some nodes
c   have more numbers to generate than others
*/
    k_offset = -1;

    /* From here to the end of the data region these declarations shadow the
       function-level t1..t4, x1, x2, i and l. */
    double t1, t2, t3, t4, x1, x2;
    int kk, i, ik, l;
    double psx, psy;

    #pragma acc parallel loop reduction(+:sx,sy)
    for (k = 1; k <= np; k++) {
      kk = k_offset + k;
      t1 = S;
      t2 = an;

/*      Find starting seed t1 for this kk. */

      #pragma acc loop seq
      for (i = 1; i <= 100; i++) {
          ik = kk / 2;
          if (2 * ik != kk) t3 = RANDLC(&t1, t2);
          if (ik == 0) break;
          t3 = RANDLC(&t2, t2);
          kk = ik;
      }

/*      Compute uniform pseudorandom numbers. */

      loc_t1 = r23 * A;
      loc_a1 = (int)loc_t1;
      loc_a2 = A - t23 * loc_a1;
      loc_x = t1;

      #pragma acc loop seq
      for (i = 1; i <= 2*NK; i++) {
          loc_t1 = r23 * loc_x;
          loc_x1 = (int)loc_t1;
          loc_x2 = loc_x - t23 * loc_x1;
          loc_t1 = loc_a1 * loc_x2 + loc_a2 * loc_x1;
          loc_t2 = (int)(r23 * loc_t1);
          loc_z = loc_t1 - t23 * loc_t2;
          loc_t3 = t23 * loc_z + loc_a2 * loc_x2;
          loc_t4 = (int)(r46 * loc_t3);
          loc_x = loc_t3 - t46 * loc_t4;
          xx[k-1][i-1] = r46 * loc_x;
      }
      t1 = loc_x;

/*
c       Compute Gaussian deviates by acceptance-rejection method and 
c       tally counts in concentric square annuli.  This loop is not 
c       vectorizable.
*/
 
      psx = psy = 0.0;

      #pragma acc loop reduction(+:psx,psy)
      for ( i = 0; i < NK; i++) {
          x1 = 2.0 * xx[k-1][2*i] - 1.0;
          x2 = 2.0 * xx[k-1][2*i+1] - 1.0;
          t1 = pow2(x1) + pow2(x2);
          if (t1 <= 1.0) {
            t2 = sqrt(-2.0 * log(t1) / t1);
            t3 = (x1 * t2);             /* Xi */
            t4 = (x2 * t2);             /* Yi */
            l = max(fabs(t3), fabs(t4));
            qq[k-1][l] += 1.0;                      /* counts */
            psx = psx + t3;  /* sum of Xi */
            psy = psy + t4;               /* sum of Yi */
          }
      }

      sx += psx;
      sy += psy;
      
    }
    
/*      Reduce private qq to q          */
    #pragma acc parallel loop reduction(+:gc)
    for ( i = 0; i < NQ; i++ ) {
      double sumq = 0.0;
      #pragma acc loop reduction(+:sumq)
      for (k = 0; k < np; k++)
          sumq = sumq + qq[k][i];
      q[i] = sumq;
      gc += sumq;
    }

} /* end acc data */

    timer_stop(1);
    tm = timer_read(1);

    /* Verification: relative error of sx and sy against the reference sums
       for the compile-time M. */
    nit = 0;
    if (M == 24) {
	if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) &&
	   (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 25) {
	if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 28) {
	if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 30) {
	if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 32) {
	if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    }

    Mops = pow(2.0, M+1)/tm/1000000.0;

    printf("EP Benchmark Results: \n"
	   "CPU Time = %10.4f\n"
	   "N = 2^%5d\n"
	   "No. Gaussian Pairs = %15.0f\n"
	   "Sums = %25.15e %25.15e\n"
	   "Counts:\n",
	   tm, M, gc, sx, sy);
    for (i = 0; i  <= NQ-1; i++) {
	printf("%3d %15.0f\n", i, q[i]);
    }
	  
    c_print_results("EP", CLASS, M+1, 0, 0, nit,
          tm, Mops, "Random numbers generated",
		  verified, NPBVERSION, COMPILETIME,
		  CS1, CS2, CS3, CS4, CS5, CS6, CS7);

    /* release the working memory allocated at the top */
    free(qq[0]);
    free(qq);
    free(q);
    free(xx[0]);
    free(xx);
    free(x);

    return 0;
}
Exemple #11
0
int main( int argc, char **argv )
{

  int             i, iteration;

  double          timecounter;

  FILE            *fp;

  cl_int ecode;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  /*  Initialize timers  */
  timer_on = 0;            
  if ((fp = fopen("timer.flag", "r")) != NULL) {
    fclose(fp);
    timer_on = 1;
  }
  timer_clear( 0 );
  if (timer_on) {
    timer_clear( 1 );
    timer_clear( 2 );
    timer_clear( 3 );
  }

  if (timer_on) timer_start( 3 );

  /*  Initialize the verification arrays if a valid class */
  for( i=0; i<TEST_ARRAY_SIZE; i++ )
    switch( CLASS )
    {
      case 'S':
        test_index_array[i] = S_test_index_array[i];
        test_rank_array[i]  = S_test_rank_array[i];
        break;
      case 'A':
        test_index_array[i] = A_test_index_array[i];
        test_rank_array[i]  = A_test_rank_array[i];
        break;
      case 'W':
        test_index_array[i] = W_test_index_array[i];
        test_rank_array[i]  = W_test_rank_array[i];
        break;
      case 'B':
        test_index_array[i] = B_test_index_array[i];
        test_rank_array[i]  = B_test_rank_array[i];
        break;
      case 'C':
        test_index_array[i] = C_test_index_array[i];
        test_rank_array[i]  = C_test_rank_array[i];
        break;
      case 'D':
        test_index_array[i] = D_test_index_array[i];
        test_rank_array[i]  = D_test_rank_array[i];
        break;
    };

  /* set up the OpenCL environment. */
  setup_opencl(argc, argv);

  /*  Printout initial NPB info */
  printf( "\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - IS Benchmark\n\n" );
  printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS, CLASS );
  printf( " Iterations:   %d\n", MAX_ITERATIONS );

  if (timer_on) timer_start( 1 );

  /*  Generate random number sequence and subsequent keys on all procs */
  create_seq( 314159265.00,                    /* Random number gen seed */
              1220703125.00 );                 /* Random number gen mult */
  if (timer_on) timer_stop( 1 );

  /*  Do one interation for free (i.e., untimed) to guarantee initialization of  
      all data and code pages and respective tables */
  rank( 1 );  

  /*  Start verification counter */
  passed_verification = 0;

  DTIMER_START(T_BUFFER_WRITE);
  ecode = clEnqueueWriteBuffer(cmd_queue,
                               m_passed_verification,
                               CL_TRUE,
                               0,
                               sizeof(cl_int),
                               &passed_verification,
                               0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_WRITE);

  if( CLASS != 'S' ) printf( "\n   iteration\n" );

  /*  Start timer  */             
  timer_start( 0 );


  /*  This is the main iteration */
  for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
  {
    if( CLASS != 'S' ) printf( "        %d\n", iteration );
    rank( iteration );
  }

  DTIMER_START(T_BUFFER_READ);
  ecode = clEnqueueReadBuffer(cmd_queue,
                              m_passed_verification,
                              CL_TRUE,
                              0,
                              sizeof(cl_int),
                              &passed_verification,
                              0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueReadBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_READ);

  /*  End of timing, obtain maximum time of all processors */
  timer_stop( 0 );
  timecounter = timer_read( 0 );


  /*  This tests that keys are in sequence: sorting of last ranked key seq
      occurs here, but is an untimed operation                             */
  if (timer_on) timer_start( 2 );
  full_verify();
  if (timer_on) timer_stop( 2 );

  if (timer_on) timer_stop( 3 );


  /*  The final printout  */
  if( passed_verification != 5*MAX_ITERATIONS + 1 )
    passed_verification = 0;
  c_print_results( "IS",
                   CLASS,
                   (int)(TOTAL_KEYS/64),
                   64,
                   0,
                   MAX_ITERATIONS,
                   timecounter,
                   ((double) (MAX_ITERATIONS*TOTAL_KEYS))
                              /timecounter/1000000.,
                   "keys ranked", 
                   passed_verification,
                   NPBVERSION,
                   COMPILETIME,
                   CC,
                   CLINK,
                   C_LIB,
                   C_INC,
                   CFLAGS,
                   CLINKFLAGS,
                   "",
                   clu_GetDeviceTypeName(device_type),
                   device_name);

  /*  Print additional timers  */
  if (timer_on) {
    double t_total, t_percent;

    t_total = timer_read( 3 );
    printf("\nAdditional timers -\n");
    printf(" Total execution: %8.3f\n", t_total);
    if (t_total == 0.0) t_total = 1.0;
    timecounter = timer_read(1);
    t_percent = timecounter/t_total * 100.;
    printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(0);
    t_percent = timecounter/t_total * 100.;
    printf(" Benchmarking   : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(2);
    t_percent = timecounter/t_total * 100.;
    printf(" Sorting        : %8.3f (%5.2f%%)\n", timecounter, t_percent);
  }

  release_opencl();
  
  fflush(stdout);

  return 0;
  /**************************/
} /*  E N D  P R O G R A M  */
/*
 * Driver of the OpenMP C version of the NPB 2.3 EP benchmark, in
 * source-to-source translated form: the parallel region is the outlined
 * function __ompc_func_3, launched through _ompc_do_parallel(), and the
 * "# N "ep.c"" markers map statements back to lines of ep.c.
 * NOTE(review): the _ompc_* names suggest Omni OpenMP compiler output -
 * confirm against the build.
 *
 * The problem size is hard-coded: exponent 28, class argument 65 ('A' in
 * ASCII).
 *
 * argc and argv are forwarded to _ompc_init().  Returns status, which is 0.
 */
int
main (int argc, char **argv)
{
  //auto double *_ppthd_x;
  double Mops;
  double t1;
  double t2;
  double sx;
  double sy;
  double tm;
  double an;
  double tt;
  double gc;
  double dum[3];
  int np;
  int i;
  int nit;
  int k_offset;
  int j;
  int nthreads;
  int verified;
  /* zero-filled so that the scan for '.' below never reads a byte that
     sprintf() did not write */
  char size[14] = { 0 };
  int status = 0;
  _ompc_init(argc,argv);

  //(_ppthd_x) = (((double *) (_ompc_get_thdprv (&_thdprv_x, 1048576, x))));
  dum[0] = 1.0;
  dum[1] = 1.0;
  dum[2] = 1.0;
  nthreads = 1;
# 84 "ep.c"
  printf
    ("\012\012 NAS Parallel Benchmarks 2.3 OpenMP C version - EP Benchmark\012");
# 86 "ep.c"
  sprintf (size, "%12.0f", pow (2.0, 28 + 1));
# 87 "ep.c"
  /* blank out a decimal point (character code 46) with a space (32) */
  for (j = 13; j >= 1; j--)
    {

# 88 "ep.c"
      if ((int) size[j] == 46)
        {
          size[j] = (char) 32;
        }
    }
# 90 "ep.c"
  printf (" Number of random numbers generated: %13s\012", size);
# 92 "ep.c"
  verified = 0;
# 99 "ep.c"
  np = 1 << (28 - 16);
# 107 "ep.c"
  vranlc (0, &dum[0], dum[1], &dum[2]);
# 108 "ep.c"
  dum[0] = randlc (&dum[1], dum[2]);
# 109 "ep.c"
  for (i = 0; i < 2 * (1 << 16); i++)
    {
      x[i] = -1.0E99;
      //(*((_ppthd_x) + (i))) = (-(1.0E99));
    }
# 110 "ep.c"
  Mops = log (sqrt (fabs ((1.0 > 1.0) ? 1.0 : 1.0)));
# 112 "ep.c"
  timer_clear (1);
# 113 "ep.c"
  timer_clear (2);
# 114 "ep.c"
  timer_clear (3);
# 115 "ep.c"
  timer_start (1);
# 117 "ep.c"
  vranlc (0, &t1, 1.220703125E9, x);
  //vranlc (0, &(t1), 1.220703125E9, _ppthd_x);
# 121 "ep.c"
  t1 = 1.220703125E9;
# 123 "ep.c"
  for (i = 1; i <= 16 + 1; i++)
    {

# 124 "ep.c"
      t2 = randlc (&t1, t1);
    }
# 127 "ep.c"
  an = t1;
# 128 "ep.c"
  tt = 2.71828183E8;
# 129 "ep.c"
  gc = 0.0;
# 130 "ep.c"
  sx = 0.0;
# 131 "ep.c"
  sy = 0.0;
# 133 "ep.c"
  for (i = 0; i <= 10 - 1; i++)
    {

# 134 "ep.c"
      q[i] = 0.0;
    }
# 142 "ep.c"
  k_offset = -1;
  {
    /* addresses of the locals handed to the outlined parallel region */
    void *__ompc_argv[6];
    __ompc_argv[0] = (void *) &sx;
    __ompc_argv[1] = (void *) &sy;
    __ompc_argv[2] = (void *) &np;
    __ompc_argv[3] = (void *) &k_offset;
    __ompc_argv[4] = (void *) &an;
    __ompc_argv[5] = (void *) &nthreads;
    _ompc_do_parallel (__ompc_func_3, __ompc_argv);
  }
# 207 "ep.c"
  for (i = 0; i <= 10 - 1; i++)
    {

# 208 "ep.c"
      gc = gc + q[i];
    }
# 211 "ep.c"
  timer_stop (1);
# 212 "ep.c"
  tm = timer_read (1);
# 214 "ep.c"
  nit = 0;
  /* Verification: the size exponent was substituted as the constant 28, so
     only the (28 == 28) branch can run. */
# 215 "ep.c"
  if (28 == 24)
    {

# 216 "ep.c"
      if ((fabs ((sx - (-3247.83465203474)) / sx) <= 1.0E-8)
          && (fabs ((sy - (-6958.407078382297)) / sy) <= 1.0E-8))
        {

# 218 "ep.c"
          verified = 1;
        }
    }
  else
# 220 "ep.c"
  if (28 == 25)
    {

# 221 "ep.c"
      if ((fabs ((sx - (-2863.319731645753)) / sx) <= 1.0E-8)
          && (fabs ((sy - (-6320.053679109499)) / sy) <= 1.0E-8))
        {

# 223 "ep.c"
          verified = 1;
        }
    }
  else
# 225 "ep.c"
  if (28 == 28)
    {

# 226 "ep.c"
      if ((fabs ((sx - (-4295.875165629892)) / sx) <= 1.0E-8)
          && (fabs ((sy - (-15807.32573678431)) / sy) <= 1.0E-8))
        {

# 228 "ep.c"
          verified = 1;
          printf("Debug:ompc_manual. 359, sx is:%f, sy is:%f\n",sx,sy);
        }
    }

  else
# 230 "ep.c"
  if (28 == 30)
    {

# 231 "ep.c"
      if ((fabs ((sx - 40338.15542441498) / sx) <= 1.0E-8)
          && (fabs ((sy - (-26606.69192809235)) / sy) <= 1.0E-8))
        {

# 233 "ep.c"
          verified = 1;
        }
    }
  else
# 235 "ep.c"
  if (28 == 32)
    {

# 236 "ep.c"
      if ((fabs ((sx - 47643.67927995374) / sx) <= 1.0E-8)
          && (fabs ((sy - (-80840.72988043731)) / sy) <= 1.0E-8))
        {

# 238 "ep.c"
          verified = 1;
        }
    }
# 242 "ep.c"
  Mops = (pow (2.0, 28 + 1) / tm) / 1000000.0;
# 244 "ep.c"
  printf
    ("EP Benchmark Results: \012CPU Time = %10.4f\012N = 2^%5d\012No. Gaussian Pairs = %15.0f\012Sums = %25.15e %25.15e\012Counts:\012",
     tm, 28, gc, sx, sy);
# 251 "ep.c"
  for (i = 0; i <= 10 - 1; i++)
    {

# 252 "ep.c"
      printf ("%3d %15.0f\012", i, q[i]);
    }
# 255 "ep.c"
  c_print_results ("EP", 65, 28 + 1, 0, 0, nit, nthreads, tm, Mops,
                   "Random numbers generated", verified, "2.3", "07 Aug 2006",
                   "omcc", "$(CC)", "(none)", "-I../common", "-t", "-lm",
                   "randdp");
  /* constant-false condition: the extra timer printout is compiled out */
# 261 "ep.c"
  if (0 == 1)
    {

# 262 "ep.c"
      printf ("Total time:     %f", timer_read (1));
# 263 "ep.c"
      printf ("Gaussian pairs: %f", timer_read (2));
# 264 "ep.c"
      printf ("Random numbers: %f", timer_read (3));
    }

  /* explicit exit status instead of falling off the end of main */
  return status;
}
Exemple #13
0
/*
 * Host driver for the EP benchmark (the banner printed below identifies
 * this as the NPB3.3-OCL build).
 *
 * Flow: print the banner, set up OpenCL, prepare the generator multiplier,
 * launch the kernel once over `np` work-items, read back one partial result
 * per work-group, reduce those partials on the host, compare the two sums
 * with the reference values for the compiled problem size M, and report.
 *
 * argc/argv: at least one argument is required (the usage text calls it the
 *            kernel directory); both are forwarded to setup_opencl().
 *
 * Returns 0 on completion. Exits with status -1 on a usage error or when a
 * host-side result buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
  double Mops, t1, t2;
  double tsx, tsy, tm, an, tt, gc;
  double sx_verify_value, sy_verify_value, sx_err, sy_err;
  int    i, nit;
  int    k_offset, j;
  logical verified;

  char   size[16];

  FILE *fp;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  // Timers are enabled simply by the presence of a "timer.flag" file.
  if ((fp = fopen("timer.flag", "r")) == NULL) {
    timers_enabled = false;
  } else {
    timers_enabled = true;
    fclose(fp);
  }

  //--------------------------------------------------------------------
  //  Because the size of the problem is too large to store in a 32-bit
  //  integer for some classes, we put it into a string (for printing).
  //  Have to strip off the decimal point put in there by the floating
  //  point print statement (internal file)
  //--------------------------------------------------------------------

  // Bounded write: the field is at least 15 characters wide, so a larger
  // value must be truncated rather than allowed to run past size[16].
  snprintf(size, sizeof(size), "%15.0lf", pow(2.0, M+1));
  j = 14;
  if (size[j] == '.') j--;
  size[j+1] = '\0';
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n");
  printf("\n Number of random numbers generated: %15s\n", size);

  verified = false;

  //--------------------------------------------------------------------
  //  Compute the number of "batches" of random number pairs generated 
  //  per processor. Adjust if the number of processors does not evenly 
  //  divide the total number
  //--------------------------------------------------------------------

  np = NN; 

  setup_opencl(argc, argv);

  timer_clear(0);
  timer_start(0);

  //--------------------------------------------------------------------
  //  Compute AN = A ^ (2 * NK) (mod 2^46).
  //--------------------------------------------------------------------

  t1 = A;

  // The return value (t2) is never read; the loop is run for whatever
  // randlc() does to t1 through the pointer -- presumably it updates the
  // seed in place; confirm against randlc().
  for (i = 0; i < MK + 1; i++) {
    t2 = randlc(&t1, t1);
  }

  an = t1;
  tt = S;

  //--------------------------------------------------------------------
  //  Each instance of this loop may be performed independently. We compute
  //  the k offsets separately to take into account the fact that some nodes
  //  have more numbers to generate than others
  //--------------------------------------------------------------------

  k_offset = -1;

  DTIMER_START(T_KERNEL_EMBAR);

  // Launch the kernel.
  // Arguments 0-2 pass only a byte count with a NULL value (the OpenCL
  // convention for __local scratch arguments); 3-5 are the device buffers
  // that receive the per-group results. Sizes are kept as size_t because
  // that is the type clSetKernelArg() takes.
  size_t q_size  = GROUP_SIZE * NQ * sizeof(cl_double);
  size_t sx_size = GROUP_SIZE * sizeof(cl_double);
  size_t sy_size = GROUP_SIZE * sizeof(cl_double);
  err_code  = clSetKernelArg(kernel, 0, q_size, NULL);
  err_code |= clSetKernelArg(kernel, 1, sx_size, NULL);
  err_code |= clSetKernelArg(kernel, 2, sy_size, NULL);
  err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq);
  err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx);
  err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy);
  err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset);
  err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an);
  clu_CheckError(err_code, "clSetKernelArg()");
  
  size_t localWorkSize[] = { GROUP_SIZE };
  size_t globalWorkSize[] = { np };
  err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL,
                                    globalWorkSize, 
                                    localWorkSize,
                                    0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_EMBAR);

  // Host-side staging buffers for the per-group partial results.
  double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size);
  double *gsx = (double*)malloc(gsx_size);
  double *gsy = (double*)malloc(gsy_size);

  // Without this check a failed allocation would hand a NULL destination
  // to clEnqueueReadBuffer() and then be dereferenced in the reduction.
  if (gq == NULL || gsx == NULL || gsy == NULL) {
    fprintf(stderr, "Error: cannot allocate host buffers for kernel results\n");
    free(gq);
    free(gsx);
    free(gsy);
    release_opencl();
    exit(-1);
  }

  gc  = 0.0;
  tsx = 0.0;
  tsy = 0.0;

  for (i = 0; i < NQ; i++) {
    q[i] = 0.0;
  }

  // 9. Get the result
  // The first two reads are enqueued non-blocking; only the last one
  // blocks. NOTE(review): this relies on cmd_queue executing in order so
  // that the blocking read also completes the earlier two -- confirm
  // against how setup_opencl() creates the queue.
  DTIMER_START(T_BUFFER_READ);
  err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, 
                                 gq, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, 
                                 gsx, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, 
                                 gsy, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");
  DTIMER_STOP(T_BUFFER_READ);

  // Reduce one partial result per work-group into the global totals.
  // The group count is loop-invariant, so compute it once.
  const size_t num_groups = np / localWorkSize[0];
  for (i = 0; (size_t)i < num_groups; i++) {
    for (j = 0; j < NQ; j++ ){
      q[j] = q[j] + gq[i][j];
    }
    tsx = tsx + gsx[i];
    tsy = tsy + gsy[i];
  }

  for (i = 0; i < NQ; i++) {
    gc = gc + q[i];
  }

  timer_stop(0);
  tm = timer_read(0);

  // Select the reference sums for the compiled problem size; an unknown
  // M leaves the run unverified and the reference values unset.
  nit = 0;
  verified = true;
  if (M == 24) {
    sx_verify_value = -3.247834652034740e+3;
    sy_verify_value = -6.958407078382297e+3;
  } else if (M == 25) {
    sx_verify_value = -2.863319731645753e+3;
    sy_verify_value = -6.320053679109499e+3;
  } else if (M == 28) {
    sx_verify_value = -4.295875165629892e+3;
    sy_verify_value = -1.580732573678431e+4;
  } else if (M == 30) {
    sx_verify_value =  4.033815542441498e+4;
    sy_verify_value = -2.660669192809235e+4;
  } else if (M == 32) {
    sx_verify_value =  4.764367927995374e+4;
    sy_verify_value = -8.084072988043731e+4;
  } else if (M == 36) {
    sx_verify_value =  1.982481200946593e+5;
    sy_verify_value = -1.020596636361769e+5;
  } else if (M == 40) {
    sx_verify_value = -5.319717441530e+05;
    sy_verify_value = -3.688834557731e+05;
  } else {
    verified = false;
  }

  // Verification passes only if both relative errors are within EPSILON.
  if (verified) {
    sx_err = fabs((tsx - sx_verify_value) / sx_verify_value);
    sy_err = fabs((tsy - sy_verify_value) / sy_verify_value);
    verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON));
  }

  Mops = pow(2.0, M+1) / tm / 1000000.0;

  printf("\nEP Benchmark Results:\n\n");
  printf("CPU Time =%10.4lf\n", tm);
  printf("N = 2^%5d\n", M);
  printf("No. Gaussian Pairs = %15.0lf\n", gc);
  printf("Sums = %25.15lE %25.15lE\n", tsx, tsy);
  printf("Counts: \n");
  for (i = 0; i < NQ; i++) {
    printf("%3d%15.0lf\n", i, q[i]);
  }

  c_print_results("EP", CLASS, M+1, 0, 0, nit,
      tm, Mops, 
      "Random numbers generated",
      verified, NPBVERSION, COMPILETIME, 
      CS1, CS2, CS3, CS4, CS5, CS6, CS7,
      clu_GetDeviceTypeName(device_type), device_name);

  if (timers_enabled) {
    // Guard the percentage below against a zero/negative elapsed time.
    if (tm <= 0.0) tm = 1.0;
    tt = timer_read(0);
    printf("\nTotal time:     %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
  }

  free(gq);
  free(gsx);
  free(gsy);
  release_opencl();

  fflush(stdout);

  return 0;
}
Exemple #14
0
Fichier : cg.c Projet : 8l/rose
/*--------------------------------------------------------------------
c  CG benchmark driver (OpenMP C version, per the banner printed below).
c
c  Flow: select the problem class from the compile-time parameters,
c  build the matrix with makea(), run one untimed conj_grad() iteration,
c  reset x and zeta, run NITER timed iterations that update zeta from
c  the result of conj_grad(), then verify zeta against the reference
c  value and report through c_print_results().
c
c  argc/argv are not used in this function.
c-------------------------------------------------------------------*/
int main(int argc, char **argv) {
    int	i, j, k, it;
    int nthreads = 1;
    double zeta;
    double rnorm;
    double norm_temp11;
    double norm_temp12;
    double t, mflops;
    char cclass;
    boolean verified;
    double zeta_verify_value, epsilon;

    /* Row and column ranges span the whole matrix (1 .. NA). */
    firstrow = 1;
    lastrow  = NA;
    firstcol = 1;
    lastcol  = NA;

    /* Pick the class letter and reference zeta by matching the
       compile-time parameters. Anything else is class 'U', in which case
       zeta_verify_value is left unset; it is only read further down when
       cclass != 'U'. */
    if (NA == 1400 && NONZER == 7 && NITER == 15 && SHIFT == 10.0) {
	cclass = 'S';
	zeta_verify_value = 8.5971775078648;
    } else if (NA == 7000 && NONZER == 8 && NITER == 15 && SHIFT == 12.0) {
	cclass = 'W';
	zeta_verify_value = 10.362595087124;
    } else if (NA == 14000 && NONZER == 11 && NITER == 15 && SHIFT == 20.0) {
	cclass = 'A';
	zeta_verify_value = 17.130235054029;
    } else if (NA == 75000 && NONZER == 13 && NITER == 75 && SHIFT == 60.0) {
	cclass = 'B';
	zeta_verify_value = 22.712745482631;
    } else if (NA == 150000 && NONZER == 15 && NITER == 75 && SHIFT == 110.0) {
	cclass = 'C';
	zeta_verify_value = 28.973605592845;
    } else {
	cclass = 'U';
    }

    printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version"
           " - CG Benchmark\n");
    printf(" Size: %10d\n", NA);
    printf(" Iterations: %5d\n", NITER);

    naa = NA;
    nzz = NZ;

/*--------------------------------------------------------------------
c  Initialize random number generator
c
c  The value stored in zeta here is never read: zeta is reset to 0.0
c  below before its first use. The call is presumably kept for its
c  effect on tran (passed by address) -- confirm against randlc().
c-------------------------------------------------------------------*/
    tran    = 314159265.0;
    amult   = 1220703125.0;
    zeta    = randlc( &tran, amult );

/*--------------------------------------------------------------------
c  Build the matrix via makea() (defined elsewhere); per the note
c  below, rowstr and colidx hold its row/column indexing afterwards.
c-------------------------------------------------------------------*/
    makea(naa, nzz, a, colidx, rowstr, NONZER,
          firstrow, lastrow, firstcol, lastcol, 
	  RCOND, arow, acol, aelt, v, iv, SHIFT);
    
/*---------------------------------------------------------------------
c  Note: as a result of the above call to makea:
c        values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1
c        values of colidx which are col indexes go from firstcol --> lastcol
c        So:
c        Shift the col index vals from actual (firstcol --> lastcol ) 
c        to local, i.e., (1 --> lastcol-firstcol+1)
c---------------------------------------------------------------------*/
/* First parallel region: index shift, initial x, and the untimed
   warm-up iteration. */
#pragma omp parallel private(it,i,j,k)
{	
#pragma omp for nowait
    for (j = 1; j <= lastrow - firstrow + 1; j++) {
	for (k = rowstr[j]; k < rowstr[j+1]; k++) {
            colidx[k] = colidx[k] - firstcol + 1;
	}
    }

/*--------------------------------------------------------------------
c  set starting vector to (1, 1, .... 1)
c-------------------------------------------------------------------*/
#pragma omp for nowait
    for (i = 1; i <= NA+1; i++) {
	x[i] = 1.0;
    }
/* The two loops above are nowait; the single below carries no nowait
   clause, so its implicit end barrier is the synchronization point
   before conj_grad() is first called. */
#pragma omp single
    zeta  = 0.0;

/*-------------------------------------------------------------------
c---->
c  Do one iteration untimed to init all code and data page tables
c---->                    (then reinit, start timing, to niter its)
c-------------------------------------------------------------------*/

    for (it = 1; it <= 1; it++) {

/*--------------------------------------------------------------------
c  The call to the conjugate gradient routine:
c-------------------------------------------------------------------*/
	conj_grad (colidx, rowstr, x, z, a, p, q, r, w, &rnorm);

/*--------------------------------------------------------------------
c  zeta = shift + 1/(x.z)
c  So, first: (x.z)
c  Also, find norm of z
c  So, first: (z.z)
c
c  (zeta itself is not computed in this warm-up pass; only the
c  normalization of z into x is carried out.)
c-------------------------------------------------------------------*/
#pragma omp single
{	
	norm_temp11 = 0.0;
	norm_temp12 = 0.0;
} /* end single */

#pragma omp for reduction(+:norm_temp11,norm_temp12)
	for (j = 1; j <= lastcol-firstcol+1; j++) {
            norm_temp11 = norm_temp11 + x[j]*z[j];
            norm_temp12 = norm_temp12 + z[j]*z[j];
	}
#pragma omp single
	norm_temp12 = 1.0 / sqrt( norm_temp12 );

/*--------------------------------------------------------------------
c  Normalize z to obtain x
c-------------------------------------------------------------------*/
#pragma omp for
	for (j = 1; j <= lastcol-firstcol+1; j++) {
            x[j] = norm_temp12*z[j];
	}
	
    } /* end of do one iteration untimed */

/*--------------------------------------------------------------------
c  set starting vector to (1, 1, .... 1)
c
c  (re-initialization after the warm-up, before the timed run)
c-------------------------------------------------------------------*/
#pragma omp for nowait
    for (i = 1; i <= NA+1; i++) {
         x[i] = 1.0;
    }
#pragma omp single    
    zeta  = 0.0;

} /* end parallel */

    /* Only timer 1 is used; it brackets the second parallel region. */
    timer_clear( 1 );
    timer_start( 1 );

/*--------------------------------------------------------------------
c---->
c  Main Iteration for inverse power method
c---->
c-------------------------------------------------------------------*/

#pragma omp parallel private(it,i,j,k)
{
    for (it = 1; it <= NITER; it++) {

/*--------------------------------------------------------------------
c  The call to the conjugate gradient routine:
c-------------------------------------------------------------------*/
	conj_grad(colidx, rowstr, x, z, a, p, q, r, w, &rnorm);

/*--------------------------------------------------------------------
c  zeta = shift + 1/(x.z)
c  So, first: (x.z)
c  Also, find norm of z
c  So, first: (z.z)
c-------------------------------------------------------------------*/
#pragma omp single
{	
	norm_temp11 = 0.0;
	norm_temp12 = 0.0;
} /* end single */

#pragma omp for reduction(+:norm_temp11,norm_temp12)
	for (j = 1; j <= lastcol-firstcol+1; j++) {
            norm_temp11 = norm_temp11 + x[j]*z[j];
            norm_temp12 = norm_temp12 + z[j]*z[j];
	}

/* One thread turns the two dot products into the scale factor for z
   and the new zeta; the progress line is then printed by the master
   thread. */
#pragma omp single
{	
	norm_temp12 = 1.0 / sqrt( norm_temp12 );

	zeta = SHIFT + 1.0 / norm_temp11;
} /* end single */

#pragma omp master
{
	if( it == 1 ) {
            printf("   iteration           ||r||                 zeta\n");
	}
	printf("    %5d       %20.14e%20.13e\n", it, rnorm, zeta);
} /* end master */

/*--------------------------------------------------------------------
c  Normalize z to obtain x
c-------------------------------------------------------------------*/
#pragma omp for 
	for (j = 1; j <= lastcol-firstcol+1; j++) {
            x[j] = norm_temp12*z[j];
	}
    } /* end of main iter inv pow meth */

/* Record the team size for the report; nthreads keeps its initial
   value of 1 when _OPENMP is not defined. */
#if defined(_OPENMP)
#pragma omp master
    nthreads = omp_get_num_threads();
#endif /* _OPENMP */
} /* end parallel */

    timer_stop( 1 );

/*--------------------------------------------------------------------
c  End of timed section
c-------------------------------------------------------------------*/

    t = timer_read( 1 );

    printf(" Benchmark completed\n");

    /* Verification compares the absolute error of zeta with epsilon and
       is skipped entirely for class 'U'. */
    epsilon = 1.0e-10;
    if (cclass != 'U') {
	if (fabs(zeta - zeta_verify_value) <= epsilon) {
            verified = TRUE;
	    printf(" VERIFICATION SUCCESSFUL\n");
	    printf(" Zeta is    %20.12e\n", zeta);
	    printf(" Error is   %20.12e\n", zeta - zeta_verify_value);
	} else {
            verified = FALSE;
	    printf(" VERIFICATION FAILED\n");
	    printf(" Zeta                %20.12e\n", zeta);
	    printf(" The correct zeta is %20.12e\n", zeta_verify_value);
	}
    } else {
	verified = FALSE;
	printf(" Problem size unknown\n");
	printf(" NO VERIFICATION PERFORMED\n");
    }

    /* Report 0 Mflop/s rather than dividing by a zero elapsed time. */
    if ( t != 0.0 ) {
	mflops = (2.0*NITER*NA)
	    * (3.0+(NONZER*(NONZER+1)) + 25.0*(5.0+(NONZER*(NONZER+1))) + 3.0 )
	    / t / 1000000.0;
    } else {
	mflops = 0.0;
    }

    c_print_results("CG", cclass, NA, 0, 0, NITER, nthreads, t, 
		    mflops, "          floating point", 
		    verified, NPBVERSION, COMPILETIME,
		    CS1, CS2, CS3, CS4, CS5, CS6, CS7);
}
Exemple #15
0
/*--------------------------------------------------------------------
c FT benchmark driver (reported as "FT" to c_print_results below).
c
c Flow: run setup() and one untimed pass (index map, initial
c conditions, FFT init, forward FFT), clear the timers, then repeat
c the same setup under timing followed by niter iterations of
c evolve() -> inverse fft() -> checksum(), call verify(), and report.
c
c argc/argv are not used in this function.
c-------------------------------------------------------------------*/
int main(int argc, char **argv) {

/*c-------------------------------------------------------------------
c-------------------------------------------------------------------*/

    int i, ierr;
      
/*------------------------------------------------------------------
c u0, u1, u2 are the main arrays in the problem. 
c Depending on the decomposition, these arrays will have different 
c dimensions. To accommodate all possibilities, we allocate them as 
c one-dimensional arrays and pass them to subroutines for different 
c views
c  - u0 contains the (transformed) initial condition
c  - u1 and u2 are working arrays
c  - indexmap maps i,j,k of u0 to the correct i^2+j^2+k^2 for the
c    time evolution operator. 
c-----------------------------------------------------------------*/

/*--------------------------------------------------------------------
c Large arrays are in common so that they are allocated on the
c heap rather than the stack. This common block is not
c referenced directly anywhere else. Padding is to avoid accidental 
c cache problems, since all array sizes are powers of two.
c
c (In this C version the arrays are function-local statics declared
c as [NZ][NY][NX]; pad1..pad3 are not referenced again in this
c function.)
c-------------------------------------------------------------------*/
    static dcomplex u0[NZ][NY][NX];
    static dcomplex pad1[3];
    static dcomplex u1[NZ][NY][NX];
    static dcomplex pad2[3];
    static dcomplex u2[NZ][NY][NX];
    static dcomplex pad3[3];
    static int indexmap[NZ][NY][NX];
    
    int iter;
    int nthreads = 1;
    double total_time, mflops;
    boolean verified;
    char cclass;

/*--------------------------------------------------------------------
c Run the entire problem once to make sure all data is touched. 
c This reduces variable startup costs, which is important for such a 
c short benchmark. The other NPB 2 implementations are similar. 
c-------------------------------------------------------------------*/
    for (i = 0; i < T_MAX; i++) {
	timer_clear(i);
    }
    setup();
/* Untimed pass: compute_indexmap() and fft() are called by every
   thread of the team, while the initial conditions and FFT init are
   done by a single thread. */
#pragma omp parallel
 {
    compute_indexmap(indexmap, dims[2]);
#pragma omp single
   {
    compute_initial_conditions(u1, dims[0]);
    fft_init (dims[0][0]);
   }
    fft(1, u1, u0);
 } /* end parallel */

/*--------------------------------------------------------------------
c Start over from the beginning. Note that all operations must
c be timed, in contrast to other benchmarks. 
c-------------------------------------------------------------------*/
    for (i = 0; i < T_MAX; i++) {
	timer_clear(i);
    }

    /* T_TOTAL always runs; the per-phase timers below are only touched
       when TIMERS_ENABLED == TRUE, and then only by the master thread. */
    timer_start(T_TOTAL);
    if (TIMERS_ENABLED == TRUE) timer_start(T_SETUP);

#pragma omp parallel private(iter) firstprivate(niter)
  {
    compute_indexmap(indexmap, dims[2]);

#pragma omp single
   {
    compute_initial_conditions(u1, dims[0]);
    
    fft_init (dims[0][0]);
   }

    if (TIMERS_ENABLED == TRUE) {
#pragma omp master
      timer_stop(T_SETUP);
    }
    if (TIMERS_ENABLED == TRUE) {
#pragma omp master   
      timer_start(T_FFT);
    }
    /* Forward transform of the initial conditions: u1 -> u0. */
    fft(1, u1, u0);
    if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
      timer_stop(T_FFT);
    }

    /* Each iteration: evolve(u0, u1, ...), inverse FFT u1 -> u2, then
       checksum of u2. */
    for (iter = 1; iter <= niter; iter++) {
	if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
	  timer_start(T_EVOLVE);
	}
	
	evolve(u0, u1, iter, indexmap, dims[0]);
	
        if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
	  timer_stop(T_EVOLVE);
	}
        if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
	  timer_start(T_FFT);
	}
	
        fft(-1, u1, u2);
	
        if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
	  timer_stop(T_FFT);
	}
        if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
	  timer_start(T_CHECKSUM);
	}
	
        checksum(iter, u2, dims[0]);
	
        if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
	  timer_stop(T_CHECKSUM);
	}
    }
    
    /* verify() is run by one thread only; it fills in verified and
       cclass through the pointers passed here. */
#pragma omp single
    verify(NX, NY, NZ, niter, &verified, &cclass);
    
    /* Record the team size for the report; nthreads keeps its initial
       value of 1 when _OPENMP is not defined. */
#if defined(_OPENMP)
#pragma omp master    
    nthreads = omp_get_num_threads();
#endif /* _OPENMP */    
  } /* end parallel */
  
    timer_stop(T_TOTAL);
    total_time = timer_read(T_TOTAL);

    /* Report 0 Mflop/s rather than dividing by a zero elapsed time. */
    if( total_time != 0.0) {
	mflops = 1.0e-6*(double)(NTOTAL) *
	    (14.8157+7.19641*log((double)(NTOTAL))
	     +  (5.23518+7.21113*log((double)(NTOTAL)))*niter)
	    /total_time;
    } else {
	mflops = 0.0;
    }
    c_print_results("FT", cclass, NX, NY, NZ, niter, nthreads,
		    total_time, mflops, "          floating point", verified, 
		    NPBVERSION, COMPILETIME,
		    CS1, CS2, CS3, CS4, CS5, CS6, CS7);
    if (TIMERS_ENABLED == TRUE) print_timers();
}