Beispiel #1
0
int main(int argc, char *argv[])
{
  int i;
  int iter;
  double total_time, mflops;
  logical verified;
  char Class;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  //---------------------------------------------------------------------
  // Run the entire problem once to make sure all data is touched. 
  // This reduces variable startup costs, which is important for such a 
  // short benchmark. The other NPB 2 implementations are similar. 
  //---------------------------------------------------------------------
  for (i = 1; i <= T_max; i++) {
    timer_clear(i);
  }
  setup();
  setup_opencl(argc, argv);
  init_ui(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]);
  compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]);
  compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]);
  fft_init(dims[0]);
  fft(1, &m_u1, &m_u0);

  //---------------------------------------------------------------------
  // Start over from the beginning. Note that all operations must
  // be timed, in contrast to other benchmarks. 
  //---------------------------------------------------------------------
  for (i = 1; i <= T_max; i++) {
    timer_clear(i);
  }

  timer_start(T_total);
  if (timers_enabled) timer_start(T_setup);

  DTIMER_START(T_compute_im);
  compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]);
  DTIMER_STOP(T_compute_im);

  DTIMER_START(T_compute_ics);
  compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]);
  DTIMER_STOP(T_compute_ics);

  DTIMER_START(T_fft_init);
  fft_init(dims[0]);
  DTIMER_STOP(T_fft_init);

  if (timers_enabled) timer_stop(T_setup);
  if (timers_enabled) timer_start(T_fft);
  fft(1, &m_u1, &m_u0);
  if (timers_enabled) timer_stop(T_fft);

  for (iter = 1; iter <= niter; iter++) {
    if (timers_enabled) timer_start(T_evolve);
    evolve(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]);
    if (timers_enabled) timer_stop(T_evolve);
    if (timers_enabled) timer_start(T_fft);
    fft(-1, &m_u1, &m_u1);
    if (timers_enabled) timer_stop(T_fft);
    if (timers_enabled) timer_start(T_checksum);
    checksum(iter, &m_u1, dims[0], dims[1], dims[2]);
    if (timers_enabled) timer_stop(T_checksum);
  }

  verify(NX, NY, NZ, niter, &verified, &Class);

  timer_stop(T_total);
  total_time = timer_read(T_total);

  if (total_time != 0.0) {
    mflops = 1.0e-6 * (double)NTOTAL *
            (14.8157 + 7.19641 * log((double)NTOTAL)
            + (5.23518 + 7.21113 * log((double)NTOTAL)) * niter)
            / total_time;
  } else {
    mflops = 0.0;
  }
  c_print_results("FT", Class, NX, NY, NZ, niter,
                  total_time, mflops, "          floating point", verified, 
                  NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7,
                  clu_GetDeviceTypeName(device_type),
                  device_name);
  if (timers_enabled) print_timers();

  release_opencl();

  fflush(stdout);

  return 0;
}
Beispiel #2
0
int main(int argc, char **argv)
#endif
{
  int i, niter, step;
  double mflops, t, tmax;
  logical verified;
  char class;
  double tsum[t_last+2], t1[t_last+2],
         tming[t_last+2], tmaxg[t_last+2];
  char *t_recs[t_last+2] = {
       "total", "rhs", "xsolve", "ysolve", "zsolve", 
       "bpack", "exch", "xcomm", "ycomm", "zcomm",
       " totcomp", " totcomm" };

  //---------------------------------------------------------------------
  // Root node reads input file (if it exists) else takes
  // defaults from parameters
  //---------------------------------------------------------------------
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL-MD) - SP Benchmark\n\n");

  FILE *fp;
  fp = fopen("timer.flag", "r");
  timeron = false;
  if (fp != NULL) {
    timeron = true;
    fclose(fp);
  }

  if ((fp = fopen("inputsp.data", "r")) != NULL) {
    int result;
    printf(" Reading from input file inputsp.data\n");
    result = fscanf(fp, "%d", &niter);
    while (fgetc(fp) != '\n');
    result = fscanf(fp, "%*f");
    while (fgetc(fp) != '\n');
    result = fscanf(fp, "%d%d%d", &grid_points[0], &grid_points[1], 
                                  &grid_points[2]);
    fclose(fp);
  } else {
    printf(" No input file inputsp.data. Using compiled defaults\n");
    niter = NITER_DEFAULT;
    grid_points[0] = PROBLEM_SIZE;
    grid_points[1] = PROBLEM_SIZE;
    grid_points[2] = PROBLEM_SIZE;
  }

  setup_opencl(argc, argv);

  printf(" Size: %4dx%4dx%4d\n", 
      grid_points[0], grid_points[1], grid_points[2]);
  printf(" Iterations: %4d", niter);
  if (num_devices != MAXCELLS*MAXCELLS) 
    printf(" WARNING: compiled for %5d devices \n", MAXCELLS*MAXCELLS);
  printf(" Number of active devices: %5d\n\n", num_devices);

  make_set();

  for (i = 0; i < t_last; i++) {
    timer_clear(i);
  }

  set_constants();

  initialize();

  lhsinit();

  exact_rhs();

  compute_buffer_size(5);

  set_kernel_args();

  //---------------------------------------------------------------------
  // do one time step to touch all code, and reinitialize
  //---------------------------------------------------------------------
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
  // set cmd queue property
  for(i = 0; i < num_devices; i++) {
  	clSetCommandQueueProperty(cmd_queue[i], 
			CL_QUEUE_AUTO_DEVICE_SELECTION | 
			//CL_QUEUE_ITERATIVE | 
			CL_QUEUE_COMPUTE_INTENSIVE,
			true,
			NULL);
  }
#endif
  adi();
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
  for(i = 0; i < num_devices; i++) {
  	clSetCommandQueueProperty(cmd_queue[i], 
			0,
			true,
			NULL);
  }
#endif

  initialize();

  //---------------------------------------------------------------------
  // Synchronize before placing time stamp
  //---------------------------------------------------------------------
  for (i = 0; i < t_last; i++) {
    timer_clear(i);
  }

  timer_clear(0);
  timer_start(0);

  for (step = 1; step <= niter; step++) {

    if ((step % 20) == 0 || step == 1) {
      printf(" Time step %4d\n", step);
    }

    adi();

  }

  timer_stop(0);
  t = timer_read(0);

  verify(niter, &class, &verified);

  tmax = t;

  if( tmax != 0.0 ) {
    mflops = (881.174*(double)( PROBLEM_SIZE*PROBLEM_SIZE*PROBLEM_SIZE )
             -4683.91*(double)( PROBLEM_SIZE*PROBLEM_SIZE )
             +11484.5*(double)( PROBLEM_SIZE )
             -19272.4) * (double)( niter ) / (tmax*1000000.0);
  } else {
    mflops = 0.0;
  }

  c_print_results("SP", class, grid_points[0], 
      grid_points[1], grid_points[2], niter,
      tmax, mflops, "          floating point", 
      verified, NPBVERSION,COMPILETIME, CS1, CS2, CS3, CS4, CS5, 
      CS6, CS7, clu_GetDeviceTypeName(device_type), device_name, num_devices);

  if (timeron) {
/*
    for (i = 0; i < t_last; i++) {
      t1[i] = timer_read(i);
    }
    t1[t_xsolve] = t1[t_xsolve] - t1[t_xcomm];
    t1[t_ysolve] = t1[t_ysolve] - t1[t_ycomm];
    t1[t_zsolve] = t1[t_zsolve] - t1[t_zcomm];
    t1[t_last+2] = t1[t_xcomm]+t1[t_ycomm]+t1[t_zcomm]+t1[t_exch];
    t1[t_last+1] = t1[t_total]  - t1[t_last+2];

    MPI_Reduce(&t1, tsum,  t_last+2, dp_type, MPI_SUM, 0, comm_setup);
    MPI_Reduce(&t1, tming, t_last+2, dp_type, MPI_MIN, 0, comm_setup);
    MPI_Reduce(&t1, tmaxg, t_last+2, dp_type, MPI_MAX, 0, comm_setup);

    if (node == 0) {
      printf(" nprocs =%6d           minimum     maximum     average\n",
          total_nodes);
      for (i = 0; i < t_last+2; i++) {
        tsum[i] = tsum[i] / total_nodes;
          printf(" timer %2d(%8s) :  %10.4f  %10.4f  %10.4f\n",
              i+1, t_recs[i], tming[i], tmaxg[i], tsum[i]);
      }
    }
*/
  }

  release_opencl();

  return 0;
}
Beispiel #3
0
int main( int argc, char **argv )
{

  int             i, iteration;

  double          timecounter;

  FILE            *fp;

  cl_int ecode;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  /*  Initialize timers  */
  timer_on = 0;            
  if ((fp = fopen("timer.flag", "r")) != NULL) {
    fclose(fp);
    timer_on = 1;
  }
  timer_clear( 0 );
  if (timer_on) {
    timer_clear( 1 );
    timer_clear( 2 );
    timer_clear( 3 );
  }

  if (timer_on) timer_start( 3 );

  /*  Initialize the verification arrays if a valid class */
  for( i=0; i<TEST_ARRAY_SIZE; i++ )
    switch( CLASS )
    {
      case 'S':
        test_index_array[i] = S_test_index_array[i];
        test_rank_array[i]  = S_test_rank_array[i];
        break;
      case 'A':
        test_index_array[i] = A_test_index_array[i];
        test_rank_array[i]  = A_test_rank_array[i];
        break;
      case 'W':
        test_index_array[i] = W_test_index_array[i];
        test_rank_array[i]  = W_test_rank_array[i];
        break;
      case 'B':
        test_index_array[i] = B_test_index_array[i];
        test_rank_array[i]  = B_test_rank_array[i];
        break;
      case 'C':
        test_index_array[i] = C_test_index_array[i];
        test_rank_array[i]  = C_test_rank_array[i];
        break;
      case 'D':
        test_index_array[i] = D_test_index_array[i];
        test_rank_array[i]  = D_test_rank_array[i];
        break;
    };

  /* set up the OpenCL environment. */
  setup_opencl(argc, argv);

  /*  Printout initial NPB info */
  printf( "\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - IS Benchmark\n\n" );
  printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS, CLASS );
  printf( " Iterations:   %d\n", MAX_ITERATIONS );

  if (timer_on) timer_start( 1 );

  /*  Generate random number sequence and subsequent keys on all procs */
  create_seq( 314159265.00,                    /* Random number gen seed */
              1220703125.00 );                 /* Random number gen mult */
  if (timer_on) timer_stop( 1 );

  /*  Do one interation for free (i.e., untimed) to guarantee initialization of  
      all data and code pages and respective tables */
  rank( 1 );  

  /*  Start verification counter */
  passed_verification = 0;

  DTIMER_START(T_BUFFER_WRITE);
  ecode = clEnqueueWriteBuffer(cmd_queue,
                               m_passed_verification,
                               CL_TRUE,
                               0,
                               sizeof(cl_int),
                               &passed_verification,
                               0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_WRITE);

  if( CLASS != 'S' ) printf( "\n   iteration\n" );

  /*  Start timer  */             
  timer_start( 0 );


  /*  This is the main iteration */
  for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
  {
    if( CLASS != 'S' ) printf( "        %d\n", iteration );
    rank( iteration );
  }

  DTIMER_START(T_BUFFER_READ);
  ecode = clEnqueueReadBuffer(cmd_queue,
                              m_passed_verification,
                              CL_TRUE,
                              0,
                              sizeof(cl_int),
                              &passed_verification,
                              0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueReadBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_READ);

  /*  End of timing, obtain maximum time of all processors */
  timer_stop( 0 );
  timecounter = timer_read( 0 );


  /*  This tests that keys are in sequence: sorting of last ranked key seq
      occurs here, but is an untimed operation                             */
  if (timer_on) timer_start( 2 );
  full_verify();
  if (timer_on) timer_stop( 2 );

  if (timer_on) timer_stop( 3 );


  /*  The final printout  */
  if( passed_verification != 5*MAX_ITERATIONS + 1 )
    passed_verification = 0;
  c_print_results( "IS",
                   CLASS,
                   (int)(TOTAL_KEYS/64),
                   64,
                   0,
                   MAX_ITERATIONS,
                   timecounter,
                   ((double) (MAX_ITERATIONS*TOTAL_KEYS))
                              /timecounter/1000000.,
                   "keys ranked", 
                   passed_verification,
                   NPBVERSION,
                   COMPILETIME,
                   CC,
                   CLINK,
                   C_LIB,
                   C_INC,
                   CFLAGS,
                   CLINKFLAGS,
                   "",
                   clu_GetDeviceTypeName(device_type),
                   device_name);

  /*  Print additional timers  */
  if (timer_on) {
    double t_total, t_percent;

    t_total = timer_read( 3 );
    printf("\nAdditional timers -\n");
    printf(" Total execution: %8.3f\n", t_total);
    if (t_total == 0.0) t_total = 1.0;
    timecounter = timer_read(1);
    t_percent = timecounter/t_total * 100.;
    printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(0);
    t_percent = timecounter/t_total * 100.;
    printf(" Benchmarking   : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(2);
    t_percent = timecounter/t_total * 100.;
    printf(" Sorting        : %8.3f (%5.2f%%)\n", timecounter, t_percent);
  }

  release_opencl();
  
  fflush(stdout);

  return 0;
  /**************************/
} /*  E N D  P R O G R A M  */
Beispiel #4
0
int main(int argc, char *argv[])
{
  char Class;
  logical verified;
  double mflops;

  double t, tmax, trecs[t_last+1];
  int i;
  char *t_names[t_last+1];

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  //---------------------------------------------------------------------
  // Setup info for timers
  //---------------------------------------------------------------------
  FILE *fp;
  if ((fp = fopen("timer.flag", "r")) != NULL) {
    timeron = true;
    t_names[t_total] = "total";
    t_names[t_rhsx] = "rhsx";
    t_names[t_rhsy] = "rhsy";
    t_names[t_rhsz] = "rhsz";
    t_names[t_rhs] = "rhs";
    t_names[t_jacld] = "jacld";
    t_names[t_blts] = "blts";
    t_names[t_jacu] = "jacu";
    t_names[t_buts] = "buts";
    t_names[t_add] = "add";
    t_names[t_l2norm] = "l2norm";

    t_names[t_setbv] = "setbv";
    t_names[t_setiv] = "setiv";
    t_names[t_erhs] = "erhs";
    t_names[t_error] = "error";
    t_names[t_pintgr] = "pintgr";
    t_names[t_blts1] = "blts1";
    t_names[t_buts1] = "buts1";
    fclose(fp);
  } else {
    timeron = false;
  }

  //---------------------------------------------------------------------
  // read input data
  //---------------------------------------------------------------------
  read_input();

  //---------------------------------------------------------------------
  // set up domain sizes
  //---------------------------------------------------------------------
  domain();

  //---------------------------------------------------------------------
  // set up OpenCL environment
  //---------------------------------------------------------------------
  setup_opencl(argc, argv);

  //---------------------------------------------------------------------
  // set up coefficients
  //---------------------------------------------------------------------
  setcoeff();

  //---------------------------------------------------------------------
  // set the boundary values for dependent variables
  //---------------------------------------------------------------------
  setbv();

  //---------------------------------------------------------------------
  // set the initial values for dependent variables
  //---------------------------------------------------------------------
  setiv();

  //---------------------------------------------------------------------
  // compute the forcing term based on prescribed exact solution
  //---------------------------------------------------------------------
  erhs();

  //---------------------------------------------------------------------
  // perform one SSOR iteration to touch all data pages
  //---------------------------------------------------------------------
  ssor(1);

  //---------------------------------------------------------------------
  // reset the boundary and initial values
  //---------------------------------------------------------------------
  setbv();
  setiv();

  //---------------------------------------------------------------------
  // perform the SSOR iterations
  //---------------------------------------------------------------------
  ssor(itmax);

  //---------------------------------------------------------------------
  // compute the solution error
  //---------------------------------------------------------------------
  error();

  //---------------------------------------------------------------------
  // compute the surface integral
  //---------------------------------------------------------------------
  pintgr();

  //---------------------------------------------------------------------
  // verification test
  //---------------------------------------------------------------------
  verify ( rsdnm, errnm, frc, &Class, &verified );
  mflops = (double)itmax * (1984.77 * (double)nx0
      * (double)ny0
      * (double)nz0
      - 10923.3 * pow(((double)(nx0+ny0+nz0)/3.0), 2.0) 
      + 27770.9 * (double)(nx0+ny0+nz0)/3.0
      - 144010.0)
    / (maxtime*1000000.0);

  c_print_results("LU", Class, nx0,
                  ny0, nz0, itmax,
                  maxtime, mflops, "          floating point", verified, 
                  NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, 
                  "(none)",
                  clu_GetDeviceTypeName(device_type),
                  device_name);

  //---------------------------------------------------------------------
  // More timers
  //---------------------------------------------------------------------
  if (timeron) {
    for (i = 1; i <= t_last; i++) {
      trecs[i] = timer_read(i);
    }
    tmax = maxtime;
    if (tmax == 0.0) tmax = 1.0;

    printf("  SECTION     Time (secs)\n");
    for (i = 1; i <= t_last; i++) {
      printf("  %-8s:%9.4f  (%6.2f%%)\n",
          t_names[i], trecs[i], trecs[i]*100./tmax);
      if (i == t_rhs) {
        t = trecs[t_rhsx] + trecs[t_rhsy] + trecs[t_rhsz];
        printf("     --> %8s:%9.3f  (%6.2f%%)\n", "sub-rhs", t, t*100./tmax);
        t = trecs[i] - t;
        printf("     --> %8s:%9.3f  (%6.2f%%)\n", "rest-rhs", t, t*100./tmax);
      }
    }
  }

  release_opencl();

  fflush(stdout);

  return 0;
}
Beispiel #5
0
int main(int argc, char *argv[]) 
{
  double Mops, t1, t2;
  double tsx, tsy, tm, an, tt, gc;
  double sx_verify_value, sy_verify_value, sx_err, sy_err;
  int    i, nit;
  int    k_offset, j;
  logical verified;

  char   size[16];

  FILE *fp;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  if ((fp = fopen("timer.flag", "r")) == NULL) {
    timers_enabled = false;
  } else {
    timers_enabled = true;
    fclose(fp);
  }

  //--------------------------------------------------------------------
  //  Because the size of the problem is too large to store in a 32-bit
  //  integer for some classes, we put it into a string (for printing).
  //  Have to strip off the decimal point put in there by the floating
  //  point print statement (internal file)
  //--------------------------------------------------------------------

  sprintf(size, "%15.0lf", pow(2.0, M+1));
  j = 14;
  if (size[j] == '.') j--;
  size[j+1] = '\0';
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n");
  printf("\n Number of random numbers generated: %15s\n", size);

  verified = false;

  //--------------------------------------------------------------------
  //  Compute the number of "batches" of random number pairs generated 
  //  per processor. Adjust if the number of processors does not evenly 
  //  divide the total number
  //--------------------------------------------------------------------

  np = NN; 

  setup_opencl(argc, argv);

  timer_clear(0);
  timer_start(0);

  //--------------------------------------------------------------------
  //  Compute AN = A ^ (2 * NK) (mod 2^46).
  //--------------------------------------------------------------------

  t1 = A;

  for (i = 0; i < MK + 1; i++) {
    t2 = randlc(&t1, t1);
  }

  an = t1;
  tt = S;

  //--------------------------------------------------------------------
  //  Each instance of this loop may be performed independently. We compute
  //  the k offsets separately to take into account the fact that some nodes
  //  have more numbers to generate than others
  //--------------------------------------------------------------------

  k_offset = -1;

  DTIMER_START(T_KERNEL_EMBAR);

  // Launch the kernel
  int q_size  = GROUP_SIZE * NQ * sizeof(cl_double);
  int sx_size = GROUP_SIZE * sizeof(cl_double);
  int sy_size = GROUP_SIZE * sizeof(cl_double);
  err_code  = clSetKernelArg(kernel, 0, q_size, NULL);
  err_code |= clSetKernelArg(kernel, 1, sx_size, NULL);
  err_code |= clSetKernelArg(kernel, 2, sy_size, NULL);
  err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq);
  err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx);
  err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy);
  err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset);
  err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an);
  clu_CheckError(err_code, "clSetKernelArg()");
  
  size_t localWorkSize[] = { GROUP_SIZE };
  size_t globalWorkSize[] = { np };
  err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL,
                                    globalWorkSize, 
                                    localWorkSize,
                                    0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_EMBAR);

  double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size);
  double *gsx = (double*)malloc(gsx_size);
  double *gsy = (double*)malloc(gsy_size);

  gc  = 0.0;
  tsx = 0.0;
  tsy = 0.0;

  for (i = 0; i < NQ; i++) {
    q[i] = 0.0;
  }

  // 9. Get the result
  DTIMER_START(T_BUFFER_READ);
  err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, 
                                 gq, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, 
                                 gsx, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, 
                                 gsy, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");
  DTIMER_STOP(T_BUFFER_READ);

  for (i = 0; i < np/localWorkSize[0]; i++) {
    for (j = 0; j < NQ; j++ ){
      q[j] = q[j] + gq[i][j];
    }
    tsx = tsx + gsx[i];
    tsy = tsy + gsy[i];
  }

  for (i = 0; i < NQ; i++) {
    gc = gc + q[i];
  }

  timer_stop(0);
  tm = timer_read(0);

  nit = 0;
  verified = true;
  if (M == 24) {
    sx_verify_value = -3.247834652034740e+3;
    sy_verify_value = -6.958407078382297e+3;
  } else if (M == 25) {
    sx_verify_value = -2.863319731645753e+3;
    sy_verify_value = -6.320053679109499e+3;
  } else if (M == 28) {
    sx_verify_value = -4.295875165629892e+3;
    sy_verify_value = -1.580732573678431e+4;
  } else if (M == 30) {
    sx_verify_value =  4.033815542441498e+4;
    sy_verify_value = -2.660669192809235e+4;
  } else if (M == 32) {
    sx_verify_value =  4.764367927995374e+4;
    sy_verify_value = -8.084072988043731e+4;
  } else if (M == 36) {
    sx_verify_value =  1.982481200946593e+5;
    sy_verify_value = -1.020596636361769e+5;
  } else if (M == 40) {
    sx_verify_value = -5.319717441530e+05;
    sy_verify_value = -3.688834557731e+05;
  } else {
    verified = false;
  }

  if (verified) {
    sx_err = fabs((tsx - sx_verify_value) / sx_verify_value);
    sy_err = fabs((tsy - sy_verify_value) / sy_verify_value);
    verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON));
  }

  Mops = pow(2.0, M+1) / tm / 1000000.0;

  printf("\nEP Benchmark Results:\n\n");
  printf("CPU Time =%10.4lf\n", tm);
  printf("N = 2^%5d\n", M);
  printf("No. Gaussian Pairs = %15.0lf\n", gc);
  printf("Sums = %25.15lE %25.15lE\n", tsx, tsy);
  printf("Counts: \n");
  for (i = 0; i < NQ; i++) {
    printf("%3d%15.0lf\n", i, q[i]);
  }

  c_print_results("EP", CLASS, M+1, 0, 0, nit,
      tm, Mops, 
      "Random numbers generated",
      verified, NPBVERSION, COMPILETIME, 
      CS1, CS2, CS3, CS4, CS5, CS6, CS7,
      clu_GetDeviceTypeName(device_type), device_name);

  if (timers_enabled) {
    if (tm <= 0.0) tm = 1.0;
    tt = timer_read(0);
    printf("\nTotal time:     %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
  }

  free(gq);
  free(gsx);
  free(gsy);
  release_opencl();

  fflush(stdout);

  return 0;
}