Esempio n. 1
0
static int encode_ogg (cdrom_drive *drive, rip_opts_s *rip_opts,
		       text_tag_s *text_tag, int track,
		       int tracktot, char *filename, char **filenames)
{
  ogg_stream_state os;
  ogg_page og;
  ogg_packet op;

  vorbis_dsp_state vd;
  vorbis_block vb;
  vorbis_info vi;

  long samplesdone = 0;
  int sector = 0, last_sector = 0;
  long bytes_written = 0, packetsdone = 0;
  double time_elapsed = 0.0;
  int ret = 0;
  time_t *timer;
  double time;
  
  int serialno = rand ();
  vorbis_comment vc;
  long total_samples_per_channel = 0;
  int channels = 2;
  int eos = 0;
  long rate = 44100;
  FILE *out = fopen (filename, "w+");

  timer = timer_start ();

  if (!rip_opts->managed && (rip_opts->min_bitrate > 0 || rip_opts->max_bitrate > 0)) {
    log_msg ("Min or max bitrate requires managed", FL, FN, LN);
    return -1;
  }

  if (rip_opts->bitrate < 0 && rip_opts->min_bitrate < 0 && rip_opts->max_bitrate < 0) {
    rip_opts->quality_set = 1;
  }
  
  start_func (filename, rip_opts->bitrate, rip_opts->quality, rip_opts->quality_set,
	      rip_opts->managed, rip_opts->min_bitrate, rip_opts->max_bitrate);
  
  vorbis_info_init (&vi);

  if (rip_opts->quality_set > 0) {
    if (vorbis_encode_setup_vbr (&vi, channels, rate, rip_opts->quality)) {
      log_msg ("Couldn't initialize vorbis_info", FL, FN, LN);
      vorbis_info_clear (&vi);
      return -1;
    }
    /* two options here, max or min bitrate */
    if (rip_opts->max_bitrate > 0 || rip_opts->min_bitrate > 0) {
      struct ovectl_ratemanage_arg ai;
      vorbis_encode_ctl (&vi, OV_ECTL_RATEMANAGE_GET, &ai);
      ai.bitrate_hard_min = rip_opts->min_bitrate;
      ai.bitrate_hard_max = rip_opts->max_bitrate;
      ai.management_active = 1;
      vorbis_encode_ctl (&vi, OV_ECTL_RATEMANAGE_SET, &ai);
    }
  } else {
    if (vorbis_encode_setup_managed (&vi, channels, rate,
				     rip_opts->max_bitrate > 0 ? rip_opts->max_bitrate * 1000 : -1,
				     rip_opts->bitrate * 1000,
				     rip_opts->min_bitrate > 0 ? rip_opts->min_bitrate * 1000 : -1)) {
      log_msg ("Mode init failed, encode setup managed", FL, FN, LN);
      vorbis_info_clear (&vi);
      return -1;
    }
  }

  if (rip_opts->managed && rip_opts->bitrate < 0) {
    vorbis_encode_ctl (&vi, OV_ECTL_RATEMANAGE_AVG, NULL);
  } else if (!rip_opts->managed) {
    vorbis_encode_ctl (&vi, OV_ECTL_RATEMANAGE_SET, NULL);
  }

  /* set advanced encoder options */

  vorbis_encode_setup_init (&vi);

  vorbis_analysis_init (&vd, &vi);
  vorbis_block_init (&vd, &vb);

  ogg_stream_init (&os, serialno);

  {
    ogg_packet header_main;
    ogg_packet header_comments;
    ogg_packet header_codebooks;
    int result;
    char buf[32];

    vorbis_comment_init (&vc);
    vorbis_comment_add_tag (&vc, "title", text_tag->songname);
    vorbis_comment_add_tag (&vc, "artist", text_tag->artistname);
    vorbis_comment_add_tag (&vc, "album", text_tag->albumname);
    vorbis_comment_add_tag (&vc, "genre", text_tag->genre);
    snprintf (buf, 32, "%d", text_tag->year);
    vorbis_comment_add_tag (&vc, "date", buf);
    snprintf (buf, 32, "%02d", text_tag->track);
    vorbis_comment_add_tag (&vc, "tracknumber", buf);
	
    vorbis_analysis_headerout (&vd, &vc, &header_main, &header_comments, &header_codebooks);

    ogg_stream_packetin (&os, &header_main);
    ogg_stream_packetin (&os, &header_comments);
    ogg_stream_packetin (&os, &header_codebooks);

    while ((result = ogg_stream_flush (&os, &og))) {
      if (result == 0)
	break;
      ret = write_page (&og, out);
      if (ret != og.header_len + og.body_len) {
	log_msg ("Failed writing data to output stream", FL, FN, LN);
	ret = -1;
      }
    }
	  
    sector = cdda_track_firstsector (drive, track);
    last_sector = cdda_track_lastsector (drive, track);
    total_samples_per_channel = (last_sector - sector) * (CD_FRAMESAMPLES / 2);
    int eos = 0;
	
    while (!eos) {
      signed char *buffer = (signed char *)malloc (CD_FRAMESIZE_RAW * READ_SECTORS);
      //use this variable as a s**t
      long sectors_read = last_sector - sector;
      if (sectors_read > READ_SECTORS)
	sectors_read = READ_SECTORS;

      sectors_read = cdda_read (drive, (signed char *)buffer, sector, sectors_read);
      int i;
	  
      if (sectors_read == 0) {
	vorbis_analysis_wrote (&vd, 0);
      } else {
	float **vorbbuf = vorbis_analysis_buffer (&vd, CD_FRAMESIZE_RAW * sectors_read);
	for (i = 0; i < (CD_FRAMESIZE_RAW * sectors_read) / 4; i++) {
	  vorbbuf[0][i] = ((buffer[i * 4 + 1] << 8) | (0x00ff&(int)buffer[i * 4])) / 32768.f;
	  vorbbuf[1][i] = ((buffer[i * 4 + 3] << 8) | (0x00ff&(int)buffer[i * 4 + 2])) / 32768.f;
	}

	int samples_read = sectors_read * (CD_FRAMESAMPLES / 2);
	samplesdone += samples_read;
	// progress every 60 pages
	if (packetsdone >= 60) {
	  packetsdone = 0;
	  time = timer_time (timer);
	  update_statistics (total_samples_per_channel, samplesdone, time, track,
			     tracktot, 0, filenames);
	}
	vorbis_analysis_wrote (&vd, i);
      }
	  
      free (buffer);
      sector += sectors_read;
	  
      while (vorbis_analysis_blockout (&vd, &vb) == 1) {
	vorbis_analysis (&vb, &op);
	vorbis_bitrate_addblock (&vb);

	while (vorbis_bitrate_flushpacket (&vd, &op)) {
	  ogg_stream_packetin (&os, &op);
	  packetsdone++;

	  while (!eos) {
	    int result = ogg_stream_pageout (&os, &og);
	    if (result == 0) {
	      break;
	    }
	    ret = write_page (&og, out);
	    if (ret != og.header_len + og.body_len) {
	      log_msg ("Failed writing data to output stream", FL, FN, LN);
	      ret = -1;
	    } else
	      bytes_written += ret;

	    if (ogg_page_eos (&og)) {
	      eos = 1;
	    }
	  }
	}
      }
    }
  }
  ret = 0;

  update_statistics (total_samples_per_channel, samplesdone, time, track,
		     tracktot, 0, filenames);
  
  ogg_stream_clear (&os);
  vorbis_block_clear (&vb);
  vorbis_dsp_clear (&vd);
  vorbis_comment_clear (&vc);
  vorbis_info_clear (&vi);
  vorbis_comment_clear (&vc);
  time_elapsed = timer_time (timer);
  end_func (time_elapsed, rate, samplesdone, bytes_written);
  timer_clear (timer);
  fclose (out);
  
  return ret;
}
Esempio n. 2
0
void hwtimer_arch_unset(short timer)
{
    timer_clear(hw_timers[timer/2], (timer%2));
}
Esempio n. 3
0
static int realmain(void *carg)
{
    unsigned arg = (uintptr_t)carg;

/*c-------------------------------------------------------------------
c-------------------------------------------------------------------*/

    int i, ierr;
      
/*------------------------------------------------------------------
c u0, u1, u2 are the main arrays in the problem. 
c Depending on the decomposition, these arrays will have different 
c dimensions. To accomodate all possibilities, we allocate them as 
c one-dimensional arrays and pass them to subroutines for different 
c views
c  - u0 contains the initial (transformed) initial condition
c  - u1 and u2 are working arrays
c  - indexmap maps i,j,k of u0 to the correct i^2+j^2+k^2 for the
c    time evolution operator. 
c-----------------------------------------------------------------*/

/*--------------------------------------------------------------------
c Large arrays are in common so that they are allocated on the
c heap rather than the stack. This common block is not
c referenced directly anywhere else. Padding is to avoid accidental 
c cache problems, since all array sizes are powers of two.
c-------------------------------------------------------------------*/
    static dcomplex u0[NZ][NY][NX];
    static dcomplex pad1[3];
    static dcomplex u1[NZ][NY][NX];
    static dcomplex pad2[3];
    static dcomplex u2[NZ][NY][NX];
    static dcomplex pad3[3];
    static int indexmap[NZ][NY][NX];
    
    int iter;
    int nthreads = 1;
    double total_time, mflops;
    boolean verified;
    char class;

    omp_set_num_threads(arg);
/*--------------------------------------------------------------------
c Run the entire problem once to make sure all data is touched. 
c This reduces variable startup costs, which is important for such a 
c short benchmark. The other NPB 2 implementations are similar. 
c-------------------------------------------------------------------*/
    for (i = 0; i < T_MAX; i++) {
	timer_clear(i);
    }
    setup();
#pragma omp parallel
 {
    compute_indexmap(indexmap, dims[2]);
#pragma omp single
   {
    compute_initial_conditions(u1, dims[0]);
    fft_init (dims[0][0]);
   }
    fft(1, u1, u0);
 } /* end parallel */

/*--------------------------------------------------------------------
c Start over from the beginning. Note that all operations must
c be timed, in contrast to other benchmarks. 
c-------------------------------------------------------------------*/
    for (i = 0; i < T_MAX; i++) {
	timer_clear(i);
    }

    timer_start(T_TOTAL);
    if (TIMERS_ENABLED == TRUE) timer_start(T_SETUP);

#pragma omp parallel private(iter) firstprivate(niter)
  {
    compute_indexmap(indexmap, dims[2]);

#pragma omp single
   {
    compute_initial_conditions(u1, dims[0]);
    
    fft_init (dims[0][0]);
   }

    if (TIMERS_ENABLED == TRUE) {
#pragma omp master
      timer_stop(T_SETUP);
    }
    if (TIMERS_ENABLED == TRUE) {
#pragma omp master   
      timer_start(T_FFT);
    }
    fft(1, u1, u0);
    if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
      timer_stop(T_FFT);
    }

    for (iter = 1; iter <= niter; iter++) {
	if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
	  timer_start(T_EVOLVE);
	}
	
	evolve(u0, u1, iter, indexmap, dims[0]);
	
        if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
	  timer_stop(T_EVOLVE);
	}
        if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
	  timer_start(T_FFT);
	}
	
        fft(-1, u1, u2);
	
        if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
	  timer_stop(T_FFT);
	}
        if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
	  timer_start(T_CHECKSUM);
	}
	
        checksum(iter, u2, dims[0]);
	
        if (TIMERS_ENABLED == TRUE) {
#pragma omp master      
	  timer_stop(T_CHECKSUM);
	}
    }
    
#pragma omp single
    verify(NX, NY, NZ, niter, &verified, &class);
    
#if defined(_OPENMP)
#pragma omp master    
    nthreads = omp_get_num_threads();
#endif /* _OPENMP */    
  } /* end parallel */
  
    timer_stop(T_TOTAL);
    total_time = timer_read(T_TOTAL);

    if( total_time != 0.0) {
	mflops = 1.0e-6*(double)(NTOTAL) *
	    (14.8157+7.19641*log((double)(NTOTAL))
	     +  (5.23518+7.21113*log((double)(NTOTAL)))*niter)
	    /total_time;
    } else {
	mflops = 0.0;
    }
#ifdef BOMP
backend_create_time(arg);
#endif
printf("Computetime %d %f\n", arg, total_time);
printf("client done\n");
/*     c_print_results("FT", class, NX, NY, NZ, niter, nthreads, */
/* 		    total_time, mflops, "          floating point", verified,  */
/* 		    NPBVERSION, COMPILETIME, */
/* 		    CS1, CS2, CS3, CS4, CS5, CS6, CS7); */
    if (TIMERS_ENABLED == TRUE) print_timers();
}
Esempio n. 4
0
int main(int argc, char *argv[])
{
  int i;
  int iter;
  double total_time, mflops;
  logical verified;
  char Class;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  //---------------------------------------------------------------------
  // Run the entire problem once to make sure all data is touched. 
  // This reduces variable startup costs, which is important for such a 
  // short benchmark. The other NPB 2 implementations are similar. 
  //---------------------------------------------------------------------
  for (i = 1; i <= T_max; i++) {
    timer_clear(i);
  }
  setup();
  setup_opencl(argc, argv);
  init_ui(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]);
  compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]);
  compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]);
  fft_init(dims[0]);
  fft(1, &m_u1, &m_u0);

  //---------------------------------------------------------------------
  // Start over from the beginning. Note that all operations must
  // be timed, in contrast to other benchmarks. 
  //---------------------------------------------------------------------
  for (i = 1; i <= T_max; i++) {
    timer_clear(i);
  }

  timer_start(T_total);
  if (timers_enabled) timer_start(T_setup);

  DTIMER_START(T_compute_im);
  compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]);
  DTIMER_STOP(T_compute_im);

  DTIMER_START(T_compute_ics);
  compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]);
  DTIMER_STOP(T_compute_ics);

  DTIMER_START(T_fft_init);
  fft_init(dims[0]);
  DTIMER_STOP(T_fft_init);

  if (timers_enabled) timer_stop(T_setup);
  if (timers_enabled) timer_start(T_fft);
  fft(1, &m_u1, &m_u0);
  if (timers_enabled) timer_stop(T_fft);

  for (iter = 1; iter <= niter; iter++) {
    if (timers_enabled) timer_start(T_evolve);
    evolve(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]);
    if (timers_enabled) timer_stop(T_evolve);
    if (timers_enabled) timer_start(T_fft);
    fft(-1, &m_u1, &m_u1);
    if (timers_enabled) timer_stop(T_fft);
    if (timers_enabled) timer_start(T_checksum);
    checksum(iter, &m_u1, dims[0], dims[1], dims[2]);
    if (timers_enabled) timer_stop(T_checksum);
  }

  verify(NX, NY, NZ, niter, &verified, &Class);

  timer_stop(T_total);
  total_time = timer_read(T_total);

  if (total_time != 0.0) {
    mflops = 1.0e-6 * (double)NTOTAL *
            (14.8157 + 7.19641 * log((double)NTOTAL)
            + (5.23518 + 7.21113 * log((double)NTOTAL)) * niter)
            / total_time;
  } else {
    mflops = 0.0;
  }
  c_print_results("FT", Class, NX, NY, NZ, niter,
                  total_time, mflops, "          floating point", verified, 
                  NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7,
                  clu_GetDeviceTypeName(device_type),
                  device_name);
  if (timers_enabled) print_timers();

  release_opencl();

  fflush(stdout);

  return 0;
}
Esempio n. 5
0
int main(int argc, char **argv)
#endif
{
  int i, niter, step;
  double mflops, t, tmax;
  logical verified;
  char class;
  double tsum[t_last+2], t1[t_last+2],
         tming[t_last+2], tmaxg[t_last+2];
  char *t_recs[t_last+2] = {
       "total", "rhs", "xsolve", "ysolve", "zsolve", 
       "bpack", "exch", "xcomm", "ycomm", "zcomm",
       " totcomp", " totcomm" };

  //---------------------------------------------------------------------
  // Root node reads input file (if it exists) else takes
  // defaults from parameters
  //---------------------------------------------------------------------
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL-MD) - SP Benchmark\n\n");

  FILE *fp;
  fp = fopen("timer.flag", "r");
  timeron = false;
  if (fp != NULL) {
    timeron = true;
    fclose(fp);
  }

  if ((fp = fopen("inputsp.data", "r")) != NULL) {
    int result;
    printf(" Reading from input file inputsp.data\n");
    result = fscanf(fp, "%d", &niter);
    while (fgetc(fp) != '\n');
    result = fscanf(fp, "%*f");
    while (fgetc(fp) != '\n');
    result = fscanf(fp, "%d%d%d", &grid_points[0], &grid_points[1], 
                                  &grid_points[2]);
    fclose(fp);
  } else {
    printf(" No input file inputsp.data. Using compiled defaults\n");
    niter = NITER_DEFAULT;
    grid_points[0] = PROBLEM_SIZE;
    grid_points[1] = PROBLEM_SIZE;
    grid_points[2] = PROBLEM_SIZE;
  }

  setup_opencl(argc, argv);

  printf(" Size: %4dx%4dx%4d\n", 
      grid_points[0], grid_points[1], grid_points[2]);
  printf(" Iterations: %4d", niter);
  if (num_devices != MAXCELLS*MAXCELLS) 
    printf(" WARNING: compiled for %5d devices \n", MAXCELLS*MAXCELLS);
  printf(" Number of active devices: %5d\n\n", num_devices);

  make_set();

  for (i = 0; i < t_last; i++) {
    timer_clear(i);
  }

  set_constants();

  initialize();

  lhsinit();

  exact_rhs();

  compute_buffer_size(5);

  set_kernel_args();

  //---------------------------------------------------------------------
  // do one time step to touch all code, and reinitialize
  //---------------------------------------------------------------------
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
  // set cmd queue property
  for(i = 0; i < num_devices; i++) {
  	clSetCommandQueueProperty(cmd_queue[i], 
			CL_QUEUE_AUTO_DEVICE_SELECTION | 
			//CL_QUEUE_ITERATIVE | 
			CL_QUEUE_COMPUTE_INTENSIVE,
			true,
			NULL);
  }
#endif
  adi();
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
  for(i = 0; i < num_devices; i++) {
  	clSetCommandQueueProperty(cmd_queue[i], 
			0,
			true,
			NULL);
  }
#endif

  initialize();

  //---------------------------------------------------------------------
  // Synchronize before placing time stamp
  //---------------------------------------------------------------------
  for (i = 0; i < t_last; i++) {
    timer_clear(i);
  }

  timer_clear(0);
  timer_start(0);

  for (step = 1; step <= niter; step++) {

    if ((step % 20) == 0 || step == 1) {
      printf(" Time step %4d\n", step);
    }

    adi();

  }

  timer_stop(0);
  t = timer_read(0);

  verify(niter, &class, &verified);

  tmax = t;

  if( tmax != 0.0 ) {
    mflops = (881.174*(double)( PROBLEM_SIZE*PROBLEM_SIZE*PROBLEM_SIZE )
             -4683.91*(double)( PROBLEM_SIZE*PROBLEM_SIZE )
             +11484.5*(double)( PROBLEM_SIZE )
             -19272.4) * (double)( niter ) / (tmax*1000000.0);
  } else {
    mflops = 0.0;
  }

  c_print_results("SP", class, grid_points[0], 
      grid_points[1], grid_points[2], niter,
      tmax, mflops, "          floating point", 
      verified, NPBVERSION,COMPILETIME, CS1, CS2, CS3, CS4, CS5, 
      CS6, CS7, clu_GetDeviceTypeName(device_type), device_name, num_devices);

  if (timeron) {
/*
    for (i = 0; i < t_last; i++) {
      t1[i] = timer_read(i);
    }
    t1[t_xsolve] = t1[t_xsolve] - t1[t_xcomm];
    t1[t_ysolve] = t1[t_ysolve] - t1[t_ycomm];
    t1[t_zsolve] = t1[t_zsolve] - t1[t_zcomm];
    t1[t_last+2] = t1[t_xcomm]+t1[t_ycomm]+t1[t_zcomm]+t1[t_exch];
    t1[t_last+1] = t1[t_total]  - t1[t_last+2];

    MPI_Reduce(&t1, tsum,  t_last+2, dp_type, MPI_SUM, 0, comm_setup);
    MPI_Reduce(&t1, tming, t_last+2, dp_type, MPI_MIN, 0, comm_setup);
    MPI_Reduce(&t1, tmaxg, t_last+2, dp_type, MPI_MAX, 0, comm_setup);

    if (node == 0) {
      printf(" nprocs =%6d           minimum     maximum     average\n",
          total_nodes);
      for (i = 0; i < t_last+2; i++) {
        tsum[i] = tsum[i] / total_nodes;
          printf(" timer %2d(%8s) :  %10.4f  %10.4f  %10.4f\n",
              i+1, t_recs[i], tming[i], tmaxg[i], tsum[i]);
      }
    }
*/
  }

  release_opencl();

  return 0;
}
Esempio n. 6
0
void hwtimer_arch_unset(short timer)
{
    timer_clear(HW_TIMER, timer);
}
Esempio n. 7
0
int main(int argc,char **argv ){
  int my_rank,comm_size;
  int i;
  DGraph *dg=NULL;
  int verified=0, featnum=0;
  double bytes_sent=2.0,tot_time=0.0;



    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
    MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
    TRACE_smpi_set_category ("begin");

     if(argc!=2||
                (  strncmp(argv[1],"BH",2)!=0
                 &&strncmp(argv[1],"WH",2)!=0
                 &&strncmp(argv[1],"SH",2)!=0
                )
      ){
      if(my_rank==0){
        fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
        fprintf(stderr,"** Where \n   - N is integer number of MPI processes\n");
        fprintf(stderr,"   - S is the class S, W, or A \n");
        fprintf(stderr,"   - GraphName is the communication graph name BH, WH, or SH.\n");
        fprintf(stderr,"   - the number of MPI processes N should not be be less than \n");
        fprintf(stderr,"     the number of nodes in the graph\n");
      }
      MPI_Finalize();
      exit(0);
    } 
   if(strncmp(argv[1],"BH",2)==0){
      dg=buildBH(CLASS);
    }else if(strncmp(argv[1],"WH",2)==0){
      dg=buildWH(CLASS);
    }else if(strncmp(argv[1],"SH",2)==0){
      dg=buildSH(CLASS);
    }

    if(timer_on&&dg->numNodes+1>timers_tot){
      timer_on=0;
      if(my_rank==0)
        fprintf(stderr,"Not enough timers. Node timeing is off. \n");
    }
    if(dg->numNodes>comm_size){
      if(my_rank==0){
        fprintf(stderr,"**  The number of MPI processes should not be less than \n");
        fprintf(stderr,"**  the number of nodes in the graph\n");
        fprintf(stderr,"**  Number of MPI processes = %d\n",comm_size);
        fprintf(stderr,"**  Number nodes in the graph = %d\n",dg->numNodes);
      }
      MPI_Finalize();
      exit(0);
    }
    for(i=0;i<dg->numNodes;i++){ 
      dg->node[i]->address=i;
    }
    if( my_rank == 0 ){
      printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" );
      graphShow(dg,0);
      timer_clear(0);
      timer_start(0);
    }

    verified=ProcessNodes(dg,my_rank);
    TRACE_smpi_set_category ("end");
 
    featnum=NUM_SAMPLES*fielddim;
    bytes_sent=featnum*dg->numArcs;
    bytes_sent/=1048576;
    if(my_rank==0){
      timer_stop(0);
      tot_time=timer_read(0);
      c_print_results( dg->name,
                 CLASS,
                 featnum,
                 0,
                 0,
                 dg->numNodes,
                 0,
                 comm_size,
                 tot_time,
                 bytes_sent/tot_time,
                 "bytes transmitted", 
                 verified,
                 NPBVERSION,
                 COMPILETIME,
                 MPICC,
                 CLINK,
                 CMPI_LIB,
                 CMPI_INC,
                 CFLAGS,
                 CLINKFLAGS );
    }          
    MPI_Finalize();
  return 1;
}
Esempio n. 8
0
int main( int argc, char **argv )
{

  int             i, iteration;

  double          timecounter;

  FILE            *fp;

  cl_int ecode;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  /*  Initialize timers  */
  timer_on = 0;            
  if ((fp = fopen("timer.flag", "r")) != NULL) {
    fclose(fp);
    timer_on = 1;
  }
  timer_clear( 0 );
  if (timer_on) {
    timer_clear( 1 );
    timer_clear( 2 );
    timer_clear( 3 );
  }

  if (timer_on) timer_start( 3 );

  /*  Initialize the verification arrays if a valid class */
  for( i=0; i<TEST_ARRAY_SIZE; i++ )
    switch( CLASS )
    {
      case 'S':
        test_index_array[i] = S_test_index_array[i];
        test_rank_array[i]  = S_test_rank_array[i];
        break;
      case 'A':
        test_index_array[i] = A_test_index_array[i];
        test_rank_array[i]  = A_test_rank_array[i];
        break;
      case 'W':
        test_index_array[i] = W_test_index_array[i];
        test_rank_array[i]  = W_test_rank_array[i];
        break;
      case 'B':
        test_index_array[i] = B_test_index_array[i];
        test_rank_array[i]  = B_test_rank_array[i];
        break;
      case 'C':
        test_index_array[i] = C_test_index_array[i];
        test_rank_array[i]  = C_test_rank_array[i];
        break;
      case 'D':
        test_index_array[i] = D_test_index_array[i];
        test_rank_array[i]  = D_test_rank_array[i];
        break;
    };

  /* set up the OpenCL environment. */
  setup_opencl(argc, argv);

  /*  Printout initial NPB info */
  printf( "\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - IS Benchmark\n\n" );
  printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS, CLASS );
  printf( " Iterations:   %d\n", MAX_ITERATIONS );

  if (timer_on) timer_start( 1 );

  /*  Generate random number sequence and subsequent keys on all procs */
  create_seq( 314159265.00,                    /* Random number gen seed */
              1220703125.00 );                 /* Random number gen mult */
  if (timer_on) timer_stop( 1 );

  /*  Do one interation for free (i.e., untimed) to guarantee initialization of  
      all data and code pages and respective tables */
  rank( 1 );  

  /*  Start verification counter */
  passed_verification = 0;

  DTIMER_START(T_BUFFER_WRITE);
  ecode = clEnqueueWriteBuffer(cmd_queue,
                               m_passed_verification,
                               CL_TRUE,
                               0,
                               sizeof(cl_int),
                               &passed_verification,
                               0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_WRITE);

  if( CLASS != 'S' ) printf( "\n   iteration\n" );

  /*  Start timer  */             
  timer_start( 0 );


  /*  This is the main iteration */
  for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
  {
    if( CLASS != 'S' ) printf( "        %d\n", iteration );
    rank( iteration );
  }

  DTIMER_START(T_BUFFER_READ);
  ecode = clEnqueueReadBuffer(cmd_queue,
                              m_passed_verification,
                              CL_TRUE,
                              0,
                              sizeof(cl_int),
                              &passed_verification,
                              0, NULL, NULL);
  clu_CheckError(ecode, "clEnqueueReadBuffer() for m_passed_verification");
  DTIMER_STOP(T_BUFFER_READ);

  /*  End of timing, obtain maximum time of all processors */
  timer_stop( 0 );
  timecounter = timer_read( 0 );


  /*  This tests that keys are in sequence: sorting of last ranked key seq
      occurs here, but is an untimed operation                             */
  if (timer_on) timer_start( 2 );
  full_verify();
  if (timer_on) timer_stop( 2 );

  if (timer_on) timer_stop( 3 );


  /*  The final printout  */
  if( passed_verification != 5*MAX_ITERATIONS + 1 )
    passed_verification = 0;
  c_print_results( "IS",
                   CLASS,
                   (int)(TOTAL_KEYS/64),
                   64,
                   0,
                   MAX_ITERATIONS,
                   timecounter,
                   ((double) (MAX_ITERATIONS*TOTAL_KEYS))
                              /timecounter/1000000.,
                   "keys ranked", 
                   passed_verification,
                   NPBVERSION,
                   COMPILETIME,
                   CC,
                   CLINK,
                   C_LIB,
                   C_INC,
                   CFLAGS,
                   CLINKFLAGS,
                   "",
                   clu_GetDeviceTypeName(device_type),
                   device_name);

  /*  Print additional timers  */
  if (timer_on) {
    double t_total, t_percent;

    t_total = timer_read( 3 );
    printf("\nAdditional timers -\n");
    printf(" Total execution: %8.3f\n", t_total);
    if (t_total == 0.0) t_total = 1.0;
    timecounter = timer_read(1);
    t_percent = timecounter/t_total * 100.;
    printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(0);
    t_percent = timecounter/t_total * 100.;
    printf(" Benchmarking   : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    timecounter = timer_read(2);
    t_percent = timecounter/t_total * 100.;
    printf(" Sorting        : %8.3f (%5.2f%%)\n", timecounter, t_percent);
  }

  release_opencl();
  
  fflush(stdout);

  return 0;
  /**************************/
} /*  E N D  P R O G R A M  */
Esempio n. 9
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  cl_int ecode;
  char *source_dir = "IS";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timer_on) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // FIXME: The below values are experimental.
  if (max_work_group_size > 256) {
    max_work_group_size = 256;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 256) {
        work_item_sizes[i] = 256;
      }
    }
  }

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[30];
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "is_cpu.cl";
    sprintf(build_option, "-DCLASS=%d -I.", CLASS);

    CREATE_SEQ_GROUP_SIZE = 64;
    CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256;
    RANK_GROUP_SIZE = 1;
    RANK_GLOBAL_SIZE = RANK_GROUP_SIZE * 128;
    RANK1_GROUP_SIZE = 1;
    RANK1_GLOBAL_SIZE = RANK1_GROUP_SIZE * RANK_GLOBAL_SIZE;;
    RANK2_GROUP_SIZE = RANK_GROUP_SIZE;
    RANK2_GLOBAL_SIZE = RANK_GLOBAL_SIZE;;
    FV2_GROUP_SIZE = 64;
    FV2_GLOBAL_SIZE = FV2_GROUP_SIZE * 256;
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    source_file = "is_gpu.cl";
    sprintf(build_option, "-DCLASS=\'%c\' -I.", CLASS);

    CREATE_SEQ_GROUP_SIZE = 64;
    CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256;
    RANK1_GROUP_SIZE = work_item_sizes[0];
    RANK1_GLOBAL_SIZE = MAX_KEY;
    RANK2_GROUP_SIZE = work_item_sizes[0];
    RANK2_GLOBAL_SIZE = NUM_KEYS;
    FV2_GROUP_SIZE = work_item_sizes[0];
    FV2_GLOBAL_SIZE = NUM_KEYS;
  } else {
    fprintf(stderr, "%s: not supported.", clu_GetDeviceTypeName(device_type));
    exit(EXIT_FAILURE);
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  m_key_array = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * SIZE_OF_BUFFERS,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_array");

  m_key_buff1 = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * MAX_KEY,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1");

  m_key_buff2 = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(INT_TYPE) * SIZE_OF_BUFFERS,
                               NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_key_buff2");

  size_t test_array_size = sizeof(INT_TYPE) * TEST_ARRAY_SIZE;
  m_index_array = clCreateBuffer(context,
                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 test_array_size,
                                 test_index_array, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_index_array");

  m_rank_array = clCreateBuffer(context,
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                test_array_size,
                                test_rank_array, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rank_array");

  m_partial_vals = clCreateBuffer(context,
                                  CL_MEM_WRITE_ONLY,
                                  test_array_size,
                                  NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_partial_vals");

  m_passed_verification = clCreateBuffer(context,
                                         CL_MEM_READ_WRITE,
                                         sizeof(cl_int),
                                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_passed_verification");

  if (device_type == CL_DEVICE_TYPE_GPU) {
    m_key_scan = clCreateBuffer(context,
                                CL_MEM_READ_WRITE,
                                sizeof(INT_TYPE) * MAX_KEY,
                                NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1_scan");

    m_sum = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           sizeof(INT_TYPE) * work_item_sizes[0],
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_sum");
  } else {
    size_t bs_size = RANK_GLOBAL_SIZE * sizeof(INT_TYPE) * NUM_BUCKETS;
    m_bucket_size = clCreateBuffer(context,
                                   CL_MEM_READ_WRITE,
                                   bs_size,
                                   NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_bucket_size");

    m_bucket_ptrs = clCreateBuffer(context,
                                   CL_MEM_READ_WRITE,
                                   bs_size,
                                   NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_bucket_ptrs");
  }
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create kernels
  DTIMER_START(T_OPENCL_API);
  k_rank0 = clCreateKernel(program, "rank0", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rank0");
  ecode  = clSetKernelArg(k_rank0, 0, sizeof(cl_mem), (void*)&m_key_array);
  ecode |= clSetKernelArg(k_rank0, 1, sizeof(cl_mem), (void*)&m_partial_vals);
  ecode |= clSetKernelArg(k_rank0, 2, sizeof(cl_mem), (void*)&m_index_array);
  clu_CheckError(ecode, "clSetKernelArg() for rank0");

  if (device_type == CL_DEVICE_TYPE_GPU) {
    k_rank1 = clCreateKernel(program, "rank1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank1");
    ecode  = clSetKernelArg(k_rank1, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    clu_CheckError(ecode, "clSetKernelArg() for rank1");

    k_rank2 = clCreateKernel(program, "rank2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank2");
    ecode  = clSetKernelArg(k_rank2, 0, sizeof(cl_mem), (void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem), (void*)&m_key_array);
    clu_CheckError(ecode, "clSetKernelArg() for rank2");

    k_rank3_0 = clCreateKernel(program, "rank3_0", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_0");
    ecode  = clSetKernelArg(k_rank3_0, 0, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_0, 1, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_0, 2, sizeof(cl_mem),(void*)&m_sum);
    ecode |= clSetKernelArg(k_rank3_0, 3, 
                            sizeof(INT_TYPE) * work_item_sizes[0] * 2,
                            NULL);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_0");

    k_rank3_1 = clCreateKernel(program, "rank3_1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_1");
    ecode  = clSetKernelArg(k_rank3_1, 0, sizeof(cl_mem), (void*)&m_sum);
    ecode  = clSetKernelArg(k_rank3_1, 1, sizeof(cl_mem), (void*)&m_sum);
    ecode |= clSetKernelArg(k_rank3_1, 2, 
                            sizeof(INT_TYPE) * work_item_sizes[0] * 2,
                            NULL);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_1");

    k_rank3_2 = clCreateKernel(program, "rank3_2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3_2");
    ecode  = clSetKernelArg(k_rank3_2, 0, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode  = clSetKernelArg(k_rank3_2, 1, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3_2, 2, sizeof(cl_mem),(void*)&m_sum);
    clu_CheckError(ecode, "clSetKernelArg() for rank3_2");
  } else {
    k_rank1 = clCreateKernel(program, "rank1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank1");
    ecode  = clSetKernelArg(k_rank1, 0, sizeof(cl_mem),(void*)&m_key_array);
    ecode |= clSetKernelArg(k_rank1, 1, sizeof(cl_mem),(void*)&m_bucket_size);
    clu_CheckError(ecode, "clSetKernelArg() for rank1");

    k_rank2 = clCreateKernel(program, "rank2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank2");
    ecode  = clSetKernelArg(k_rank2, 0, sizeof(cl_mem),(void*)&m_key_array);
    ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem),(void*)&m_bucket_size);
    ecode |= clSetKernelArg(k_rank2, 2, sizeof(cl_mem),(void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_rank2, 3, sizeof(cl_mem),(void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for rank2");

    k_rank3 = clCreateKernel(program, "rank3", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rank3");
    ecode  = clSetKernelArg(k_rank3, 0, sizeof(cl_mem),(void*)&m_bucket_size);
    ecode |= clSetKernelArg(k_rank3, 1, sizeof(cl_mem),(void*)&m_bucket_ptrs);
    ecode |= clSetKernelArg(k_rank3, 2, sizeof(cl_mem),(void*)&m_key_buff1);
    ecode |= clSetKernelArg(k_rank3, 3, sizeof(cl_mem),(void*)&m_key_buff2);
    clu_CheckError(ecode, "clSetKernelArg() for rank3");
  }

  k_rank4 = clCreateKernel(program, "rank4", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rank4");
  ecode  = clSetKernelArg(k_rank4, 0, sizeof(cl_mem), (void*)&m_partial_vals);
  ecode |= clSetKernelArg(k_rank4, 1, sizeof(cl_mem), (void*)&m_key_buff1);
  ecode |= clSetKernelArg(k_rank4, 2, sizeof(cl_mem), (void*)&m_rank_array);
  ecode |= clSetKernelArg(k_rank4, 3, sizeof(cl_mem),
                                      (void*)&m_passed_verification);
  clu_CheckError(ecode, "clSetKernelArg() for rank4");
  DTIMER_STOP(T_OPENCL_API);
}
Esempio n. 10
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
void setup_opencl(int argc, char *argv[])
{
  cl_int err_code;
  char *source_dir = "EP";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &err_code);
  clu_CheckError(err_code, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &err_code);
  clu_CheckError(err_code, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[30];
  sprintf(build_option, "-DM=%d -I.", M);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "ep_cpu.cl";
    GROUP_SIZE = 16;
  } else {
    source_file = "ep_gpu.cl";
    GROUP_SIZE = 64;
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);

  gq_size  = np / GROUP_SIZE * NQ * sizeof(double);
  gsx_size = np / GROUP_SIZE * sizeof(double);
  gsy_size = np / GROUP_SIZE * sizeof(double);

  pgq = clCreateBuffer(context, CL_MEM_READ_WRITE, gq_size, NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgq");

  pgsx = clCreateBuffer(context, CL_MEM_READ_WRITE, gsx_size,NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgsx");

  pgsy = clCreateBuffer(context, CL_MEM_READ_WRITE, gsy_size,NULL, &err_code);
  clu_CheckError(err_code, "clCreateBuffer() for pgsy");

  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create a kernel
  DTIMER_START(T_OPENCL_API);
  kernel = clCreateKernel(program, "embar", &err_code);
  clu_CheckError(err_code, "clCreateKernel()");
  DTIMER_STOP(T_OPENCL_API);
}
Esempio n. 11
0
int
main (int argc, char **argv)
{
  //auto double *_ppthd_x;
  auto double Mops;
  auto double t1;
  auto double t2;
  auto double t3;
  auto double t4;
  auto double x1;
  auto double x2;
  auto double sx;
  auto double sy;
  auto double tm;
  auto double an;
  auto double tt;
  auto double gc;
  auto double dum[3];
  auto int np;
  auto int ierr;
  auto int node;
  auto int no_nodes;
  auto int i;
  auto int ik;
  auto int kk;
  auto int l;
  auto int k;
  auto int nit;
  auto int ierrcode;
  auto int no_large_nodes;
  auto int np_add;
  auto int k_offset;
  auto int j;
  auto int nthreads;
  auto int verified;
  auto char size[14];
 int status = 0;
  _ompc_init(argc,argv);

  //(_ppthd_x) = (((double *) (_ompc_get_thdprv (&_thdprv_x, 1048576, x))));
  (*(dum)) = (1.0);
  (*((dum) + (1))) = (1.0);
  (*((dum) + (2))) = (1.0);
  (nthreads) = (1);
# 84 "ep.c"
  printf
    ("\012\012 NAS Parallel Benchmarks 2.3 OpenMP C version - EP Benchmark\012");
# 86 "ep.c"
  sprintf (size, "%12.0f", pow (2.0, (28) + (1)));
# 87 "ep.c"
  for ((j) = (13); (j) >= (1); (j)--)
    {

# 88 "ep.c"
      if ((((int) (*((size) + (j))))) == (46))
	{
	  (*((size) + (j))) = (((char) (32)));
	}
    }
# 90 "ep.c"
  printf (" Number of random numbers generated: %13s\012", size);
# 92 "ep.c"
  (verified) = (0);
# 99 "ep.c"
  (np) = ((1) << ((28) - (16)));
# 107 "ep.c"
  vranlc (0, (dum) + (0), *((dum) + (1)), (dum) + (2));
# 108 "ep.c"
  (*((dum) + (0))) = (randlc ((dum) + (1), *((dum) + (2))));
# 109 "ep.c"
  for ((i) = (0); (i) < ((2) * ((1) << (16))); (i)++)
    {
      x[i] = (-(1.0E99));
      //(*((_ppthd_x) + (i))) = (-(1.0E99));
    }
# 110 "ep.c"
  (Mops) = (log (sqrt (fabs (((1.0) > (1.0)) ? (1.0) : (1.0)))));
# 112 "ep.c"
  timer_clear (1);
# 113 "ep.c"
  timer_clear (2);
# 114 "ep.c"
  timer_clear (3);
# 115 "ep.c"
  timer_start (1);
# 117 "ep.c"
  vranlc (0, &(t1), 1.220703125E9, x);
  //vranlc (0, &(t1), 1.220703125E9, _ppthd_x);
# 121 "ep.c"
  (t1) = (1.220703125E9);
# 123 "ep.c"
  for ((i) = (1); (i) <= ((16) + (1)); (i)++)
    {

# 124 "ep.c"
      (t2) = (randlc (&(t1), t1));
    }
# 127 "ep.c"
  (an) = (t1);
# 128 "ep.c"
  (tt) = (2.71828183E8);
# 129 "ep.c"
  (gc) = (0.0);
# 130 "ep.c"
  (sx) = (0.0);
# 131 "ep.c"
  (sy) = (0.0);
# 133 "ep.c"
  for ((i) = (0); (i) <= ((10) - (1)); (i)++)
    {

# 134 "ep.c"
      (*((q) + (i))) = (0.0);
    }
# 142 "ep.c"
  (k_offset) = (-(1));
  {
    auto void *__ompc_argv[6];
    (*(__ompc_argv)) = (((void *) (&sx)));
    (*((__ompc_argv) + (1))) = (((void *) (&sy)));
    (*((__ompc_argv) + (2))) = (((void *) (&np)));
    (*((__ompc_argv) + (3))) = (((void *) (&k_offset)));
    (*((__ompc_argv) + (4))) = (((void *) (&an)));
    (*((__ompc_argv) + (5))) = (((void *) (&nthreads)));
    _ompc_do_parallel (__ompc_func_3, __ompc_argv);
  }
# 207 "ep.c"
  for ((i) = (0); (i) <= ((10) - (1)); (i)++)
    {

# 208 "ep.c"
      (gc) = ((gc) + (*((q) + (i))));
    }
# 211 "ep.c"
  timer_stop (1);
# 212 "ep.c"
  (tm) = (timer_read (1));
# 214 "ep.c"
  (nit) = (0);
# 215 "ep.c"
  if ((28) == (24))
    {

# 216 "ep.c"
      if (((fabs (((sx) - (-(3247.83465203474))) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(6958.407078382297))) / (sy))) <= (1.0E-8)))
	{

# 218 "ep.c"
	  (verified) = (1);
	}
    }
  else
# 220 "ep.c"
  if ((28) == (25))
    {

# 221 "ep.c"
      if (((fabs (((sx) - (-(2863.319731645753))) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(6320.053679109499))) / (sy))) <= (1.0E-8)))
	{

# 223 "ep.c"
	  (verified) = (1);
	}
    }
  else
# 225 "ep.c"
  if ((28) == (28))
    {

# 226 "ep.c"
      if (((fabs (((sx) - (-(4295.875165629892))) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(15807.32573678431))) / (sy))) <= (1.0E-8)))
	{

# 228 "ep.c"
	  (verified) = (1);
          printf("Debug:ompc_manual. 359, sx is:%f, sy is:%f\n",sx,sy);
       }
     }

  else
# 230 "ep.c"
  if ((28) == (30))
    {

# 231 "ep.c"
      if (((fabs (((sx) - (40338.15542441498)) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(26606.69192809235))) / (sy))) <= (1.0E-8)))
	{

# 233 "ep.c"
	  (verified) = (1);
	}
    }
  else
# 235 "ep.c"
  if ((28) == (32))
    {

# 236 "ep.c"
      if (((fabs (((sx) - (47643.67927995374)) / (sx))) <= (1.0E-8))
	  && ((fabs (((sy) - (-(80840.72988043731))) / (sy))) <= (1.0E-8)))
	{

# 238 "ep.c"
	  (verified) = (1);
	}
    }
# 242 "ep.c"
  (Mops) = (((pow (2.0, (28) + (1))) / (tm)) / (1000000.0));
# 244 "ep.c"
  printf
    ("EP Benchmark Results: \012CPU Time = %10.4f\012N = 2^%5d\012No. Gaussian Pairs = %15.0f\012Sums = %25.15e %25.15e\012Counts:\012",
     tm, 28, gc, sx, sy);
# 251 "ep.c"
  for ((i) = (0); (i) <= ((10) - (1)); (i)++)
    {

# 252 "ep.c"
      printf ("%3d %15.0f\012", i, *((q) + (i)));
    }
# 255 "ep.c"
  c_print_results ("EP", 65, (28) + (1), 0, 0, nit, nthreads, tm, Mops,
		   "Random numbers generated", verified, "2.3", "07 Aug 2006",
		   "omcc", "$(CC)", "(none)", "-I../common", "-t", "-lm",
		   "randdp");
# 261 "ep.c"
  if ((0) == (1))
    {

# 262 "ep.c"
      printf ("Total time:     %f", timer_read (1));
# 263 "ep.c"
      printf ("Gaussian pairs: %f", timer_read (2));
# 264 "ep.c"
      printf ("Random numbers: %f", timer_read (3));
    }
}
Esempio n. 12
0
int main(int argc, char *argv[]) 
{
  double Mops, t1, t2;
  double tsx, tsy, tm, an, tt, gc;
  double sx_verify_value, sy_verify_value, sx_err, sy_err;
  int    i, nit;
  int    k_offset, j;
  logical verified;

  char   size[16];

  FILE *fp;

  if (argc == 1) {
    fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
    exit(-1);
  }

  if ((fp = fopen("timer.flag", "r")) == NULL) {
    timers_enabled = false;
  } else {
    timers_enabled = true;
    fclose(fp);
  }

  //--------------------------------------------------------------------
  //  Because the size of the problem is too large to store in a 32-bit
  //  integer for some classes, we put it into a string (for printing).
  //  Have to strip off the decimal point put in there by the floating
  //  point print statement (internal file)
  //--------------------------------------------------------------------

  sprintf(size, "%15.0lf", pow(2.0, M+1));
  j = 14;
  if (size[j] == '.') j--;
  size[j+1] = '\0';
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n");
  printf("\n Number of random numbers generated: %15s\n", size);

  verified = false;

  //--------------------------------------------------------------------
  //  Compute the number of "batches" of random number pairs generated 
  //  per processor. Adjust if the number of processors does not evenly 
  //  divide the total number
  //--------------------------------------------------------------------

  np = NN; 

  setup_opencl(argc, argv);

  timer_clear(0);
  timer_start(0);

  //--------------------------------------------------------------------
  //  Compute AN = A ^ (2 * NK) (mod 2^46).
  //--------------------------------------------------------------------

  t1 = A;

  for (i = 0; i < MK + 1; i++) {
    t2 = randlc(&t1, t1);
  }

  an = t1;
  tt = S;

  //--------------------------------------------------------------------
  //  Each instance of this loop may be performed independently. We compute
  //  the k offsets separately to take into account the fact that some nodes
  //  have more numbers to generate than others
  //--------------------------------------------------------------------

  k_offset = -1;

  DTIMER_START(T_KERNEL_EMBAR);

  // Launch the kernel
  int q_size  = GROUP_SIZE * NQ * sizeof(cl_double);
  int sx_size = GROUP_SIZE * sizeof(cl_double);
  int sy_size = GROUP_SIZE * sizeof(cl_double);
  err_code  = clSetKernelArg(kernel, 0, q_size, NULL);
  err_code |= clSetKernelArg(kernel, 1, sx_size, NULL);
  err_code |= clSetKernelArg(kernel, 2, sy_size, NULL);
  err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq);
  err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx);
  err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy);
  err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset);
  err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an);
  clu_CheckError(err_code, "clSetKernelArg()");
  
  size_t localWorkSize[] = { GROUP_SIZE };
  size_t globalWorkSize[] = { np };
  err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL,
                                    globalWorkSize, 
                                    localWorkSize,
                                    0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueNDRangeKernel()");
  CHECK_FINISH();
  DTIMER_STOP(T_KERNEL_EMBAR);

  double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size);
  double *gsx = (double*)malloc(gsx_size);
  double *gsy = (double*)malloc(gsy_size);

  gc  = 0.0;
  tsx = 0.0;
  tsy = 0.0;

  for (i = 0; i < NQ; i++) {
    q[i] = 0.0;
  }

  // 9. Get the result
  DTIMER_START(T_BUFFER_READ);
  err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, 
                                 gq, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, 
                                 gsx, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");

  err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, 
                                 gsy, 0, NULL, NULL);
  clu_CheckError(err_code, "clEnqueueReadbuffer()");
  DTIMER_STOP(T_BUFFER_READ);

  for (i = 0; i < np/localWorkSize[0]; i++) {
    for (j = 0; j < NQ; j++ ){
      q[j] = q[j] + gq[i][j];
    }
    tsx = tsx + gsx[i];
    tsy = tsy + gsy[i];
  }

  for (i = 0; i < NQ; i++) {
    gc = gc + q[i];
  }

  timer_stop(0);
  tm = timer_read(0);

  nit = 0;
  verified = true;
  if (M == 24) {
    sx_verify_value = -3.247834652034740e+3;
    sy_verify_value = -6.958407078382297e+3;
  } else if (M == 25) {
    sx_verify_value = -2.863319731645753e+3;
    sy_verify_value = -6.320053679109499e+3;
  } else if (M == 28) {
    sx_verify_value = -4.295875165629892e+3;
    sy_verify_value = -1.580732573678431e+4;
  } else if (M == 30) {
    sx_verify_value =  4.033815542441498e+4;
    sy_verify_value = -2.660669192809235e+4;
  } else if (M == 32) {
    sx_verify_value =  4.764367927995374e+4;
    sy_verify_value = -8.084072988043731e+4;
  } else if (M == 36) {
    sx_verify_value =  1.982481200946593e+5;
    sy_verify_value = -1.020596636361769e+5;
  } else if (M == 40) {
    sx_verify_value = -5.319717441530e+05;
    sy_verify_value = -3.688834557731e+05;
  } else {
    verified = false;
  }

  if (verified) {
    sx_err = fabs((tsx - sx_verify_value) / sx_verify_value);
    sy_err = fabs((tsy - sy_verify_value) / sy_verify_value);
    verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON));
  }

  Mops = pow(2.0, M+1) / tm / 1000000.0;

  printf("\nEP Benchmark Results:\n\n");
  printf("CPU Time =%10.4lf\n", tm);
  printf("N = 2^%5d\n", M);
  printf("No. Gaussian Pairs = %15.0lf\n", gc);
  printf("Sums = %25.15lE %25.15lE\n", tsx, tsy);
  printf("Counts: \n");
  for (i = 0; i < NQ; i++) {
    printf("%3d%15.0lf\n", i, q[i]);
  }

  c_print_results("EP", CLASS, M+1, 0, 0, nit,
      tm, Mops, 
      "Random numbers generated",
      verified, NPBVERSION, COMPILETIME, 
      CS1, CS2, CS3, CS4, CS5, CS6, CS7,
      clu_GetDeviceTypeName(device_type), device_name);

  if (timers_enabled) {
    if (tm <= 0.0) tm = 1.0;
    tt = timer_read(0);
    printf("\nTotal time:     %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
  }

  free(gq);
  free(gsx);
  free(gsy);
  release_opencl();

  fflush(stdout);

  return 0;
}
Esempio n. 13
0
/*
c   This is the serial version of the APP Benchmark 1,
c   the "embarassingly parallel" benchmark.
c
c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
c   numbers.  MK is the Log_2 of the size of each batch of uniform random
c   numbers.  MK can be set for convenience on a given system, since it does
c   not affect the results.
*/
int main(int argc, char **argv) {

    double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc;
    double dum[3] = { 1.0, 1.0, 1.0 };
    int np, ierr, node, no_nodes, i, ik, kk, l, k, nit, ierrcode,
	no_large_nodes, np_add, k_offset, j;
    int nthreads = 1;
    boolean verified;
    char size[13+1];	/* character*13 */

/*
c   Because the size of the problem is too large to store in a 32-bit
c   integer for some classes, we put it into a string (for printing).
c   Have to strip off the decimal point put in there by the floating
c   point print statement (internal file)
*/
#ifndef POSIX
#ifndef NOBOMP
    bomp_custom_init();
#endif
#endif
    omp_set_num_threads(1);
    printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version"
	   " - EP Benchmark\n");
    sprintf(size, "%12.0f", pow(2.0, M+1));
    for (j = 13; j >= 1; j--) {
	if (size[j] == '.') size[j] = ' ';
    }
    printf(" Number of random numbers generated: %13s\n", size);

    verified = FALSE;

/*
c   Compute the number of "batches" of random number pairs generated 
c   per processor. Adjust if the number of processors does not evenly 
c   divide the total number
*/
    np = NN;

/*
c   Call the random number generator functions and initialize
c   the x-array to reduce the effects of paging on the timings.
c   Also, call all mathematical functions that are used. Make
c   sure these initializations cannot be eliminated as dead code.
*/
    vranlc(0, &(dum[0]), dum[1], &(dum[2]));
    dum[0] = randlc(&(dum[1]), dum[2]);

    for (i = 0; i < 2*NK; i++) 
    {
	x[i] = -1.0e99;
    }        

    printf("Reached here ");
    Mops = log(sqrt(fabs(max(1.0, 1.0))));

    timer_clear(1);
    timer_clear(2);
    timer_clear(3);
    timer_start(1);

    vranlc(0, &t1, A, x);

/*   Compute AN = A ^ (2 * NK) (mod 2^46). */

    t1 = A;

    for ( i = 1; i <= MK+1; i++) {
	t2 = randlc(&t1, t1);
    }

    an = t1;
    tt = S;
    gc = 0.0;
    sx = 0.0;
    sy = 0.0;

    for ( i = 0; i <= NQ - 1; i++) {
	q[i] = 0.0;
    }
      
/*
c   Each instance of this loop may be performed independently. We compute
c   the k offsets separately to take into account the fact that some nodes
c   have more numbers to generate than others
*/
    k_offset = -1;

#pragma omp parallel copyin(x)
{
    double t1, t2, t3, t4, x1, x2;
    int kk, i, ik, l;
    double qq[NQ];		/* private copy of q[0:NQ-1] */

    for (i = 0; i < NQ; i++) qq[i] = 0.0;

#pragma omp for reduction(+:sx,sy) schedule(static)  
    for (k = 1; k <= np; k++) {
	kk = k_offset + k;
	t1 = S;
	t2 = an;

/*      Find starting seed t1 for this kk. */

	for (i = 1; i <= 100; i++) {
            ik = kk / 2;
            if (2 * ik != kk) t3 = randlc(&t1, t2);
            if (ik == 0) break;
            t3 = randlc(&t2, t2);
            kk = ik;
	}

/*      Compute uniform pseudorandom numbers. */

	if (TIMERS_ENABLED == TRUE) timer_start(3);
	vranlc(2*NK, &t1, A, x-1);
	if (TIMERS_ENABLED == TRUE) timer_stop(3);

/*
c       Compute Gaussian deviates by acceptance-rejection method and 
c       tally counts in concentric square annuli.  This loop is not 
c       vectorizable.
*/
	if (TIMERS_ENABLED == TRUE) timer_start(2);

	for ( i = 0; i < NK; i++) {
            x1 = 2.0 * x[2*i] - 1.0;
            x2 = 2.0 * x[2*i+1] - 1.0;
            t1 = pow2(x1) + pow2(x2);
            if (t1 <= 1.0) {
		t2 = sqrt(-2.0 * log(t1) / t1);
		t3 = (x1 * t2);				/* Xi */
		t4 = (x2 * t2);				/* Yi */
		l = max(fabs(t3), fabs(t4));
		qq[l] += 1.0;				/* counts */
		sx = sx + t3;				/* sum of Xi */
		sy = sy + t4;				/* sum of Yi */
            }
	}
	if (TIMERS_ENABLED == TRUE) timer_stop(2);
    }
#pragma omp critical
    {
      for (i = 0; i <= NQ - 1; i++) q[i] += qq[i];
    }
#if defined(_OPENMP)
#pragma omp master
    nthreads = omp_get_num_threads();
#endif /* _OPENMP */    
} /* end of parallel region */    

    for (i = 0; i <= NQ-1; i++) {
        gc = gc + q[i];
    }

    timer_stop(1);
    tm = timer_read(1);

    nit = 0;
    if (M == 24) {
	if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) &&
	   (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 25) {
	if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 28) {
	if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 30) {
	if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 32) {
	if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    }

    Mops = pow(2.0, M+1)/tm/1000000.0;

    printf("EP Benchmark Results: \n"
	   "CPU Time = %10.4f\n"
	   "N = 2^%5d\n"
	   "No. Gaussian Pairs = %15.0f\n"
	   "Sums = %25.15e %25.15e\n"
	   "Counts:\n",
	   tm, M, gc, sx, sy);
    for (i = 0; i  <= NQ-1; i++) {
	printf("%3d %15.0f\n", i, q[i]);
    }
	  
    c_print_results("EP", CLASS, M+1, 0, 0, nit, nthreads,
		  tm, Mops, 	
		  "Random numbers generated",
		  verified, NPBVERSION, COMPILETIME,
		  CS1, CS2, CS3, CS4, CS5, CS6, CS7);

    if (TIMERS_ENABLED == TRUE) {
	printf("Total time:     %f", timer_read(1));
	printf("Gaussian pairs: %f", timer_read(2));
	printf("Random numbers: %f", timer_read(3));
    }
}
Esempio n. 14
0
void appft(int niter, double *total_time, logical *verified)
{
  int i, j, k, kt, n12, n22, n32, ii, jj, kk, ii2, ik2;
  double ap;

  dcomplex exp1[NX], exp2[NY], exp3[NZ];

  for (i = 1; i <= 15; i++) {
    timer_clear(i);
  }         

  timer_start(2);      
  compute_initial_conditions(NX, NY, NZ, xnt);

  CompExp(NX, exp1);
  CompExp(NY, exp2);
  CompExp(NZ, exp3);          
  fftXYZ(1, NX, NY, NZ, xnt, (dcomplex *)y, exp1, exp2, exp3);
  timer_stop(2);

  timer_start(1);
  if (timers_enabled) timer_start(13);

  n12 = NX / 2;
  n22 = NY / 2;
  n32 = NZ / 2;
  ap = -4.0 * ALPHA * (PI * PI);
  for (i = 0; i < NZ; i++) {
    ii = i - (i / n32) * NZ;
    ii2 = ii * ii;
    for (k = 0; k < NY; k++) {
      kk = k - (k / n22) * NY;
      ik2 = ii2 + kk*kk;
      for (j = 0; j < NX; j++) {
        jj = j - (j / n12) * NX;
        twiddle[i][k][j] = exp(ap*(double)(jj*jj + ik2));
      }
    }
  }
  if (timers_enabled) timer_stop(13);

  if (timers_enabled) timer_start(12);
  compute_initial_conditions(NX, NY, NZ, xnt);
  if (timers_enabled) timer_stop(12);
  if (timers_enabled) timer_start(15);
  fftXYZ(1, NX, NY, NZ, xnt, (dcomplex *)y, exp1, exp2, exp3);
  if (timers_enabled) timer_stop(15);

  for (kt = 1; kt <= niter; kt++) {
    if (timers_enabled) timer_start(11);
    evolve(NX, NY, NZ, xnt, y, twiddle);
    if (timers_enabled) timer_stop(11);
    if (timers_enabled) timer_start(15);
    fftXYZ(-1, NX, NY, NZ, xnt, (dcomplex *)xnt, exp1, exp2, exp3);
    if (timers_enabled) timer_stop(15);
    if (timers_enabled) timer_start(10);
    CalculateChecksum(&sums[kt], kt, NX, NY, NZ, xnt);
    if (timers_enabled) timer_stop(10);
  }

  // Verification test.
  if (timers_enabled) timer_start(14);
  verify(NX, NY, NZ, niter, sums, verified);
  if (timers_enabled) timer_stop(14);
  timer_stop(1);

  *total_time = timer_read(1);
  if (!timers_enabled) return;

  printf(" FT subroutine timers \n");
  printf(" %26s =%9.4f\n", "FT total                  ", timer_read(1));
  printf(" %26s =%9.4f\n", "WarmUp time               ", timer_read(2));
  printf(" %26s =%9.4f\n", "fftXYZ body               ", timer_read(3));
  printf(" %26s =%9.4f\n", "Swarztrauber              ", timer_read(4));
  printf(" %26s =%9.4f\n", "X time                    ", timer_read(7));
  printf(" %26s =%9.4f\n", "Y time                    ", timer_read(8));
  printf(" %26s =%9.4f\n", "Z time                    ", timer_read(9));
  printf(" %26s =%9.4f\n", "CalculateChecksum         ", timer_read(10));
  printf(" %26s =%9.4f\n", "evolve                    ", timer_read(11));
  printf(" %26s =%9.4f\n", "compute_initial_conditions", timer_read(12));
  printf(" %26s =%9.4f\n", "twiddle                   ", timer_read(13));
  printf(" %26s =%9.4f\n", "verify                    ", timer_read(14));
  printf(" %26s =%9.4f\n", "fftXYZ                    ", timer_read(15));
  printf(" %26s =%9.4f\n", "Benchmark time            ", *total_time);
}
Esempio n. 15
0
int main(int argc, char *argv[])
{
  int i, niter, step;
  double navg, mflops, n3;

  double tmax, t, trecs[t_last+1];
  logical verified;
  char Class;
  char *t_names[t_last+1];

  //---------------------------------------------------------------------
  // Root node reads input file (if it exists) else takes
  // defaults from parameters
  //---------------------------------------------------------------------
  FILE *fp;
  if ((fp = fopen("timer.flag", "r")) != NULL) {
    timeron = true;
    t_names[t_total] = "total";
    t_names[t_rhsx] = "rhsx";
    t_names[t_rhsy] = "rhsy";
    t_names[t_rhsz] = "rhsz";
    t_names[t_rhs] = "rhs";
    t_names[t_xsolve] = "xsolve";
    t_names[t_ysolve] = "ysolve";
    t_names[t_zsolve] = "zsolve";
    t_names[t_rdis1] = "redist1";
    t_names[t_rdis2] = "redist2";
    t_names[t_add] = "add";
    fclose(fp);
  } else {
    timeron = false;
  }

  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OMP-C) - BT Benchmark\n\n");

  if ((fp = fopen("inputbt.data", "r")) != NULL) {
    int result;
    printf(" Reading from input file inputbt.data\n");
    result = fscanf(fp, "%d", &niter);
    while (fgetc(fp) != '\n');
    result = fscanf(fp, "%lf", &dt);
    while (fgetc(fp) != '\n');
    result = fscanf(fp, "%d%d%d\n", 
        &grid_points[0], &grid_points[1], &grid_points[2]);
    fclose(fp);
  } else {
    printf(" No input file inputbt.data. Using compiled defaults\n");
    niter = NITER_DEFAULT;
    dt    = DT_DEFAULT;
    grid_points[0] = PROBLEM_SIZE;
    grid_points[1] = PROBLEM_SIZE;
    grid_points[2] = PROBLEM_SIZE;
  }

  printf(" Size: %4dx%4dx%4d\n",
      grid_points[0], grid_points[1], grid_points[2]);
  printf(" Iterations: %4d       dt: %11.7f\n", niter, dt);
  printf(" Number of available threads: %5d\n", omp_get_max_threads());
  printf("\n");

  if ( (grid_points[0] > IMAX) ||
       (grid_points[1] > JMAX) ||
       (grid_points[2] > KMAX) ) {
    printf(" %d, %d, %d\n", grid_points[0], grid_points[1], grid_points[2]);
    printf(" Problem size too big for compiled array sizes\n");
    return 0;
  }

  set_constants();

  for (i = 1; i <= t_last; i++) {
    timer_clear(i);
  }

  initialize();

  exact_rhs();

  //---------------------------------------------------------------------
  // do one time step to touch all code, and reinitialize
  //---------------------------------------------------------------------
  adi();
  initialize();

  for (i = 1; i <= t_last; i++) {
    timer_clear(i);
  }
  timer_start(1);

// Do not do inlining, to avoid huge loops, scops are kept separate and are
// distributed among files.
//#pragma scop
  for (step = 1; step <= niter; step++) {
    adi();
  }
//#pragma endscop

  timer_stop(1);
  tmax = timer_read(1);

  verify(niter, &Class, &verified);

  n3 = 1.0*grid_points[0]*grid_points[1]*grid_points[2];
  navg = (grid_points[0]+grid_points[1]+grid_points[2])/3.0;
  if(tmax != 0.0) {
    mflops = 1.0e-6 * (double)niter *
      (3478.8 * n3 - 17655.7 * (navg*navg) + 28023.7 * navg)
      / tmax;
  } else {
    mflops = 0.0;
  }
  print_results("BT", Class, grid_points[0], 
                grid_points[1], grid_points[2], niter,
                tmax, mflops, "          floating point", 
                verified, NPBVERSION,COMPILETIME, CS1, CS2, CS3, CS4, CS5, 
                CS6, "(none)");

  //---------------------------------------------------------------------
  // More timers
  //---------------------------------------------------------------------
  if (timeron) {
    for (i = 1; i <= t_last; i++) {
      trecs[i] = timer_read(i);
    }
    if (tmax == 0.0) tmax = 1.0;

    printf("  SECTION   Time (secs)\n");
    for (i = 1; i <= t_last; i++) {
      printf("  %-8s:%9.3f  (%6.2f%%)\n", 
          t_names[i], trecs[i], trecs[i]*100./tmax);
      if (i == t_rhs) {
        t = trecs[t_rhsx] + trecs[t_rhsy] + trecs[t_rhsz];
        printf("    --> %8s:%9.3f  (%6.2f%%)\n", "sub-rhs", t, t*100./tmax);
        t = trecs[t_rhs] - t;
        printf("    --> %8s:%9.3f  (%6.2f%%)\n", "rest-rhs", t, t*100./tmax);
      } else if (i==t_zsolve) {
        t = trecs[t_zsolve] - trecs[t_rdis1] - trecs[t_rdis2];
        printf("    --> %8s:%9.3f  (%6.2f%%)\n", "sub-zsol", t, t*100./tmax);
      } else if (i==t_rdis2) {
        t = trecs[t_rdis1] + trecs[t_rdis2];
        printf("    --> %8s:%9.3f  (%6.2f%%)\n", "redist", t, t*100./tmax);
      }
    }
  }

  return 0;
}
Esempio n. 16
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  int i;
  size_t temp, wg_num;
  cl_int ecode;
  char *source_dir = "LU";

  if (timeron) {
    timer_clear(TIMER_OPENCL);
    timer_clear(TIMER_BUILD);
    timer_clear(TIMER_BUFFER);
    timer_clear(TIMER_RELEASE);

    timer_start(TIMER_OPENCL);
  }

  if (argc > 1) source_dir = argv[1];

  //-----------------------------------------------------------------------
  // 1. Find the default device type and get a device for the device type
  //-----------------------------------------------------------------------
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(cl_uint),
                          &max_compute_units,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ////////////////////////////////////////////////////////////////////////
  // FIXME: The below values are experimental.
  size_t default_wg_size = 64;
  if (device_type == CL_DEVICE_TYPE_CPU) {
    if (CLASS == 'B') default_wg_size = 128;
  } else {
    if (CLASS == 'B') default_wg_size = 32;
  }
  if (max_work_group_size > default_wg_size) {
    max_work_group_size = default_wg_size;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > default_wg_size) {
        work_item_sizes[i] = default_wg_size;
      }
    }
  }
  if (device_type == CL_DEVICE_TYPE_CPU) {
    SETBV1_DIM = SETBV1_DIM_CPU;
    SETBV2_DIM = SETBV2_DIM_CPU;
    SETBV3_DIM = SETBV3_DIM_CPU;
    SETIV_DIM = SETIV_DIM_CPU;
    ERHS1_DIM = ERHS1_DIM_CPU;
    ERHS2_DIM = ERHS2_DIM_CPU;
    ERHS3_DIM = ERHS3_DIM_CPU;
    ERHS4_DIM = ERHS4_DIM_CPU;
    PINTGR1_DIM = PINTGR1_DIM_CPU;
    PINTGR2_DIM = PINTGR2_DIM_CPU;
    PINTGR3_DIM = PINTGR3_DIM_CPU;
    RHS_DIM  = RHS_DIM_CPU;
    RHSX_DIM = RHSX_DIM_CPU;
    RHSY_DIM = RHSY_DIM_CPU;
    RHSZ_DIM = RHSZ_DIM_CPU;
    SSOR2_DIM = SSOR2_DIM_CPU;
    SSOR3_DIM = SSOR3_DIM_CPU;
  } else {
    SETBV1_DIM = SETBV1_DIM_GPU;
    SETBV2_DIM = SETBV2_DIM_GPU;
    SETBV3_DIM = SETBV3_DIM_GPU;
    SETIV_DIM = SETIV_DIM_GPU;
    ERHS1_DIM = ERHS1_DIM_GPU;
    ERHS2_DIM = ERHS2_DIM_GPU;
    ERHS3_DIM = ERHS3_DIM_GPU;
    ERHS4_DIM = ERHS4_DIM_GPU;
    PINTGR1_DIM = PINTGR1_DIM_GPU;
    PINTGR2_DIM = PINTGR2_DIM_GPU;
    PINTGR3_DIM = PINTGR3_DIM_GPU;
    RHS_DIM  = RHS_DIM_GPU;
    RHSX_DIM = RHSX_DIM_GPU;
    RHSY_DIM = RHSY_DIM_GPU;
    RHSZ_DIM = RHSZ_DIM_GPU;
    SSOR2_DIM = SSOR2_DIM_GPU;
    SSOR3_DIM = SSOR3_DIM_GPU;
  }
  ////////////////////////////////////////////////////////////////////////

  //-----------------------------------------------------------------------
  // 2. Create a context for the specified device
  //-----------------------------------------------------------------------
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  //-----------------------------------------------------------------------
  // 3. Create command queues
  //-----------------------------------------------------------------------
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  max_pipeline = (jend-jst) < max_compute_units ? (jend-jst) : max_compute_units;
  pipe_queue = (cl_command_queue *)malloc(sizeof(cl_command_queue) * max_pipeline);
  for (i = 0; i < max_pipeline; i++) {
    pipe_queue[i] = clCreateCommandQueue(context, device, 0, &ecode);
    clu_CheckError(ecode, "clCreateCommandQueue()");
  }

  //-----------------------------------------------------------------------
  // 4. Build programs
  //-----------------------------------------------------------------------
  if (timeron) timer_start(TIMER_BUILD);
  char build_option[100];

  if (device_type == CL_DEVICE_TYPE_CPU) {
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS);
  } else {
    sprintf(build_option, "-I. -DCLASS=\'%c\'", CLASS);
  }

  p_pre = clu_MakeProgram(context, device, source_dir,
                          "kernel_pre.cl",
                          build_option);

  p_main = clu_MakeProgram(context, device, source_dir,
                          (device_type == CL_DEVICE_TYPE_CPU ? "kernel_main_cpu.cl" : "kernel_main_gpu.cl"),
                          build_option);

  p_post = clu_MakeProgram(context, device, source_dir,
                          "kernel_post.cl",
                          build_option);
  if (timeron) timer_stop(TIMER_BUILD);

  //-----------------------------------------------------------------------
  // 5. Create buffers
  //-----------------------------------------------------------------------
  if (timeron) timer_start(TIMER_BUFFER);
  m_ce = clCreateBuffer(context,
                        CL_MEM_READ_ONLY,
                        sizeof(double)*5*13,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_ce");

  m_u = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u");

  m_rsd = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rsd");

  m_frct = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_frct");

  m_qs = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1),
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_qs");

  m_rho_i = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1),
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_rho_i");

  // workspace for work-items
  size_t max_work_items;
  if (ERHS2_DIM == 1 && ERHS3_DIM == 1 && ERHS4_DIM == 1) {
    max_work_items = ISIZ3;
  } else {
    max_work_items = ISIZ3*ISIZ2;
  }
  m_flux = clCreateBuffer(context,
                       CL_MEM_READ_WRITE,
                       sizeof(double)*ISIZ1*5 * max_work_items,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_flux");

  if (RHSZ_DIM == 1) {
    max_work_items = ISIZ2;
  } else {
    max_work_items = ISIZ2*ISIZ1;
  }

  if (device_type == CL_DEVICE_TYPE_CPU) {
    m_utmp = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sizeof(double)*ISIZ3*6 * max_work_items,
                         NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_utmp");

    m_rtmp = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sizeof(double)*ISIZ3*5 * max_work_items,
                         NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_rtmp");
  }

  temp = (nz0-2) / max_compute_units;
  l2norm_lws[0] = temp == 0 ? 1 : temp;
  l2norm_gws[0] = clu_RoundWorkSize((size_t)(nz0-2), l2norm_lws[0]);
  wg_num = l2norm_gws[0] / l2norm_lws[0];
  sum_size = sizeof(double) * 5 * wg_num;
  m_sum = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sum_size, 
                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer()");

  if (timeron) timer_stop(TIMER_BUFFER);

  //-----------------------------------------------------------------------
  // 6. Create kernels
  //-----------------------------------------------------------------------
  k_setbv1 = clCreateKernel(p_pre, "setbv1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setbv1");
  ecode  = clSetKernelArg(k_setbv1, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setbv1, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setbv1, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setbv1, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setbv1, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETBV1_DIM == 3) {
    setbv1_lws[0] = 5;
    temp = max_work_group_size / setbv1_lws[0];
    setbv1_lws[1] = nx < temp ? nx : temp;
    temp = temp / setbv1_lws[1];
    setbv1_lws[2] = ny < temp ? ny : temp;
    setbv1_gws[0] = clu_RoundWorkSize((size_t)5, setbv1_lws[0]);
    setbv1_gws[1] = clu_RoundWorkSize((size_t)nx, setbv1_lws[1]);
    setbv1_gws[2] = clu_RoundWorkSize((size_t)ny, setbv1_lws[2]);
  } else if (SETBV1_DIM == 2) {
    setbv1_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0];
    temp = max_work_group_size / setbv1_lws[0];
    setbv1_lws[1] = ny < temp ? ny : temp;
    setbv1_gws[0] = clu_RoundWorkSize((size_t)nx, setbv1_lws[0]);
    setbv1_gws[1] = clu_RoundWorkSize((size_t)ny, setbv1_lws[1]);
  } else {
    temp = ny / max_compute_units;
    setbv1_lws[0] = temp == 0 ? 1 : temp;
    setbv1_gws[0] = clu_RoundWorkSize((size_t)ny, setbv1_lws[0]);
  }

  k_setbv2 = clCreateKernel(p_pre, "setbv2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setbv2");
  ecode  = clSetKernelArg(k_setbv2, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setbv2, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setbv2, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setbv2, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setbv2, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETBV2_DIM == 3) {
    setbv2_lws[0] = 5;
    temp = max_work_group_size / setbv2_lws[0];
    setbv2_lws[1] = nx < temp ? nx : temp;
    temp = temp / setbv2_lws[1];
    setbv2_lws[2] = nz < temp ? nz : temp;
    setbv2_gws[0] = clu_RoundWorkSize((size_t)5, setbv2_lws[0]);
    setbv2_gws[1] = clu_RoundWorkSize((size_t)nx, setbv2_lws[1]);
    setbv2_gws[2] = clu_RoundWorkSize((size_t)nz, setbv2_lws[2]);
  } else if (SETBV2_DIM == 2) {
    setbv2_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0];
    temp = max_work_group_size / setbv2_lws[0];
    setbv2_lws[1] = nz < temp ? nz : temp;
    setbv2_gws[0] = clu_RoundWorkSize((size_t)nx, setbv2_lws[0]);
    setbv2_gws[1] = clu_RoundWorkSize((size_t)nz, setbv2_lws[1]);
  } else {
    temp = nz / max_compute_units;
    setbv2_lws[0] = temp == 0 ? 1 : temp;
    setbv2_gws[0] = clu_RoundWorkSize((size_t)nz, setbv2_lws[0]);
  }

  k_setbv3 = clCreateKernel(p_pre, "setbv3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setbv3");
  ecode  = clSetKernelArg(k_setbv3, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setbv3, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setbv3, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setbv3, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setbv3, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETBV3_DIM == 3) {
    setbv3_lws[0] = 5;
    temp = max_work_group_size / setbv3_lws[0];
    setbv3_lws[1] = ny < temp ? ny : temp;
    temp = temp / setbv3_lws[1];
    setbv3_lws[2] = nz < temp ? nz : temp;
    setbv3_gws[0] = clu_RoundWorkSize((size_t)5, setbv3_lws[0]);
    setbv3_gws[1] = clu_RoundWorkSize((size_t)ny, setbv3_lws[1]);
    setbv3_gws[2] = clu_RoundWorkSize((size_t)nz, setbv3_lws[2]);
  } else if (SETBV3_DIM == 2) {
    setbv3_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0];
    temp = max_work_group_size / setbv3_lws[0];
    setbv3_lws[1] = nz < temp ? nz : temp;
    setbv3_gws[0] = clu_RoundWorkSize((size_t)ny, setbv3_lws[0]);
    setbv3_gws[1] = clu_RoundWorkSize((size_t)nz, setbv3_lws[1]);
  } else {
    temp = nz / max_compute_units;
    setbv3_lws[0] = temp == 0 ? 1 : temp;
    setbv3_gws[0] = clu_RoundWorkSize((size_t)nz, setbv3_lws[0]);
  }

  k_setiv = clCreateKernel(p_pre, "setiv", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for setiv");
  ecode  = clSetKernelArg(k_setiv, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_setiv, 1, sizeof(cl_mem), &m_ce);
  ecode |= clSetKernelArg(k_setiv, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_setiv, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_setiv, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SETIV_DIM == 3) {
    setiv_lws[0] = (nx-2) < work_item_sizes[0] ? (nx-2) : work_item_sizes[0];
    temp = max_work_group_size / setiv_lws[0];
    setiv_lws[1] = (ny-2) < temp ? (ny-2) : temp;
    temp = temp / setiv_lws[1];
    setiv_lws[2] = (nz-2) < temp ? (nz-2) : temp;
    setiv_gws[0] = clu_RoundWorkSize((size_t)(nx-2), setiv_lws[0]);
    setiv_gws[1] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[1]);
    setiv_gws[2] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[2]);
  } else if (SETIV_DIM == 2) {
    setiv_lws[0] = (ny-2) < work_item_sizes[0] ? (ny-2) : work_item_sizes[0];
    temp = max_work_group_size / setiv_lws[0];
    setiv_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    setiv_gws[0] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[0]);
    setiv_gws[1] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[1]);
  } else {
    temp = (nz-2) / max_compute_units;
    setiv_lws[0] = temp == 0 ? 1 : temp;
    setiv_gws[0] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[0]);
  }

  k_l2norm = clCreateKernel(p_main, "l2norm", &ecode);
  clu_CheckError(ecode, "clCreateKernel()");
  ecode  = clSetKernelArg(k_l2norm, 1, sizeof(cl_mem), &m_sum);
  ecode |= clSetKernelArg(k_l2norm, 2, sizeof(double)*5*l2norm_lws[0], NULL);
  clu_CheckError(ecode, "clSetKernelArg()");

  k_rhs = clCreateKernel(p_main, "rhs", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhs");
  ecode  = clSetKernelArg(k_rhs, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhs, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhs, 2, sizeof(cl_mem), &m_frct);
  ecode |= clSetKernelArg(k_rhs, 3, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhs, 4, sizeof(cl_mem), &m_rho_i);
  ecode |= clSetKernelArg(k_rhs, 5, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_rhs, 6, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_rhs, 7, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHS_DIM == 3) {
    rhs_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0];
    temp = max_work_group_size / rhs_lws[0];
    rhs_lws[1] = ny < temp ? ny : temp;
    temp = temp / rhs_lws[1];
    rhs_lws[2] = nz < temp ? nz : temp;
    rhs_gws[0] = clu_RoundWorkSize((size_t)nx, rhs_lws[0]);
    rhs_gws[1] = clu_RoundWorkSize((size_t)ny, rhs_lws[1]);
    rhs_gws[2] = clu_RoundWorkSize((size_t)nz, rhs_lws[2]);
  } else if (RHS_DIM == 2) {
    rhs_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0];
    temp = max_work_group_size / rhs_lws[0];
    rhs_lws[1] = nz < temp ? nz : temp;
    rhs_gws[0] = clu_RoundWorkSize((size_t)ny, rhs_lws[0]);
    rhs_gws[1] = clu_RoundWorkSize((size_t)nz, rhs_lws[1]);
  } else {
    //temp = nz / max_compute_units;
    temp = 1;
    rhs_lws[0] = temp == 0 ? 1 : temp;
    rhs_gws[0] = clu_RoundWorkSize((size_t)nz, rhs_lws[0]);
  }

  k_rhsx = clCreateKernel(p_main, "rhsx", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhsx");
  ecode  = clSetKernelArg(k_rhsx, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhsx, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhsx, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhsx, 3, sizeof(cl_mem), &m_rho_i);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_rhsx, 4, sizeof(cl_mem), &m_flux);
    ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsx, 7, sizeof(int), &nz);
  } else {
    ecode |= clSetKernelArg(k_rhsx, 4, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &nz);
  }
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHSX_DIM == 2) {
    rhsx_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
    temp = max_work_group_size / rhsx_lws[0];
    rhsx_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    rhsx_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsx_lws[0]);
    rhsx_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    rhsx_lws[0] = temp == 0 ? 1 : temp;
    rhsx_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[0]);
  }

  k_rhsy = clCreateKernel(p_main, "rhsy", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhsy");
  ecode  = clSetKernelArg(k_rhsy, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhsy, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhsy, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhsy, 3, sizeof(cl_mem), &m_rho_i);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_rhsy, 4, sizeof(cl_mem), &m_flux);
    ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsy, 7, sizeof(int), &nz);
  } else {
    ecode |= clSetKernelArg(k_rhsy, 4, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &nz);
  }
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHSY_DIM == 2) {
    rhsy_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / rhsy_lws[0];
    rhsy_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    rhsy_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsy_lws[0]);
    rhsy_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    rhsy_lws[0] = temp == 0 ? 1 : temp;
    rhsy_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[0]);
  }

  k_rhsz = clCreateKernel(p_main, "rhsz", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for rhsz");
  ecode  = clSetKernelArg(k_rhsz, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_rhsz, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_rhsz, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_rhsz, 3, sizeof(cl_mem), &m_rho_i);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_rhsz, 4, sizeof(cl_mem), &m_flux);
    ecode |= clSetKernelArg(k_rhsz, 5, sizeof(cl_mem), &m_utmp);
    ecode |= clSetKernelArg(k_rhsz, 6, sizeof(cl_mem), &m_rtmp);
    ecode |= clSetKernelArg(k_rhsz, 7, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsz, 8, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsz, 9, sizeof(int), &nz);
  } else {
    ecode |= clSetKernelArg(k_rhsz, 4, sizeof(int), &nx);
    ecode |= clSetKernelArg(k_rhsz, 5, sizeof(int), &ny);
    ecode |= clSetKernelArg(k_rhsz, 6, sizeof(int), &nz);
  }
  clu_CheckError(ecode, "clSetKernelArg()");
  if (RHSZ_DIM == 2) {
    rhsz_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / rhsz_lws[0];
    rhsz_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
    rhsz_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsz_lws[0]);
    rhsz_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[1]);
  } else {
    //temp = (jend-jst) / max_compute_units;
    temp = 1;
    rhsz_lws[0] = temp == 0 ? 1 : temp;
    rhsz_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[0]);
  }

  k_ssor2 = clCreateKernel(p_main, "ssor2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for ssor2");
  ecode  = clSetKernelArg(k_ssor2, 0, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_ssor2, 2, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_ssor2, 3, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_ssor2, 4, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SSOR2_DIM == 3) {
    ssor2_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / ssor2_lws[0];
    ssor2_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
    temp = temp / ssor2_lws[1];
    ssor2_lws[2] = (nz-2) < temp ? (nz-2) : temp;
    ssor2_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor2_lws[0]);
    ssor2_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[1]);
    ssor2_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[2]);
  } else if (SSOR2_DIM == 2) {
    ssor2_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
    temp = max_work_group_size / ssor2_lws[0];
    ssor2_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    ssor2_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[0]);
    ssor2_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    ssor2_lws[0] = temp == 0 ? 1 : temp;
    ssor2_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[0]);
  }

  k_ssor3 = clCreateKernel(p_main, "ssor3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for ssor3");
  ecode  = clSetKernelArg(k_ssor3, 0, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_ssor3, 1, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_ssor3, 3, sizeof(int), &nx);
  ecode |= clSetKernelArg(k_ssor3, 4, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_ssor3, 5, sizeof(int), &nz);
  clu_CheckError(ecode, "clSetKernelArg()");
  if (SSOR3_DIM == 3) {
    ssor3_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0];
    temp = max_work_group_size / ssor3_lws[0];
    ssor3_lws[1] = (jend-jst) < temp ? (jend-jst) : temp;
    temp = temp / ssor3_lws[1];
    ssor3_lws[2] = (nz-2) < temp ? (nz-2) : temp;
    ssor3_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor3_lws[0]);
    ssor3_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[1]);
    ssor3_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[2]);
  } else if (SSOR3_DIM == 2) {
    ssor3_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
    temp = max_work_group_size / ssor3_lws[0];
    ssor3_lws[1] = (nz-2) < temp ? (nz-2) : temp;
    ssor3_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[0]);
    ssor3_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[1]);
  } else {
    //temp = (nz-2) / max_compute_units;
    temp = 1;
    ssor3_lws[0] = temp == 0 ? 1 : temp;
    ssor3_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[0]);
  }

  k_blts = clCreateKernel(p_main, "blts", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for blts");
  ecode  = clSetKernelArg(k_blts, 0, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_blts, 1, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_blts, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_blts, 3, sizeof(cl_mem), &m_rho_i);
  ecode |= clSetKernelArg(k_blts, 4, sizeof(int), &nz);
  ecode |= clSetKernelArg(k_blts, 5, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_blts, 6, sizeof(int), &nx);
  clu_CheckError(ecode, "clSetKernelArg()");
  blts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
  temp = max_work_group_size / blts_lws[0];
  blts_lws[1] = (nz-2) < temp ? (nz-2) : temp;
  blts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), blts_lws[0]);
  blts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), blts_lws[1]);

  k_buts = clCreateKernel(p_main, "buts", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for buts");
  ecode  = clSetKernelArg(k_buts, 0, sizeof(cl_mem), &m_rsd);
  ecode |= clSetKernelArg(k_buts, 1, sizeof(cl_mem), &m_u);
  ecode |= clSetKernelArg(k_buts, 2, sizeof(cl_mem), &m_qs);
  ecode |= clSetKernelArg(k_buts, 3, sizeof(cl_mem), &m_rho_i);
  ecode |= clSetKernelArg(k_buts, 4, sizeof(int), &nz);
  ecode |= clSetKernelArg(k_buts, 5, sizeof(int), &ny);
  ecode |= clSetKernelArg(k_buts, 6, sizeof(int), &nx);
  clu_CheckError(ecode, "clSetKernelArg()");
  buts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0];
  temp = max_work_group_size / buts_lws[0];
  buts_lws[1] = (nz-2) < temp ? (nz-2) : temp;
  buts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), buts_lws[0]);
  buts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), buts_lws[1]);

  if (timeron) timer_stop(TIMER_OPENCL);
}
Esempio n. 17
0
/***** Task Led(Toggle) *****/
void ledTask() {
	static uint8 mode = 0;
	if (timer_isfired(ON_WTD_TIMER_ID)) {
		wdt_reset();

		MDP_NRFSendDimmingReqToMDP(1, 1);
		timer_clear(ON_WTD_TIMER_ID);
		timer_set(ON_WTD_TIMER_ID, 100);
	}

	if (timer_isfired(ON_TEST_TIMER_ID)) {
		if (rotary_GetValue() == 0x00) {
			if (mode == 0) {
				mode++;
				lastSendDimmingLevel[0] = 0x34;
				lastSendDimmingLevel[1] = 0x34;
				lastSendDimmingLevel[2] = 0x34;
				lastSendDimmingLevel[3] = 0x34;
				MDP_SendDimmingReqToMDP(1, 1);
//				MDP_TESTSendDimmingReqToMDP(0xff, 0x33);
			} else if (mode == 1) {
				mode++;
				lastSendDimmingLevel[0] = 0x67;
				lastSendDimmingLevel[1] = 0x67;
				lastSendDimmingLevel[2] = 0x67;
				lastSendDimmingLevel[3] = 0x67;
				MDP_SendDimmingReqToMDP(1, 1);
//				MDP_TESTSendDimmingReqToMDP(0xff, 0x66);
			} else if (mode == 2) {
				mode++;
				lastSendDimmingLevel[0] = 0x9A;
				lastSendDimmingLevel[1] = 0x9A;
				lastSendDimmingLevel[2] = 0x9A;
				lastSendDimmingLevel[3] = 0x9A;
				MDP_SendDimmingReqToMDP(1, 1);
//				MDP_TESTSendDimmingReqToMDP(0xff, 0x99);
			} else if (mode == 3) {
				mode++;
				lastSendDimmingLevel[0] = 0xCD;
				lastSendDimmingLevel[1] = 0xCD;
				lastSendDimmingLevel[2] = 0xCD;
				lastSendDimmingLevel[3] = 0xCD;
				MDP_SendDimmingReqToMDP(1, 1);
//				MDP_TESTSendDimmingReqToMDP(0xff, 0xcc);
			} else if (mode == 4) {
				mode = 0;
				lastSendDimmingLevel[0] = 0xFE;
				lastSendDimmingLevel[1] = 0xFE;
				lastSendDimmingLevel[2] = 0xFE;
				lastSendDimmingLevel[3] = 0xFE;
				MDP_SendDimmingReqToMDP(1, 1);
//				MDP_TESTSendDimmingReqToMDP(0xff, 0xFE);
			} else {
				mode = 0;
			}
//			MDP_SendSetWatchdogReqToMDP(0);
		} else {
			MDP_SendDimmingReqToMDP(1, 1);

		}

		timer_clear(ON_TEST_TIMER_ID);
		timer_set(ON_TEST_TIMER_ID, 3000);
	}
}
Esempio n. 18
0
/*
c   This is the serial version of the APP Benchmark 1,
c   the "embarassingly parallel" benchmark.
c
c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
c   numbers.  MK is the Log_2 of the size of each batch of uniform random
c   numbers.  MK can be set for convenience on a given system, since it does
c   not affect the results.
*/
int main(int argc, char **argv) {

    double *x, **xx, *q, **qq;

    double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc;
    double dum[3] = { 1.0, 1.0, 1.0 };
    const int TRANSFER_X = 1;
    int np, nn, ierr, node, no_nodes, i, l, k, nit, ierrcode,
    no_large_nodes, np_add, k_offset, j;
    double loc_x,loc_t1,loc_t2,loc_t3,loc_t4;
    double loc_a1,loc_a2,loc_x1,loc_x2,loc_z;
    boolean verified;
    char size[13+1];	/* character*13 */
    
/*     Allocate working memory       */

    x = (double*) malloc(sizeof(double) * 2*NK);
    xx = (double**) malloc(sizeof(double*) * NN);
    xx[0] = (double*) malloc(sizeof(double) * NN * 2*NK);
    for (i = 1; i < NN; i++) xx[i] = xx[i-1] + (2*NK);
    q = (double*) malloc(sizeof(double) * NQ);
    qq = (double**) malloc(sizeof(double*) * NN);
    qq[0] = (double*) malloc(sizeof(double) * NN * NQ);
    for (i = 1; i < NN; i++) qq[i] = qq[i-1] + NQ;

/*
c   Because the size of the problem is too large to store in a 32-bit
c   integer for some classes, we put it into a string (for printing).
c   Have to strip off the decimal point put in there by the floating
c   point print statement (internal file)
*/

    printf("\n\n NAS Parallel Benchmarks 2.3 OpenACC C version"
	   " - EP Benchmark\n");
    sprintf(size, "%12.0f", pow(2.0, M+1));
    for (j = 13; j >= 1; j--) {
	if (size[j] == '.') size[j] = ' ';
    }
    printf(" Number of random numbers generated: %13s\n", size);

    verified = FALSE;

/*
c   Compute the number of "batches" of random number pairs generated 
c   per processor. Adjust if the number of processors does not evenly 
c   divide the total number
*/
    np = NN;

/*
c   Call the random number generator functions and initialize
c   the x-array to reduce the effects of paging on the timings.
c   Also, call all mathematical functions that are used. Make
c   sure these initializations cannot be eliminated as dead code.
*/
#pragma acc data create(qq[0:NN][0:NQ],x[0:2*NK],xx[0:NN][0:2*NK]) \
    copyout(q[0:NQ])
{
    vranlc(0, &(dum[0]), dum[1], &(dum[2]));
    dum[0] = randlc(&(dum[1]), dum[2]);
    for (i = 0; i < 2*NK; i++) x[i] = -1.0e99;
    Mops = log(sqrt(fabs(max(1.0, 1.0))));

    timer_clear(1);
    timer_clear(2);
    timer_clear(3);
    timer_start(1);

    vranlc(0, &t1, A, x);
    #pragma acc update device(x[0:2*NK])

/*   Compute AN = A ^ (2 * NK) (mod 2^46). */

    t1 = A;

    for ( i = 1; i <= MK+1; i++) {
      t2 = randlc(&t1, t1);
    }

    an = t1;
    tt = S;
    gc = 0.0;
    sx = 0.0;
    sy = 0.0;
    
    #pragma acc parallel loop
    for (k = 0; k < np; k++) {
      /* Initialize private q (qq) */
      #pragma acc loop
      for (i = 0; i < NQ; i++)
          qq[k][i] = 0.0;
      /* Initialize private x (xx)  */
      #pragma acc loop
      for (i = 0; i < 2*NK; i++)
          xx[k][i] = x[i];
    }
      
/*
c   Each instance of this loop may be performed independently. We compute
c   the k offsets separately to take into account the fact that some nodes
c   have more numbers to generate than others
*/
    k_offset = -1;

    double t1, t2, t3, t4, x1, x2;
    int kk, i, ik, l;
    double psx, psy;

    #pragma acc parallel loop reduction(+:sx,sy)
    for (k = 1; k <= np; k++) {
      kk = k_offset + k;
      t1 = S;
      t2 = an;

/*      Find starting seed t1 for this kk. */

      #pragma acc loop seq
      for (i = 1; i <= 100; i++) {
          ik = kk / 2;
          if (2 * ik != kk) t3 = RANDLC(&t1, t2);
          if (ik == 0) break;
          t3 = RANDLC(&t2, t2);
          kk = ik;
      }

/*      Compute uniform pseudorandom numbers. */

      loc_t1 = r23 * A;
      loc_a1 = (int)loc_t1;
      loc_a2 = A - t23 * loc_a1;
      loc_x = t1;

      #pragma acc loop seq
      for (i = 1; i <= 2*NK; i++) {
          loc_t1 = r23 * loc_x;
          loc_x1 = (int)loc_t1;
          loc_x2 = loc_x - t23 * loc_x1;
          loc_t1 = loc_a1 * loc_x2 + loc_a2 * loc_x1;
          loc_t2 = (int)(r23 * loc_t1);
          loc_z = loc_t1 - t23 * loc_t2;
          loc_t3 = t23 * loc_z + loc_a2 * loc_x2;
          loc_t4 = (int)(r46 * loc_t3);
          loc_x = loc_t3 - t46 * loc_t4;
          xx[k-1][i-1] = r46 * loc_x;
      }
      t1 = loc_x;

/*
c       Compute Gaussian deviates by acceptance-rejection method and 
c       tally counts in concentric square annuli.  This loop is not 
c       vectorizable.
*/
 
      psx = psy = 0.0;

      #pragma acc loop reduction(+:psx,psy)
      for ( i = 0; i < NK; i++) {
          x1 = 2.0 * xx[k-1][2*i] - 1.0;
          x2 = 2.0 * xx[k-1][2*i+1] - 1.0;
          t1 = pow2(x1) + pow2(x2);
          if (t1 <= 1.0) {
            t2 = sqrt(-2.0 * log(t1) / t1);
            t3 = (x1 * t2);             /* Xi */
            t4 = (x2 * t2);             /* Yi */
            l = max(fabs(t3), fabs(t4));
            qq[k-1][l] += 1.0;                      /* counts */
            psx = psx + t3;  /* sum of Xi */
            psy = psy + t4;               /* sum of Yi */
          }
      }

      sx += psx;
      sy += psy;
      
    }
    
/*      Reduce private qq to q          */
    #pragma acc parallel loop reduction(+:gc)
    for ( i = 0; i < NQ; i++ ) {
      double sumq = 0.0;
      #pragma acc loop reduction(+:sumq)
      for (k = 0; k < np; k++)
          sumq = sumq + qq[k][i];
      q[i] = sumq;
      gc += sumq;
    }

} /* end acc data */

    timer_stop(1);
    tm = timer_read(1);

    nit = 0;
    if (M == 24) {
	if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) &&
	   (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 25) {
	if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 28) {
	if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 30) {
	if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    } else if (M == 32) {
	if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) {
	    verified = TRUE;
	}
    }

    Mops = pow(2.0, M+1)/tm/1000000.0;

    printf("EP Benchmark Results: \n"
	   "CPU Time = %10.4f\n"
	   "N = 2^%5d\n"
	   "No. Gaussian Pairs = %15.0f\n"
	   "Sums = %25.15e %25.15e\n"
	   "Counts:\n",
	   tm, M, gc, sx, sy);
    for (i = 0; i  <= NQ-1; i++) {
	printf("%3d %15.0f\n", i, q[i]);
    }
	  
    c_print_results("EP", CLASS, M+1, 0, 0, nit,
          tm, Mops, "Random numbers generated",
		  verified, NPBVERSION, COMPILETIME,
		  CS1, CS2, CS3, CS4, CS5, CS6, CS7);

    return 0;
}
Esempio n. 19
0
int main( int argc, char **argv )
{

    int             i, iteration, timer_on;

    double          timecounter;

    FILE            *fp;


/*  Initialize timers  */
    timer_on = 0;            
    if ((fp = fopen("timer.flag", "r")) != NULL) {
        fclose(fp);
        timer_on = 1;
    }
    timer_clear( 0 );
    if (timer_on) {
        timer_clear( 1 );
        timer_clear( 2 );
        timer_clear( 3 );
    }

    if (timer_on) timer_start( 3 );


/*  Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( CLASS )
        {
            case 'S':
                test_index_array[i] = S_test_index_array[i];
                test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                test_index_array[i] = A_test_index_array[i];
                test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                test_index_array[i] = W_test_index_array[i];
                test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                test_index_array[i] = B_test_index_array[i];
                test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                test_index_array[i] = C_test_index_array[i];
                test_rank_array[i]  = C_test_rank_array[i];
                break;
            case 'D':
                test_index_array[i] = D_test_index_array[i];
                test_rank_array[i]  = D_test_rank_array[i];
                break;
        };

        

/*  Printout initial NPB info */
    printf
      ( "\n\n NAS Parallel Benchmarks (NPB3.3-SER) - IS Benchmark\n\n" );
    printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS, CLASS );
    printf( " Iterations:   %d\n", MAX_ITERATIONS );

    if (timer_on) timer_start( 1 );

/*  Generate random number sequence and subsequent keys on all procs */
    create_seq( 314159265.00,                    /* Random number gen seed */
                1220703125.00 );                 /* Random number gen mult */
    if (timer_on) timer_stop( 1 );


/*  Do one interation for free (i.e., untimed) to guarantee initialization of  
    all data and code pages and respective tables */
    rank( 1 );  

/*  Start verification counter */
    passed_verification = 0;

    if( CLASS != 'S' ) printf( "\n   iteration\n" );

/*  Start timer  */             
    timer_start( 0 );


/*  This is the main iteration */
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
    {
        if( CLASS != 'S' ) printf( "        %d\n", iteration );
        rank( iteration );
    }


/*  End of timing, obtain maximum time of all processors */
    timer_stop( 0 );
    timecounter = timer_read( 0 );


/*  This tests that keys are in sequence: sorting of last ranked key seq
    occurs here, but is an untimed operation                             */
    if (timer_on) timer_start( 2 );
    full_verify();
    if (timer_on) timer_stop( 2 );

    if (timer_on) timer_stop( 3 );


/*  The final printout  */
    if( passed_verification != 5*MAX_ITERATIONS + 1 )
        passed_verification = 0;
    c_print_results( "IS",
                     CLASS,
                     (int)(TOTAL_KEYS/64),
                     64,
                     0,
                     MAX_ITERATIONS,
                     timecounter,
                     ((double) (MAX_ITERATIONS*TOTAL_KEYS))
                                                  /timecounter/1000000.,
                     "keys ranked", 
                     passed_verification,
                     NPBVERSION,
                     COMPILETIME,
                     CC,
                     CLINK,
                     C_LIB,
                     C_INC,
                     CFLAGS,
                     CLINKFLAGS );


/*  Print additional timers  */
    if (timer_on) {
       double t_total, t_percent;

       t_total = timer_read( 3 );
       printf("\nAdditional timers -\n");
       printf(" Total execution: %8.3f\n", t_total);
       if (t_total == 0.0) t_total = 1.0;
       timecounter = timer_read(1);
       t_percent = timecounter/t_total * 100.;
       printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent);
       timecounter = timer_read(0);
       t_percent = timecounter/t_total * 100.;
       printf(" Benchmarking   : %8.3f (%5.2f%%)\n", timecounter, t_percent);
       timecounter = timer_read(2);
       t_percent = timecounter/t_total * 100.;
       printf(" Sorting        : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    }


    return 0;
         /**************************/
}        /*  E N D  P R O G R A M  */
Esempio n. 20
0
int main() 
{
  double Mops, t1, t2, t3, t4, x1, x2;
  double sx, sy, tm, an, tt, gc;
  double sx_verify_value, sy_verify_value, sx_err, sy_err;
  int    np;
  int    i, ik, kk, l, k, nit;
  int    k_offset, j;
  logical verified, timers_enabled;

  double dum[3] = {1.0, 1.0, 1.0};
  char   size[16];

  FILE *fp;

  if ((fp = fopen("timer.flag", "r")) == NULL) {
    timers_enabled = false;
  } else {
    timers_enabled = true;
    fclose(fp);
  }

  //--------------------------------------------------------------------
  //  Because the size of the problem is too large to store in a 32-bit
  //  integer for some classes, we put it into a string (for printing).
  //  Have to strip off the decimal point put in there by the floating
  //  point print statement (internal file)
  //--------------------------------------------------------------------

  sprintf(size, "%15.0lf", pow(2.0, M+1));
  j = 14;
  if (size[j] == '.') j--;
  size[j+1] = '\0';
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-SER-C) - EP Benchmark\n");
  printf("\n Number of random numbers generated: %15s\n", size);

  verified = false;

  //--------------------------------------------------------------------
  //  Compute the number of "batches" of random number pairs generated 
  //  per processor. Adjust if the number of processors does not evenly 
  //  divide the total number
  //--------------------------------------------------------------------

  np = NN; 

  //--------------------------------------------------------------------
  //  Call the random number generator functions and initialize
  //  the x-array to reduce the effects of paging on the timings.
  //  Also, call all mathematical functions that are used. Make
  //  sure these initializations cannot be eliminated as dead code.
  //--------------------------------------------------------------------

  vranlc(0, &dum[0], dum[1], &dum[2]);
  dum[0] = randlc(&dum[1], dum[2]);
  for (i = 0; i < 2 * NK; i++) {
    x[i] = -1.0e99;
  }
  Mops = log(sqrt(fabs(MAX(1.0, 1.0))));   

  timer_clear(0);
  timer_clear(1);
  timer_clear(2);
  timer_start(0);

  t1 = A;
  vranlc(0, &t1, A, x);

  //--------------------------------------------------------------------
  //  Compute AN = A ^ (2 * NK) (mod 2^46).
  //--------------------------------------------------------------------

  t1 = A;

  for (i = 0; i < MK + 1; i++) {
    t2 = randlc(&t1, t1);
  }

  an = t1;
  tt = S;
  gc = 0.0;
  sx = 0.0;
  sy = 0.0;

  for (i = 0; i < NQ; i++) {
    q[i] = 0.0;
  }

  //--------------------------------------------------------------------
  //  Each instance of this loop may be performed independently. We compute
  //  the k offsets separately to take into account the fact that some nodes
  //  have more numbers to generate than others
  //--------------------------------------------------------------------

  k_offset = -1;

  for (k = 1; k <= np; k++) {
    kk = k_offset + k; 
    t1 = S;
    t2 = an;

    // Find starting seed t1 for this kk.

    for (i = 1; i <= 100; i++) {
      ik = kk / 2;
      if ((2 * ik) != kk) t3 = randlc(&t1, t2);
      if (ik == 0) break;
      t3 = randlc(&t2, t2);
      kk = ik;
    }

    //--------------------------------------------------------------------
    //  Compute uniform pseudorandom numbers.
    //--------------------------------------------------------------------
    if (timers_enabled) timer_start(2);
    vranlc(2 * NK, &t1, A, x);
    if (timers_enabled) timer_stop(2);

    //--------------------------------------------------------------------
    //  Compute Gaussian deviates by acceptance-rejection method and 
    //  tally counts in concentri//square annuli.  This loop is not 
    //  vectorizable. 
    //--------------------------------------------------------------------
    if (timers_enabled) timer_start(1);

    for (i = 0; i < NK; i++) {
      x1 = 2.0 * x[2*i] - 1.0;
      x2 = 2.0 * x[2*i+1] - 1.0;
      t1 = x1 * x1 + x2 * x2;
      if (t1 <= 1.0) {
        t2   = sqrt(-2.0 * log(t1) / t1);
        t3   = (x1 * t2);
        t4   = (x2 * t2);
        l    = MAX(fabs(t3), fabs(t4));
        q[l] = q[l] + 1.0;
        sx   = sx + t3;
        sy   = sy + t4;
      }
    }

    if (timers_enabled) timer_stop(1);
  }

  for (i = 0; i < NQ; i++) {
    gc = gc + q[i];
  }

  timer_stop(0);
  tm = timer_read(0);

  nit = 0;
  verified = true;
  if (M == 24) {
    sx_verify_value = -3.247834652034740e+3;
    sy_verify_value = -6.958407078382297e+3;
  } else if (M == 25) {
    sx_verify_value = -2.863319731645753e+3;
    sy_verify_value = -6.320053679109499e+3;
  } else if (M == 28) {
    sx_verify_value = -4.295875165629892e+3;
    sy_verify_value = -1.580732573678431e+4;
  } else if (M == 30) {
    sx_verify_value =  4.033815542441498e+4;
    sy_verify_value = -2.660669192809235e+4;
  } else if (M == 32) {
    sx_verify_value =  4.764367927995374e+4;
    sy_verify_value = -8.084072988043731e+4;
  } else if (M == 36) {
    sx_verify_value =  1.982481200946593e+5;
    sy_verify_value = -1.020596636361769e+5;
  } else if (M == 40) {
    sx_verify_value = -5.319717441530e+05;
    sy_verify_value = -3.688834557731e+05;
  } else {
    verified = false;
  }

  if (verified) {
    sx_err = fabs((sx - sx_verify_value) / sx_verify_value);
    sy_err = fabs((sy - sy_verify_value) / sy_verify_value);
    verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON));
  }

  Mops = pow(2.0, M+1) / tm / 1000000.0;

  printf("\nEP Benchmark Results:\n\n");
  printf("CPU Time =%10.4lf\n", tm);
  printf("N = 2^%5d\n", M);
  printf("No. Gaussian Pairs = %15.0lf\n", gc);
  printf("Sums = %25.15lE %25.15lE\n", sx, sy);
  printf("Counts: \n");
  for (i = 0; i < NQ; i++) {
    printf("%3d%15.0lf\n", i, q[i]);
  }

  print_results("EP", CLASS, M+1, 0, 0, nit,
      tm, Mops, 
      "Random numbers generated",
      verified, NPBVERSION, COMPILETIME, CS1,
      CS2, CS3, CS4, CS5, CS6, CS7);

  if (timers_enabled) {
    if (tm <= 0.0) tm = 1.0;
    tt = timer_read(0);
    printf("\nTotal time:     %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
    tt = timer_read(1);
    printf("Gaussian pairs: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
    tt = timer_read(2);
    printf("Random numbers: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
  }

  return 0;
}
Esempio n. 21
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char *argv[])
{
  size_t temp;
  cl_int ecode;
  char *source_dir = "FT";
  if (argc > 1) source_dir = argv[1];

#ifdef TIMER_DETAIL
  if (timers_enabled) {
    int i;
    for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i);
  }
#endif

  DTIMER_START(T_OPENCL_API);

  // 1. Find the default device type and get a device for the device type
  device_type = clu_GetDefaultDeviceType();
  device      = clu_GetAvailableDevice(device_type);
  device_name = clu_GetDeviceName(device);

  // Device information
  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_ITEM_SIZES,
                          sizeof(work_item_sizes),
                          &work_item_sizes,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t),
                          &max_work_group_size,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // FIXME: The below values are experimental.
  if (max_work_group_size > 64) {
    max_work_group_size = 64;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 64) {
        work_item_sizes[i] = 64;
      }
    }
  }

  ecode = clGetDeviceInfo(device,
                          CL_DEVICE_MAX_COMPUTE_UNITS,
                          sizeof(cl_uint),
                          &max_compute_units,
                          NULL);
  clu_CheckError(ecode, "clGetDiviceInfo()");

  // 2. Create a context for the specified device
  context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = clCreateCommandQueue(context, device, 0, &ecode);
  clu_CheckError(ecode, "clCreateCommandQueue()");

  DTIMER_STOP(T_OPENCL_API);

  // 4. Build the program
  DTIMER_START(T_BUILD);
  char *source_file;
  char build_option[50];
  if (device_type == CL_DEVICE_TYPE_CPU) {
    source_file = "ft_cpu.cl";
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS);

    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_CPU;
    EVOLVE_DIM = EVOLVE_DIM_CPU;
    CFFTS_DIM = CFFTS_DIM_CPU;

  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    char vendor[50];
    ecode = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 50, vendor, NULL);
    clu_CheckError(ecode, "clGetDeviceInfo()");
    if (strncmp(vendor, DEV_VENDOR_NVIDIA, strlen(DEV_VENDOR_NVIDIA)) == 0) {
      source_file = "ft_gpu_nvidia.cl";
      CFFTS_LSIZE = 32;
    } else {
      source_file = "ft_gpu.cl";
      CFFTS_LSIZE = 64;
    }

    sprintf(build_option, "-I. -DCLASS=\'%c\' -DLSIZE=%lu",
            CLASS, CFFTS_LSIZE);

    COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_GPU;
    EVOLVE_DIM = EVOLVE_DIM_GPU;
    CFFTS_DIM = CFFTS_DIM_GPU;

  } else {
    fprintf(stderr, "Set the environment variable OPENCL_DEVICE_TYPE!\n");
    exit(EXIT_FAILURE);
  }
  program = clu_MakeProgram(context, device, source_dir, source_file,
                            build_option);
  DTIMER_STOP(T_BUILD);

  // 5. Create buffers
  DTIMER_START(T_BUFFER_CREATE);
  m_u = clCreateBuffer(context,
                       CL_MEM_READ_ONLY,
                       sizeof(dcomplex) * NXP,
                       NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u");

  m_u0 = clCreateBuffer(context,
                        CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u0");

  m_u1 = clCreateBuffer(context,
                        CL_MEM_READ_WRITE,
                        sizeof(dcomplex) * NTOTALP,
                        NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_u1");

  m_twiddle = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double) * NTOTALP,
                             NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_twiddle");

  if (device_type == CL_DEVICE_TYPE_CPU) {
    size_t ty1_size, ty2_size;
    if (CFFTS_DIM == 2) {
      ty1_size = sizeof(dcomplex) * NX * NY * NZ;
      ty2_size = sizeof(dcomplex) * NX * NY * NZ;
    } else {
      fprintf(stderr, "Wrong CFFTS_DIM: %u\n", CFFTS_DIM);
      exit(EXIT_FAILURE);
    }

    m_ty1 = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           ty1_size,
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty1");

    m_ty2 = clCreateBuffer(context,
                           CL_MEM_READ_WRITE,
                           ty2_size,
                           NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ty2");
  }

  if (device_type == CL_DEVICE_TYPE_CPU) {
    temp = 1024 / max_compute_units;
    checksum_local_ws  = temp == 0 ? 1 : temp;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  } else if (device_type == CL_DEVICE_TYPE_GPU) {
    checksum_local_ws  = 32;
    checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws);
  }
  checksum_wg_num = checksum_global_ws / checksum_local_ws;
  m_chk = clCreateBuffer(context,
                         CL_MEM_READ_WRITE,
                         sizeof(dcomplex) * checksum_wg_num,
                         NULL, &ecode);
  clu_CheckError(ecode, "clCreateBuffer() for m_chk");
  g_chk = (dcomplex *)malloc(sizeof(dcomplex) * checksum_wg_num);
  DTIMER_STOP(T_BUFFER_CREATE);

  // 6. Create kernels
  DTIMER_START(T_OPENCL_API);
  double ap = -4.0 * ALPHA * PI * PI;
  int d1 = dims[0];
  int d2 = dims[1];
  int d3 = dims[2];

  k_compute_indexmap = clCreateKernel(program, "compute_indexmap", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_indexmap");
  ecode  = clSetKernelArg(k_compute_indexmap, 0, sizeof(cl_mem), &m_twiddle);
  ecode |= clSetKernelArg(k_compute_indexmap, 1, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_indexmap, 2, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_indexmap, 3, sizeof(int), &d3);
  ecode |= clSetKernelArg(k_compute_indexmap, 4, sizeof(double), &ap);
  clu_CheckError(ecode, "clSetKernelArg() for compute_indexmap");
  if (COMPUTE_IMAP_DIM == 3) {
    cimap_lws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d2 < temp ? d2 : temp;
    temp = temp / cimap_lws[1];
    cimap_lws[2] = d3 < temp ? d3 : temp;

    cimap_gws[0] = clu_RoundWorkSize((size_t)d1, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d2, cimap_lws[1]);
    cimap_gws[2] = clu_RoundWorkSize((size_t)d3, cimap_lws[2]);
  } else if (COMPUTE_IMAP_DIM == 2) {
    cimap_lws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0];
    temp = max_work_group_size / cimap_lws[0];
    cimap_lws[1] = d3 < temp ? d3 : temp;

    cimap_gws[0] = clu_RoundWorkSize((size_t)d2, cimap_lws[0]);
    cimap_gws[1] = clu_RoundWorkSize((size_t)d3, cimap_lws[1]);
  } else {
    //temp = d3 / max_compute_units;
    temp = 1;
    cimap_lws[0] = temp == 0 ? 1 : temp;
    cimap_gws[0] = clu_RoundWorkSize((size_t)d3, cimap_lws[0]);
  }

  k_compute_ics = clCreateKernel(program,
                                 "compute_initial_conditions", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for compute_initial_conditions");
  ecode  = clSetKernelArg(k_compute_ics, 2, sizeof(int), &d1);
  ecode |= clSetKernelArg(k_compute_ics, 3, sizeof(int), &d2);
  ecode |= clSetKernelArg(k_compute_ics, 4, sizeof(int), &d3);
  clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions");

  k_cffts1 = clCreateKernel(program, "cffts1", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts1");
  ecode  = clSetKernelArg(k_cffts1, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts1, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts1, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts1");

  k_cffts2 = clCreateKernel(program, "cffts2", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts2");
  ecode  = clSetKernelArg(k_cffts2, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts2, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts2, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts2");

  k_cffts3 = clCreateKernel(program, "cffts3", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for cffts3");
  ecode  = clSetKernelArg(k_cffts3, 2, sizeof(cl_mem), &m_u);
  if (device_type == CL_DEVICE_TYPE_CPU) {
    ecode |= clSetKernelArg(k_cffts3, 8, sizeof(cl_mem), &m_ty1);
    ecode |= clSetKernelArg(k_cffts3, 9, sizeof(cl_mem), &m_ty2);
  }
  clu_CheckError(ecode, "clSetKernelArg() for k_cffts3");

  k_evolve = clCreateKernel(program, "evolve", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for evolve");

  k_checksum = clCreateKernel(program, "checksum", &ecode);
  clu_CheckError(ecode, "clCreateKernel() for checksum");
  ecode  = clSetKernelArg(k_checksum, 1, sizeof(cl_mem), &m_chk);
  ecode |= clSetKernelArg(k_checksum, 2, sizeof(dcomplex)*checksum_local_ws,
                          NULL);
  ecode |= clSetKernelArg(k_checksum, 3, sizeof(int), &dims[0]);
  ecode |= clSetKernelArg(k_checksum, 4, sizeof(int), &dims[1]);
  clu_CheckError(ecode, "clSetKernelArg() for checksum");
  DTIMER_STOP(T_OPENCL_API);
}
Esempio n. 22
0
int main(int argc, char** argv )
{

    int             i, iteration, itemp;
    int		    nthreads = 1;
    double          timecounter, maxtime;



/*  Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( CLASS )
        {
            case 'S':
                test_index_array[i] = S_test_index_array[i];
                test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                test_index_array[i] = A_test_index_array[i];
                test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                test_index_array[i] = W_test_index_array[i];
                test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                test_index_array[i] = B_test_index_array[i];
                test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                test_index_array[i] = C_test_index_array[i];
                test_rank_array[i]  = C_test_rank_array[i];
                break;
        };

        

/*  Printout initial NPB info */
    printf( "\n\n NAS Parallel Benchmarks 2.3 OpenMP C version"
	    " - IS Benchmark\n\n" );
    printf( " Size:  %d  (class %c)\n", TOTAL_KEYS, CLASS );
    printf( " Iterations:   %d\n", MAX_ITERATIONS );

/*  Initialize timer  */             
    timer_clear( 0 );

/*  Generate random number sequence and subsequent keys on all procs */
    create_seq( 314159265.00,                    /* Random number gen seed */
                1220703125.00 );                 /* Random number gen mult */


/*  Do one interation for free (i.e., untimed) to guarantee initialization of  
    all data and code pages and respective tables */
#pragma omp parallel    
    rank( 1 );  

/*  Start verification counter */
    passed_verification = 0;

    if( CLASS != 'S' ) printf( "\n   iteration\n" );

/*  Start timer  */             
    timer_start( 0 );


/*  This is the main iteration */
    
#pragma omp parallel private(iteration)    
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
    {
#pragma omp master	
        if( CLASS != 'S' ) printf( "        %d\n", iteration );
	
        rank( iteration );
	
#if defined(_OPENMP)	
#pragma omp master
	nthreads = omp_get_num_threads();
#endif /* _OPENMP */	
    }

/*  End of timing, obtain maximum time of all processors */
    timer_stop( 0 );
    timecounter = timer_read( 0 );


/*  This tests that keys are in sequence: sorting of last ranked key seq
    occurs here, but is an untimed operation                             */
    full_verify();



/*  The final printout  */
    if( passed_verification != 5*MAX_ITERATIONS + 1 )
        passed_verification = 0;
    c_print_results( "IS",
                     CLASS,
                     TOTAL_KEYS,
                     0,
                     0,
                     MAX_ITERATIONS,
		     nthreads,
                     timecounter,
                     ((double) (MAX_ITERATIONS*TOTAL_KEYS))
                                                  /timecounter/1000000.,
                     "keys ranked", 
                     passed_verification,
                     NPBVERSION,
                     COMPILETIME,
                     CC,
                     CLINK,
                     C_LIB,
                     C_INC,
                     CFLAGS,
                     CLINKFLAGS,
		     "randlc2");


  return 0;
         /**************************/
}        /*  E N D  P R O G R A M  */
Esempio n. 23
0
int main( int argc, char **argv )
{
    MPI_Init(&argc,&argv);
    
    INT_TYPE chunk;
    int ini, fim;
    int             i, j, iteration, timer_on;
    double          timecounter;
    FILE            *fp;
    int myrank;
    MPI_Status      st;

    MPI_Comm_rank(MPI_COMM_WORLD,&myrank); 
    MPI_Comm_size(MPI_COMM_WORLD,&NUM_THREADS);

    if (myrank == 0) {
/*  Initialize timers  */
    timer_on = 0;            
    if ((fp = fopen("timer.flag", "r")) != NULL) {
        fclose(fp);
        timer_on = 1;
    }
    timer_clear( 0 );
    if (timer_on) {
        timer_clear( 1 );
        timer_clear( 2 );
        timer_clear( 3 );
    }

    if (timer_on) timer_start( 3 );


/*  Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( CLASS )
        {
            case 'S':
                test_index_array[i] = S_test_index_array[i];
                test_rank_array[i]  = S_test_rank_array[i];
                break;
            case 'A':
                test_index_array[i] = A_test_index_array[i];
                test_rank_array[i]  = A_test_rank_array[i];
                break;
            case 'W':
                test_index_array[i] = W_test_index_array[i];
                test_rank_array[i]  = W_test_rank_array[i];
                break;
            case 'B':
                test_index_array[i] = B_test_index_array[i];
                test_rank_array[i]  = B_test_rank_array[i];
                break;
            case 'C':
                test_index_array[i] = C_test_index_array[i];
                test_rank_array[i]  = C_test_rank_array[i];
                break;
            case 'D':
                test_index_array[i] = D_test_index_array[i];
                test_rank_array[i]  = D_test_rank_array[i];
                break;
        };

        

/*  Printout initial NPB info */
    printf
      ( "\n\n NAS Parallel Benchmarks (NPB3.3-SER) - IS Benchmark\n\n" );
    printf( " Size:  %ld  (class %c)\n", (long)TOTAL_KEYS, CLASS );
    printf( " Number of available threads:  %d\n", NUM_THREADS );
    printf( " Iterations:   %d\n", MAX_ITERATIONS );

    if (timer_on) timer_start( 1 );
  }

    R23 = pow(2, -23);
    T23 = pow(2,  23);
    R46 = pow(2, -46);
    T46 = pow(2,  46);

/*  Generate random number sequence and subsequent keys on all procs */
    create_seq(myrank);

    if (myrank == 0) {
      // sincronizar resultados
      for (i = 1; i < NUM_THREADS; i++) {
        chunk = (NUM_KEYS + NUM_THREADS - 1) / NUM_THREADS;
        ini = chunk * i;
        fim = ini + chunk;
        if ( fim > NUM_KEYS ) {
          fim = NUM_KEYS;
        }
        MPI_Recv( &aux_key_array[ini], (fim - ini), MPI_INT, i, 0, MPI_COMM_WORLD, &st );
        for (j = ini; j < fim; j++) {
          key_array[j] = aux_key_array[j];
        }
      }
    } else {
      chunk = (NUM_KEYS + NUM_THREADS - 1) / NUM_THREADS;
      ini = chunk * myrank;
      fim = ini + chunk;
      if ( fim > NUM_KEYS ) {
        fim = NUM_KEYS;
      }
      // enviar resultados
      MPI_Send( &key_array[ini], (fim - ini), MPI_INT, 0, 0, MPI_COMM_WORLD );
    }


    if (myrank == 0) {
    if (timer_on) {
      timer_stop( 1 );
    }


/*  Do one interation for free (i.e., untimed) to guarantee initialization of  
    all data and code pages and respective tables */
    rank( 1 );  

/*  Start verification counter */
    passed_verification = 0;

    if( CLASS != 'S' ) printf( "\n   iteration\n" );

/*  Start timer  */             
    timer_start( 0 );


/*  This is the main iteration */
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
    {
        if( CLASS != 'S' ) printf( "        %d\n", iteration );
        rank( iteration );
    }


/*  End of timing, obtain maximum time of all processors */
    timer_stop( 0 );
    timecounter = timer_read( 0 );


/*  This tests that keys are in sequence: sorting of last ranked key seq
    occurs here, but is an untimed operation                             */
    if (timer_on) timer_start( 2 );
    full_verify();
    if (timer_on) timer_stop( 2 );

    if (timer_on) timer_stop( 3 );


/*  The final printout  */
    if( passed_verification != 5*MAX_ITERATIONS + 1 )
        passed_verification = 0;
    c_print_results( "IS",
                     CLASS,
                     (int)(TOTAL_KEYS/64),
                     64,
                     0,
                     MAX_ITERATIONS,
                     timecounter,
                     ((double) (MAX_ITERATIONS*TOTAL_KEYS))
                                                  /timecounter/1000000.,
                     "keys ranked", 
                     passed_verification,
                     NPBVERSION,
                     COMPILETIME,
                     CC,
                     CLINK,
                     C_LIB,
                     C_INC,
                     CFLAGS,
                     CLINKFLAGS );


/*  Print additional timers  */
    if (timer_on) {
       double t_total, t_percent;

       t_total = timer_read( 3 );
       printf("\nAdditional timers -\n");
       printf(" Total execution: %8.3f\n", t_total);
       if (t_total == 0.0) t_total = 1.0;
       timecounter = timer_read(1);
       t_percent = timecounter/t_total * 100.;
       printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent);
       timecounter = timer_read(0);
       t_percent = timecounter/t_total * 100.;
       printf(" Benchmarking   : %8.3f (%5.2f%%)\n", timecounter, t_percent);
       timecounter = timer_read(2);
       t_percent = timecounter/t_total * 100.;
       printf(" Sorting        : %8.3f (%5.2f%%)\n", timecounter, t_percent);
    }
  }

    MPI_Finalize();
    return 0;
         /**************************/
}        /*  E N D  P R O G R A M  */
int main(int argc, char *argv[])
{
  int step, ie, iside, i, j, k;
  double mflops, tmax, nelt_tot = 0.0;
  char Class;
  logical ifmortar = false, verified;

  double t2, trecs[t_last+1];
  char *t_names[t_last+1];

	//--------------------------------------------------------------------
	// Initialize NUMA control
	//--------------------------------------------------------------------
	numa_initialize_env(NUMA_MIGRATE_EXISTING);

  //---------------------------------------------------------------------
  // Read input file (if it exists), else take
  // defaults from parameters
  //---------------------------------------------------------------------
  FILE *fp;
  if ((fp = fopen("timer.flag", "r")) != NULL) {
    timeron = true;
    t_names[t_total] = "total";
    t_names[t_init] = "init";
    t_names[t_convect] = "convect";
    t_names[t_transfb_c] = "transfb_c";
    t_names[t_diffusion] = "diffusion";
    t_names[t_transf] = "transf";
    t_names[t_transfb] = "transfb";
    t_names[t_adaptation] = "adaptation";
    t_names[t_transf2] = "transf+b";
    t_names[t_add2] = "add2";
    fclose(fp);
  } else {
    timeron = false;
  }

  printf("\n\n NAS Parallel Benchmarks (NPB3.3-OMP-C) - UA Benchmark\n\n");

  if ((fp = fopen("inputua.data", "r")) != NULL) {
    int result;
    printf(" Reading from input file inputua.data\n");
    result = fscanf(fp, "%d", &fre);
    while (fgetc(fp) != '\n');
    result = fscanf(fp, "%d", &niter);
    while (fgetc(fp) != '\n');
    result = fscanf(fp, "%d", &nmxh);
    while (fgetc(fp) != '\n');
    result = fscanf(fp, "%lf", &alpha);
    Class = 'U';
    fclose(fp);
  } else {
    printf(" No input file inputua.data. Using compiled defaults\n");
    fre   = FRE_DEFAULT;
    niter = NITER_DEFAULT;
    nmxh  = NMXH_DEFAULT;
    alpha = ALPHA_DEFAULT;
    Class = CLASS_DEFAULT;
  }

  dlmin = pow(0.5, REFINE_MAX);
  dtime = 0.04*dlmin;

  printf(" Levels of refinement:        %8d\n", REFINE_MAX);
  printf(" Adaptation frequency:        %8d\n", fre);
  printf(" Time steps:                  %8d    dt: %15.6E\n", niter, dtime);
  printf(" CG iterations:               %8d\n", nmxh);
  printf(" Heat source radius:          %8.4f\n", alpha);
  printf(" Number of available threads: %8d\n", omp_get_max_threads());
  printf("\n");

  top_constants();

  for (i = 1; i <= t_last; i++) {
    timer_clear(i);
  }
  if (timeron) timer_start(t_init);

  // set up initial mesh (single element) and solution (all zero)
  create_initial_grid();

  r_init_omp((double *)ta1, ntot, 0.0);
  nr_init_omp((int *)sje, 4*6*nelt, -1);

  init_locks();

  // compute tables of coefficients and weights      
  coef();
  geom1();

  // compute the discrete laplacian operators
  setdef();

  // prepare for the preconditioner
  setpcmo_pre();

  // refine initial mesh and do some preliminary work
  time = 0.0;
  mortar();
  prepwork();
  adaptation(&ifmortar, 0);
  if (timeron) timer_stop(t_init);

  timer_clear(1);

  time = 0.0;
  for (step = 0; step <= niter; step++) {
    if (step == 1) {
      // reset the solution and start the timer, keep track of total no elms
      r_init((double *)ta1, ntot, 0.0);

      time = 0.0;
      nelt_tot = 0.0;
      for (i = 1; i <= t_last; i++) {
        if (i != t_init) timer_clear(i);
      }
      timer_start(1);
    }

    // advance the convection step 
    convect(ifmortar);

    if (timeron) timer_start(t_transf2);
    // prepare the intital guess for cg
    transf(tmort, (double *)ta1);

    // compute residual for diffusion term based on intital guess

    // compute the left hand side of equation, lapacian t
    #pragma omp parallel default(shared) private(ie,k,j,i) 
    {
    #pragma omp for
    for (ie = 0; ie < nelt; ie++) {
      laplacian(ta2[ie], ta1[ie], size_e[ie]);
    }

    // compute the residual 
    #pragma omp for
    for (ie = 0; ie < nelt; ie++) {
      for (k = 0; k < LX1; k++) {
        for (j = 0; j < LX1; j++) {
          for (i = 0; i < LX1; i++) {
            trhs[ie][k][j][i] = trhs[ie][k][j][i] - ta2[ie][k][j][i];
          }
        }
      }
    }
    } //end parallel

    // get the residual on mortar 
    transfb(rmor, (double *)trhs);
    if (timeron) timer_stop(t_transf2);

    // apply boundary condition: zero out the residual on domain boundaries

    // apply boundary conidtion to trhs
    #pragma omp parallel for default(shared) private(ie,iside)
    for (ie = 0; ie < nelt; ie++) {
      for (iside = 0; iside < NSIDES; iside++) {
        if (cbc[ie][iside] == 0) {
          facev(trhs[ie], iside, 0.0);
        }
      }
    }
    // apply boundary condition to rmor
    col2(rmor, tmmor, nmor);

    // call the conjugate gradient iterative solver
    diffusion(ifmortar);

    // add convection and diffusion
    if (timeron) timer_start(t_add2);
    add2((double *)ta1, (double *)t, ntot);
    if (timeron) timer_stop(t_add2);

    // perform mesh adaptation
    time = time + dtime;
    if ((step != 0) && (step/fre*fre == step)) {
      if (step != niter) {
        adaptation(&ifmortar, step);
      }
    } else {
      ifmortar = false;
    }
    nelt_tot = nelt_tot + (double)(nelt);
  }

  timer_stop(1);
  tmax = timer_read(1);

  verify(&Class, &verified);

  // compute millions of collocation points advanced per second.
  // diffusion: nmxh advancements, convection: 1 advancement
  mflops = nelt_tot*(double)(LX1*LX1*LX1*(nmxh+1))/(tmax*1.e6);

  print_results("UA", Class, REFINE_MAX, 0, 0, niter, 
                tmax, mflops, "    coll. point advanced", 
                verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, 
                CS6, "(none)");

  //---------------------------------------------------------------------
  // More timers
  //---------------------------------------------------------------------
  if (timeron) {
    for (i = 1; i <= t_last; i++) {
      trecs[i] = timer_read(i);
    }
    if (tmax == 0.0) tmax = 1.0;

    printf("  SECTION     Time (secs)\n");
    for (i = 1; i <= t_last; i++) {
      printf("  %-10s:%9.3f  (%6.2f%%)\n",
          t_names[i], trecs[i], trecs[i]*100./tmax);
      if (i == t_transfb_c) {
        t2 = trecs[t_convect] - trecs[t_transfb_c];
        printf("    --> %11s:%9.3f  (%6.2f%%)\n", 
            "sub-convect", t2, t2*100./tmax);
      } else if (i == t_transfb) {
        t2 = trecs[t_diffusion] - trecs[t_transf] - trecs[t_transfb];
        printf("    --> %11s:%9.3f  (%6.2f%%)\n", 
            "sub-diffuse", t2, t2*100./tmax);
      }
    }
  }

	//--------------------------------------------------------------------
	// Teardown NUMA control
	//--------------------------------------------------------------------
	numa_shutdown();

  return 0;
}
Esempio n. 25
0
//---------------------------------------------------------------------
// Set up the OpenCL environment.
//---------------------------------------------------------------------
static void setup_opencl(int argc, char **argv)
{
  int i, c;
//  size_t temp;
  cl_int ecode = 0;
  char *source_dir = ".";  //FIXME
  int num_subs = DEFAULT_NUM_SUBS;
  int num_cus;
  int sqrt_num_command_queues;

  if (argc > 1) source_dir = argv[1];

  devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_subs);

  if (timeron) {
    timer_clear(TIMER_OPENCL);
    timer_clear(TIMER_BUILD);
    timer_clear(TIMER_BUFFER);
    timer_clear(TIMER_RELEASE);

    timer_start(TIMER_OPENCL);
  }

  // 1. Find the default device type and get a device for the device type
  //    Then, create sub-devices from the parent device.
  //device_type = CL_DEVICE_TYPE_CPU;
  device_type = CL_DEVICE_TYPE_ALL;
  //device_type = CL_DEVICE_TYPE_GPU;
  if(argc <= 2) {
    printf("Device type argument missing!\n");
	exit(-1);
  }
  char *device_type_str = argv[2];
  if(strcmp(device_type_str, "CPU") == 0 || strcmp(device_type_str, "cpu") == 0) {
  	device_type = CL_DEVICE_TYPE_CPU;
  } else if(strcmp(device_type_str, "GPU") == 0 || strcmp(device_type_str, "gpu") == 0) {
  	device_type = CL_DEVICE_TYPE_GPU;
  } else if(strcmp(device_type_str, "ALL") == 0 || strcmp(device_type_str, "all") == 0) {
  	device_type = CL_DEVICE_TYPE_ALL;
  } else {
    printf("Unsupported device type!\n");
	exit(-1);
  }
  cl_uint num_command_queues = 4;
  char *num_command_queues_str = getenv("SNU_NPB_COMMAND_QUEUES");
  if(num_command_queues_str != NULL)
  	num_command_queues = atoi(num_command_queues_str);

  cl_platform_id platform;
  ecode = clGetPlatformIDs(1, &platform, NULL);
  clu_CheckError(ecode, "clGetPlatformIDs()");

  ecode = clGetDeviceIDs(platform, device_type, 0, NULL, &num_devices);
  clu_CheckError(ecode, "clGetDeviceIDs()");

  //num_devices = 2;
  ecode = clGetDeviceIDs(platform, device_type, num_devices, devices, NULL);
  clu_CheckError(ecode, "clGetDeviceIDs()");
  cl_device_id tmp_dev;

  work_item_sizes[0] = work_item_sizes[1] = work_item_sizes[2] = 1024;
  max_work_group_size = 1024;
  max_compute_units = 22;

  sqrt_num_command_queues = (int)(sqrt((double)(num_command_queues) + 0.00001));
  if (num_command_queues != sqrt_num_command_queues * sqrt_num_command_queues) {
    fprintf(stderr, "Number of devices is not a square of some integer\n");
    exit(EXIT_FAILURE);
  }

  ncells = (int)(sqrt((double)(num_command_queues) + 0.00001));
  MAX_CELL_DIM = ((PROBLEM_SIZE/ncells)+1);
  IMAX = MAX_CELL_DIM;
  JMAX = MAX_CELL_DIM;
  KMAX = MAX_CELL_DIM;
  IMAXP = (IMAX/2*2+1);
  JMAXP = (JMAX/2*2+1);
  //---------------------------------------------------------------------
  // +1 at end to avoid zero length arrays for 1 node
  //---------------------------------------------------------------------
  BUF_SIZE = (MAX_CELL_DIM*MAX_CELL_DIM*(MAXCELLS-1)*60*2+1);


  // FIXME
  if (max_work_group_size > 64) {
    max_work_group_size = 64;
    int i;
    for (i = 0; i < 3; i++) {
      if (work_item_sizes[i] > 64) {
        work_item_sizes[i] = 64;
      }
    }
  }

  // 2. Create a context for devices
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
	cl_context_properties props[5] = {
		CL_CONTEXT_PLATFORM,
		(cl_context_properties)platform,
		CL_CONTEXT_SCHEDULER,
		CL_CONTEXT_SCHEDULER_CODE_SEGMENTED_PERF_MODEL,
		//CL_CONTEXT_SCHEDULER_PERF_MODEL,
		//CL_CONTEXT_SCHEDULER_FIRST_EPOCH_BASED_PERF_MODEL,
		//CL_CONTEXT_SCHEDULER_ALL_EPOCH_BASED_PERF_MODEL,
		0 };
  context = clCreateContext(props, 
#elif defined(SOCL_OPTIMIZATIONS)
	cl_context_properties props[5] = {
		CL_CONTEXT_PLATFORM,
		(cl_context_properties)platform,
		CL_CONTEXT_SCHEDULER_SOCL,
		"dmda",
		//"random",
		0 };
  context = clCreateContext(props, 
#else
  context = clCreateContext(NULL, 
#endif
                            num_devices,
                            devices,
                            NULL, NULL, &ecode);
  clu_CheckError(ecode, "clCreateContext()");

  // 3. Create a command queue
  cmd_queue = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_command_queues*3);
  for (i = 0; i < num_command_queues * 2; i++) {
    //cmd_queue[i] = clCreateCommandQueue(context, devices[(i / 2) % num_devices], 
#ifdef SOCL_OPTIMIZATIONS
    cmd_queue[i] = clCreateCommandQueue(context, NULL, 
#else    
	cmd_queue[i] = clCreateCommandQueue(context, devices[num_devices - 1 - ((i / 2) % num_devices)],
#endif
   // cmd_queue[i] = clCreateCommandQueue(context, devices[0], 
#ifdef MINIMD_SNUCL_OPTIMIZATIONS
	0,
	//		CL_QUEUE_AUTO_DEVICE_SELECTION | 
	//		CL_QUEUE_ITERATIVE, 
			//CL_QUEUE_COMPUTE_INTENSIVE,
#else
	0,
#endif
	&ecode);
    clu_CheckError(ecode, "clCreateCommandQueue()");
  }

  // 4. Build the program
  if (timeron) timer_start(TIMER_BUILD);
  char *source_file = "sp_kernel.cl";
  //p_program = clu_MakeProgram(context, devices, source_dir, source_file, build_option);
  p_program = clu_CreateProgram(context, source_dir, source_file);
  for(i = 0; i < num_devices; i++) {
	  char build_option[200] = {0};
	  cl_device_type cur_device_type;
	  cl_int err = clGetDeviceInfo(devices[i],
			  CL_DEVICE_TYPE,
			  sizeof(cl_device_type),
			  &cur_device_type,
			  NULL);
	  clu_CheckError(err, "clGetDeviceInfo()");
  if (cur_device_type == CL_DEVICE_TYPE_CPU) {
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU -DMAX_CELL_DIM=%d -DIMAX=%d -DJMAX=%d -DKMAX=%d -DIMAXP=%d -DJMAXP=%d", CLASS, MAX_CELL_DIM, IMAX, JMAX, KMAX, IMAXP, JMAXP);
  } else {
    sprintf(build_option, "-I. -DCLASS=%d -DUSE_GPU -DMAX_CELL_DIM=%d -DIMAX=%d -DJMAX=%d -DKMAX=%d -DIMAXP=%d -DJMAXP=%d", CLASS, MAX_CELL_DIM, IMAX, JMAX, KMAX, IMAXP, JMAXP);
  }

  clu_MakeProgram(p_program, 1, &devices[i], source_dir, build_option);
  //clu_MakeProgram(p_program, num_devices, devices, source_dir, build_option);
  }
  num_devices = num_command_queues;
  program = (cl_program *)malloc(sizeof(cl_program) * num_devices);
  for (i = 0; i < num_devices; i++) {
    program[i] = p_program;
  }
  if (timeron) timer_stop(TIMER_BUILD);

  // 5. Create kernels
  size_t asize = sizeof(cl_kernel) * num_devices;
  k_initialize1 = (cl_kernel *)malloc(asize);
  k_initialize2 = (cl_kernel *)malloc(asize);
  k_initialize3 = (cl_kernel *)malloc(asize);
  k_initialize4 = (cl_kernel *)malloc(asize);
  k_initialize5 = (cl_kernel *)malloc(asize);
  k_initialize6 = (cl_kernel *)malloc(asize);
  k_initialize7 = (cl_kernel *)malloc(asize);
  k_initialize8 = (cl_kernel *)malloc(asize);
  k_lhsinit = (cl_kernel *)malloc(asize);
  k_exact_rhs1 = (cl_kernel *)malloc(asize);
  k_exact_rhs2 = (cl_kernel *)malloc(asize);
  k_exact_rhs3 = (cl_kernel *)malloc(asize);
  k_exact_rhs4 = (cl_kernel *)malloc(asize);
  k_exact_rhs5 = (cl_kernel *)malloc(asize);
  k_copy_faces1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_copy_faces6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_compute_rhs6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_txinvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_lhsx = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_ninvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_x_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_lhsy = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_pinvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_y_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_lhsz = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_tzetar = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_z_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_add = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS);
  k_error_norm = (cl_kernel *)malloc(asize);
  k_rhs_norm = (cl_kernel *)malloc(asize);

  for (i = 0; i < num_devices; i++) {
    k_initialize1[i] = clCreateKernel(program[i], "initialize1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize1");

    k_initialize2[i] = clCreateKernel(program[i], "initialize2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize2");

    k_initialize3[i] = clCreateKernel(program[i], "initialize3", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize3");

    k_initialize4[i] = clCreateKernel(program[i], "initialize4", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize4");

    k_initialize5[i] = clCreateKernel(program[i], "initialize5", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize5");

    k_initialize6[i] = clCreateKernel(program[i], "initialize6", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize6");

    k_initialize7[i] = clCreateKernel(program[i], "initialize7", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize7");

    k_initialize8[i] = clCreateKernel(program[i], "initialize8", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for initialize8");

    k_lhsinit[i] = clCreateKernel(program[i], "lhsinit", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for lhsinit");

    k_exact_rhs1[i] = clCreateKernel(program[i], "exact_rhs1", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs1");

    k_exact_rhs2[i] = clCreateKernel(program[i], "exact_rhs2", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs2");

    k_exact_rhs3[i] = clCreateKernel(program[i], "exact_rhs3", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs3");

    k_exact_rhs4[i] = clCreateKernel(program[i], "exact_rhs4", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs4");

    k_exact_rhs5[i] = clCreateKernel(program[i], "exact_rhs5", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for exact_rhs5");

    for (c = 0; c < MAXCELLS; c++) {
      k_copy_faces1[i][c] = clCreateKernel(program[i], "copy_faces1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces1");

      k_copy_faces2[i][c] = clCreateKernel(program[i], "copy_faces2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces2");

      k_copy_faces3[i][c] = clCreateKernel(program[i], "copy_faces3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces3");

      k_copy_faces4[i][c] = clCreateKernel(program[i], "copy_faces4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces4");

      k_copy_faces5[i][c] = clCreateKernel(program[i], "copy_faces5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces5");

      k_copy_faces6[i][c] = clCreateKernel(program[i], "copy_faces6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for copy_faces6");

      k_compute_rhs1[i][c] = clCreateKernel(program[i], "compute_rhs1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs1");

      k_compute_rhs2[i][c] = clCreateKernel(program[i], "compute_rhs2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs2");

      k_compute_rhs3[i][c] = clCreateKernel(program[i], "compute_rhs3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs3");

      k_compute_rhs4[i][c] = clCreateKernel(program[i], "compute_rhs4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs4");

      k_compute_rhs5[i][c] = clCreateKernel(program[i], "compute_rhs5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs5");

      k_compute_rhs6[i][c] = clCreateKernel(program[i], "compute_rhs6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for compute_rhs6");

      k_txinvr[i][c] = clCreateKernel(program[i], "txinvr", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for txinvr");

      k_lhsx[i][c] = clCreateKernel(program[i], "lhsx", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for lhsx");

      k_ninvr[i][c] = clCreateKernel(program[i], "ninvr", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for ninvr");

      k_x_solve1[i][c] = clCreateKernel(program[i], "x_solve1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve1");

      k_x_solve2[i][c] = clCreateKernel(program[i], "x_solve2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve2");

      k_x_solve3[i][c] = clCreateKernel(program[i], "x_solve3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve3");

      k_x_solve4[i][c] = clCreateKernel(program[i], "x_solve4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve4");

      k_x_solve5[i][c] = clCreateKernel(program[i], "x_solve5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve5");

      k_x_solve6[i][c] = clCreateKernel(program[i], "x_solve6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for x_solve6");

      k_lhsy[i][c] = clCreateKernel(program[i], "lhsy", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for lhsy");

      k_pinvr[i][c] = clCreateKernel(program[i], "pinvr", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for pinvr");

      k_y_solve1[i][c] = clCreateKernel(program[i], "y_solve1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve1");

      k_y_solve2[i][c] = clCreateKernel(program[i], "y_solve2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve2");

      k_y_solve3[i][c] = clCreateKernel(program[i], "y_solve3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve3");

      k_y_solve4[i][c] = clCreateKernel(program[i], "y_solve4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve4");

      k_y_solve5[i][c] = clCreateKernel(program[i], "y_solve5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve5");

      k_y_solve6[i][c] = clCreateKernel(program[i], "y_solve6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for y_solve6");

      k_lhsz[i][c] = clCreateKernel(program[i], "lhsz", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for lhsz");

      k_tzetar[i][c] = clCreateKernel(program[i], "tzetar", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for tzetar");

      k_z_solve1[i][c] = clCreateKernel(program[i], "z_solve1", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve1");

      k_z_solve2[i][c] = clCreateKernel(program[i], "z_solve2", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve2");

      k_z_solve3[i][c] = clCreateKernel(program[i], "z_solve3", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve3");

      k_z_solve4[i][c] = clCreateKernel(program[i], "z_solve4", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve4");

      k_z_solve5[i][c] = clCreateKernel(program[i], "z_solve5", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve5");

      k_z_solve6[i][c] = clCreateKernel(program[i], "z_solve6", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for z_solve6");

      k_add[i][c] = clCreateKernel(program[i], "add", &ecode);
      clu_CheckError(ecode, "clCreateKernel() for add");
    }

    k_error_norm[i] = clCreateKernel(program[i], "error_norm", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for error_norm");

    k_rhs_norm[i] = clCreateKernel(program[i], "rhs_norm", &ecode);
    clu_CheckError(ecode, "clCreateKernel() for rhs_norm");
  }

  // 6. Create buffers
  if (timeron) timer_start(TIMER_BUFFER);

  asize = sizeof(cl_mem) * num_devices;

  m_u = (cl_mem *)malloc(asize);
  m_us = (cl_mem *)malloc(asize);
  m_vs = (cl_mem *)malloc(asize);
  m_ws = (cl_mem *)malloc(asize);
  m_qs = (cl_mem *)malloc(asize);
  m_ainv = (cl_mem *)malloc(asize);
  m_rho_i = (cl_mem *)malloc(asize);
  m_speed = (cl_mem *)malloc(asize);
  m_square = (cl_mem *)malloc(asize);
  m_rhs = (cl_mem *)malloc(asize);
  m_forcing = (cl_mem *)malloc(asize);
  m_lhs = (cl_mem *)malloc(asize);
  m_in_buffer = (cl_mem *)malloc(asize);
  m_out_buffer = (cl_mem *)malloc(asize);

  m_ce = (cl_mem *)malloc(asize);

  for (i = 0; i < num_devices; i++) {
    m_u[i] = clCreateBuffer(context,
                            CL_MEM_READ_WRITE,
                            sizeof(double)*MAXCELLS*(KMAX+4)*(JMAXP+4)*(IMAXP+4)*5,
                            NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_u");

    m_us[i] = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_us");

    m_vs[i] = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_vs");

    m_ws[i] = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ws");

    m_qs[i] = clCreateBuffer(context,
                             CL_MEM_READ_WRITE,
                             sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_qs");

    m_ainv[i] = clCreateBuffer(context,
                               CL_MEM_READ_WRITE,
                               sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                               NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ainv");

    m_rho_i[i] = clCreateBuffer(context,
                                CL_MEM_READ_WRITE,
                                sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                                NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_rho_i");

    m_speed[i] = clCreateBuffer(context,
                                CL_MEM_READ_WRITE,
                                sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                                NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_speed");

    m_square[i] = clCreateBuffer(context,
                                 CL_MEM_READ_WRITE,
                                 sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2),
                                 NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_square");

    m_rhs[i] = clCreateBuffer(context,
                              CL_MEM_READ_WRITE,
                              sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*5,
                              NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_rhs");

    m_forcing[i] = clCreateBuffer(context,
                                  CL_MEM_READ_WRITE,
                                  sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*5,
                                  NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_forcing");

    m_lhs[i] = clCreateBuffer(context,
                              CL_MEM_READ_WRITE,
                              sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*15,
                              NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_lhs");

    m_in_buffer[i] = clCreateBuffer(context,
                                    CL_MEM_READ_WRITE,
                                    sizeof(double)*BUF_SIZE,
                                    NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_in_buffer");

    m_out_buffer[i] = clCreateBuffer(context,
                                     CL_MEM_READ_WRITE,
                                     sizeof(double)*BUF_SIZE,
                                     NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_out_buffer");

    m_ce[i] = clCreateBuffer(context,
                             CL_MEM_READ_ONLY,
                             sizeof(double)*5*13,
                             NULL, &ecode);
    clu_CheckError(ecode, "clCreateBuffer() for m_ce");
  }

  if (timeron) timer_stop(TIMER_BUFFER);

  if (timeron) timer_stop(TIMER_OPENCL);
}
Esempio n. 26
0
File: cg.c Progetto: 8l/rose
int main(int argc, char **argv) {
    int	i, j, k, it;
    int nthreads = 1;
    double zeta;
    double rnorm;
    double norm_temp11;
    double norm_temp12;
    double t, mflops;
    char cclass;
    boolean verified;
    double zeta_verify_value, epsilon;

    firstrow = 1;
    lastrow  = NA;
    firstcol = 1;
    lastcol  = NA;

    if (NA == 1400 && NONZER == 7 && NITER == 15 && SHIFT == 10.0) {
	cclass = 'S';
	zeta_verify_value = 8.5971775078648;
    } else if (NA == 7000 && NONZER == 8 && NITER == 15 && SHIFT == 12.0) {
	cclass = 'W';
	zeta_verify_value = 10.362595087124;
    } else if (NA == 14000 && NONZER == 11 && NITER == 15 && SHIFT == 20.0) {
	cclass = 'A';
	zeta_verify_value = 17.130235054029;
    } else if (NA == 75000 && NONZER == 13 && NITER == 75 && SHIFT == 60.0) {
	cclass = 'B';
	zeta_verify_value = 22.712745482631;
    } else if (NA == 150000 && NONZER == 15 && NITER == 75 && SHIFT == 110.0) {
	cclass = 'C';
	zeta_verify_value = 28.973605592845;
    } else {
	cclass = 'U';
    }

    printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version"
           " - CG Benchmark\n");
    printf(" Size: %10d\n", NA);
    printf(" Iterations: %5d\n", NITER);

    naa = NA;
    nzz = NZ;

/*--------------------------------------------------------------------
c  Initialize random number generator
c-------------------------------------------------------------------*/
    tran    = 314159265.0;
    amult   = 1220703125.0;
    zeta    = randlc( &tran, amult );

/*--------------------------------------------------------------------
c  
c-------------------------------------------------------------------*/
    makea(naa, nzz, a, colidx, rowstr, NONZER,
          firstrow, lastrow, firstcol, lastcol, 
	  RCOND, arow, acol, aelt, v, iv, SHIFT);
    
/*---------------------------------------------------------------------
c  Note: as a result of the above call to makea:
c        values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1
c        values of colidx which are col indexes go from firstcol --> lastcol
c        So:
c        Shift the col index vals from actual (firstcol --> lastcol ) 
c        to local, i.e., (1 --> lastcol-firstcol+1)
c---------------------------------------------------------------------*/
#pragma omp parallel private(it,i,j,k)
{	
#pragma omp for nowait
    for (j = 1; j <= lastrow - firstrow + 1; j++) {
	for (k = rowstr[j]; k < rowstr[j+1]; k++) {
            colidx[k] = colidx[k] - firstcol + 1;
	}
    }

/*--------------------------------------------------------------------
c  set starting vector to (1, 1, .... 1)
c-------------------------------------------------------------------*/
#pragma omp for nowait
    for (i = 1; i <= NA+1; i++) {
	x[i] = 1.0;
    }
#pragma omp single
    zeta  = 0.0;

/*-------------------------------------------------------------------
c---->
c  Do one iteration untimed to init all code and data page tables
c---->                    (then reinit, start timing, to niter its)
c-------------------------------------------------------------------*/

    for (it = 1; it <= 1; it++) {

/*--------------------------------------------------------------------
c  The call to the conjugate gradient routine:
c-------------------------------------------------------------------*/
	conj_grad (colidx, rowstr, x, z, a, p, q, r, w, &rnorm);

/*--------------------------------------------------------------------
c  zeta = shift + 1/(x.z)
c  So, first: (x.z)
c  Also, find norm of z
c  So, first: (z.z)
c-------------------------------------------------------------------*/
#pragma omp single
{	
	norm_temp11 = 0.0;
	norm_temp12 = 0.0;
} /* end single */

#pragma omp for reduction(+:norm_temp11,norm_temp12)
	for (j = 1; j <= lastcol-firstcol+1; j++) {
            norm_temp11 = norm_temp11 + x[j]*z[j];
            norm_temp12 = norm_temp12 + z[j]*z[j];
	}
#pragma omp single
	norm_temp12 = 1.0 / sqrt( norm_temp12 );

/*--------------------------------------------------------------------
c  Normalize z to obtain x
c-------------------------------------------------------------------*/
#pragma omp for
	for (j = 1; j <= lastcol-firstcol+1; j++) {
            x[j] = norm_temp12*z[j];
	}
	
    } /* end of do one iteration untimed */

/*--------------------------------------------------------------------
c  set starting vector to (1, 1, .... 1)
c-------------------------------------------------------------------*/
#pragma omp for nowait
    for (i = 1; i <= NA+1; i++) {
         x[i] = 1.0;
    }
#pragma omp single    
    zeta  = 0.0;

} /* end parallel */

    timer_clear( 1 );
    timer_start( 1 );

/*--------------------------------------------------------------------
c---->
c  Main Iteration for inverse power method
c---->
c-------------------------------------------------------------------*/

#pragma omp parallel private(it,i,j,k)
{
    for (it = 1; it <= NITER; it++) {

/*--------------------------------------------------------------------
c  The call to the conjugate gradient routine:
c-------------------------------------------------------------------*/
	conj_grad(colidx, rowstr, x, z, a, p, q, r, w, &rnorm);

/*--------------------------------------------------------------------
c  zeta = shift + 1/(x.z)
c  So, first: (x.z)
c  Also, find norm of z
c  So, first: (z.z)
c-------------------------------------------------------------------*/
#pragma omp single
{	
	norm_temp11 = 0.0;
	norm_temp12 = 0.0;
} /* end single */

#pragma omp for reduction(+:norm_temp11,norm_temp12)
	for (j = 1; j <= lastcol-firstcol+1; j++) {
            norm_temp11 = norm_temp11 + x[j]*z[j];
            norm_temp12 = norm_temp12 + z[j]*z[j];
	}

#pragma omp single
{	
	norm_temp12 = 1.0 / sqrt( norm_temp12 );

	zeta = SHIFT + 1.0 / norm_temp11;
} /* end single */

#pragma omp master
{
	if( it == 1 ) {
            printf("   iteration           ||r||                 zeta\n");
	}
	printf("    %5d       %20.14e%20.13e\n", it, rnorm, zeta);
} /* end master */

/*--------------------------------------------------------------------
c  Normalize z to obtain x
c-------------------------------------------------------------------*/
#pragma omp for 
	for (j = 1; j <= lastcol-firstcol+1; j++) {
            x[j] = norm_temp12*z[j];
	}
    } /* end of main iter inv pow meth */

#if defined(_OPENMP)
#pragma omp master
    nthreads = omp_get_num_threads();
#endif /* _OPENMP */
} /* end parallel */

    timer_stop( 1 );

/*--------------------------------------------------------------------
c  End of timed section
c-------------------------------------------------------------------*/

    t = timer_read( 1 );

    printf(" Benchmark completed\n");

    epsilon = 1.0e-10;
    if (cclass != 'U') {
	if (fabs(zeta - zeta_verify_value) <= epsilon) {
            verified = TRUE;
	    printf(" VERIFICATION SUCCESSFUL\n");
	    printf(" Zeta is    %20.12e\n", zeta);
	    printf(" Error is   %20.12e\n", zeta - zeta_verify_value);
	} else {
            verified = FALSE;
	    printf(" VERIFICATION FAILED\n");
	    printf(" Zeta                %20.12e\n", zeta);
	    printf(" The correct zeta is %20.12e\n", zeta_verify_value);
	}
    } else {
	verified = FALSE;
	printf(" Problem size unknown\n");
	printf(" NO VERIFICATION PERFORMED\n");
    }

    if ( t != 0.0 ) {
	mflops = (2.0*NITER*NA)
	    * (3.0+(NONZER*(NONZER+1)) + 25.0*(5.0+(NONZER*(NONZER+1))) + 3.0 )
	    / t / 1000000.0;
    } else {
	mflops = 0.0;
    }

    c_print_results("CG", cclass, NA, 0, 0, NITER, nthreads, t, 
		    mflops, "          floating point", 
		    verified, NPBVERSION, COMPILETIME,
		    CS1, CS2, CS3, CS4, CS5, CS6, CS7);
}