static int encode_ogg (cdrom_drive *drive, rip_opts_s *rip_opts, text_tag_s *text_tag, int track, int tracktot, char *filename, char **filenames) { ogg_stream_state os; ogg_page og; ogg_packet op; vorbis_dsp_state vd; vorbis_block vb; vorbis_info vi; long samplesdone = 0; int sector = 0, last_sector = 0; long bytes_written = 0, packetsdone = 0; double time_elapsed = 0.0; int ret = 0; time_t *timer; double time; int serialno = rand (); vorbis_comment vc; long total_samples_per_channel = 0; int channels = 2; int eos = 0; long rate = 44100; FILE *out = fopen (filename, "w+"); timer = timer_start (); if (!rip_opts->managed && (rip_opts->min_bitrate > 0 || rip_opts->max_bitrate > 0)) { log_msg ("Min or max bitrate requires managed", FL, FN, LN); return -1; } if (rip_opts->bitrate < 0 && rip_opts->min_bitrate < 0 && rip_opts->max_bitrate < 0) { rip_opts->quality_set = 1; } start_func (filename, rip_opts->bitrate, rip_opts->quality, rip_opts->quality_set, rip_opts->managed, rip_opts->min_bitrate, rip_opts->max_bitrate); vorbis_info_init (&vi); if (rip_opts->quality_set > 0) { if (vorbis_encode_setup_vbr (&vi, channels, rate, rip_opts->quality)) { log_msg ("Couldn't initialize vorbis_info", FL, FN, LN); vorbis_info_clear (&vi); return -1; } /* two options here, max or min bitrate */ if (rip_opts->max_bitrate > 0 || rip_opts->min_bitrate > 0) { struct ovectl_ratemanage_arg ai; vorbis_encode_ctl (&vi, OV_ECTL_RATEMANAGE_GET, &ai); ai.bitrate_hard_min = rip_opts->min_bitrate; ai.bitrate_hard_max = rip_opts->max_bitrate; ai.management_active = 1; vorbis_encode_ctl (&vi, OV_ECTL_RATEMANAGE_SET, &ai); } } else { if (vorbis_encode_setup_managed (&vi, channels, rate, rip_opts->max_bitrate > 0 ? rip_opts->max_bitrate * 1000 : -1, rip_opts->bitrate * 1000, rip_opts->min_bitrate > 0 ? rip_opts->min_bitrate * 1000 : -1)) { log_msg ("Mode init failed, encode setup managed", FL, FN, LN); vorbis_info_clear (&vi); return -1; } } if (rip_opts->managed && rip_opts->bitrate < 0) { vorbis_encode_ctl (&vi, OV_ECTL_RATEMANAGE_AVG, NULL); } else if (!rip_opts->managed) { vorbis_encode_ctl (&vi, OV_ECTL_RATEMANAGE_SET, NULL); } /* set advanced encoder options */ vorbis_encode_setup_init (&vi); vorbis_analysis_init (&vd, &vi); vorbis_block_init (&vd, &vb); ogg_stream_init (&os, serialno); { ogg_packet header_main; ogg_packet header_comments; ogg_packet header_codebooks; int result; char buf[32]; vorbis_comment_init (&vc); vorbis_comment_add_tag (&vc, "title", text_tag->songname); vorbis_comment_add_tag (&vc, "artist", text_tag->artistname); vorbis_comment_add_tag (&vc, "album", text_tag->albumname); vorbis_comment_add_tag (&vc, "genre", text_tag->genre); snprintf (buf, 32, "%d", text_tag->year); vorbis_comment_add_tag (&vc, "date", buf); snprintf (buf, 32, "%02d", text_tag->track); vorbis_comment_add_tag (&vc, "tracknumber", buf); vorbis_analysis_headerout (&vd, &vc, &header_main, &header_comments, &header_codebooks); ogg_stream_packetin (&os, &header_main); ogg_stream_packetin (&os, &header_comments); ogg_stream_packetin (&os, &header_codebooks); while ((result = ogg_stream_flush (&os, &og))) { if (result == 0) break; ret = write_page (&og, out); if (ret != og.header_len + og.body_len) { log_msg ("Failed writing data to output stream", FL, FN, LN); ret = -1; } } sector = cdda_track_firstsector (drive, track); last_sector = cdda_track_lastsector (drive, track); total_samples_per_channel = (last_sector - sector) * (CD_FRAMESAMPLES / 2); int eos = 0; while (!eos) { signed char *buffer = (signed char *)malloc (CD_FRAMESIZE_RAW * READ_SECTORS); //use this variable as a s**t long sectors_read = last_sector - sector; if (sectors_read > READ_SECTORS) sectors_read = READ_SECTORS; sectors_read = cdda_read (drive, (signed char *)buffer, sector, sectors_read); int i; if (sectors_read == 0) { vorbis_analysis_wrote (&vd, 0); } else { float **vorbbuf = vorbis_analysis_buffer (&vd, CD_FRAMESIZE_RAW * sectors_read); for (i = 0; i < (CD_FRAMESIZE_RAW * sectors_read) / 4; i++) { vorbbuf[0][i] = ((buffer[i * 4 + 1] << 8) | (0x00ff&(int)buffer[i * 4])) / 32768.f; vorbbuf[1][i] = ((buffer[i * 4 + 3] << 8) | (0x00ff&(int)buffer[i * 4 + 2])) / 32768.f; } int samples_read = sectors_read * (CD_FRAMESAMPLES / 2); samplesdone += samples_read; // progress every 60 pages if (packetsdone >= 60) { packetsdone = 0; time = timer_time (timer); update_statistics (total_samples_per_channel, samplesdone, time, track, tracktot, 0, filenames); } vorbis_analysis_wrote (&vd, i); } free (buffer); sector += sectors_read; while (vorbis_analysis_blockout (&vd, &vb) == 1) { vorbis_analysis (&vb, &op); vorbis_bitrate_addblock (&vb); while (vorbis_bitrate_flushpacket (&vd, &op)) { ogg_stream_packetin (&os, &op); packetsdone++; while (!eos) { int result = ogg_stream_pageout (&os, &og); if (result == 0) { break; } ret = write_page (&og, out); if (ret != og.header_len + og.body_len) { log_msg ("Failed writing data to output stream", FL, FN, LN); ret = -1; } else bytes_written += ret; if (ogg_page_eos (&og)) { eos = 1; } } } } } } ret = 0; update_statistics (total_samples_per_channel, samplesdone, time, track, tracktot, 0, filenames); ogg_stream_clear (&os); vorbis_block_clear (&vb); vorbis_dsp_clear (&vd); vorbis_comment_clear (&vc); vorbis_info_clear (&vi); vorbis_comment_clear (&vc); time_elapsed = timer_time (timer); end_func (time_elapsed, rate, samplesdone, bytes_written); timer_clear (timer); fclose (out); return ret; }
void hwtimer_arch_unset(short timer) { timer_clear(hw_timers[timer/2], (timer%2)); }
static int realmain(void *carg) { unsigned arg = (uintptr_t)carg; /*c------------------------------------------------------------------- c-------------------------------------------------------------------*/ int i, ierr; /*------------------------------------------------------------------ c u0, u1, u2 are the main arrays in the problem. c Depending on the decomposition, these arrays will have different c dimensions. To accomodate all possibilities, we allocate them as c one-dimensional arrays and pass them to subroutines for different c views c - u0 contains the initial (transformed) initial condition c - u1 and u2 are working arrays c - indexmap maps i,j,k of u0 to the correct i^2+j^2+k^2 for the c time evolution operator. c-----------------------------------------------------------------*/ /*-------------------------------------------------------------------- c Large arrays are in common so that they are allocated on the c heap rather than the stack. This common block is not c referenced directly anywhere else. Padding is to avoid accidental c cache problems, since all array sizes are powers of two. c-------------------------------------------------------------------*/ static dcomplex u0[NZ][NY][NX]; static dcomplex pad1[3]; static dcomplex u1[NZ][NY][NX]; static dcomplex pad2[3]; static dcomplex u2[NZ][NY][NX]; static dcomplex pad3[3]; static int indexmap[NZ][NY][NX]; int iter; int nthreads = 1; double total_time, mflops; boolean verified; char class; omp_set_num_threads(arg); /*-------------------------------------------------------------------- c Run the entire problem once to make sure all data is touched. c This reduces variable startup costs, which is important for such a c short benchmark. The other NPB 2 implementations are similar. c-------------------------------------------------------------------*/ for (i = 0; i < T_MAX; i++) { timer_clear(i); } setup(); #pragma omp parallel { compute_indexmap(indexmap, dims[2]); #pragma omp single { compute_initial_conditions(u1, dims[0]); fft_init (dims[0][0]); } fft(1, u1, u0); } /* end parallel */ /*-------------------------------------------------------------------- c Start over from the beginning. Note that all operations must c be timed, in contrast to other benchmarks. c-------------------------------------------------------------------*/ for (i = 0; i < T_MAX; i++) { timer_clear(i); } timer_start(T_TOTAL); if (TIMERS_ENABLED == TRUE) timer_start(T_SETUP); #pragma omp parallel private(iter) firstprivate(niter) { compute_indexmap(indexmap, dims[2]); #pragma omp single { compute_initial_conditions(u1, dims[0]); fft_init (dims[0][0]); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_SETUP); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_FFT); } fft(1, u1, u0); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_FFT); } for (iter = 1; iter <= niter; iter++) { if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_EVOLVE); } evolve(u0, u1, iter, indexmap, dims[0]); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_EVOLVE); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_FFT); } fft(-1, u1, u2); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_FFT); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_CHECKSUM); } checksum(iter, u2, dims[0]); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_CHECKSUM); } } #pragma omp single verify(NX, NY, NZ, niter, &verified, &class); #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* end parallel */ timer_stop(T_TOTAL); total_time = timer_read(T_TOTAL); if( total_time != 0.0) { mflops = 1.0e-6*(double)(NTOTAL) * (14.8157+7.19641*log((double)(NTOTAL)) + (5.23518+7.21113*log((double)(NTOTAL)))*niter) /total_time; } else { mflops = 0.0; } #ifdef BOMP backend_create_time(arg); #endif printf("Computetime %d %f\n", arg, total_time); printf("client done\n"); /* c_print_results("FT", class, NX, NY, NZ, niter, nthreads, */ /* total_time, mflops, " floating point", verified, */ /* NPBVERSION, COMPILETIME, */ /* CS1, CS2, CS3, CS4, CS5, CS6, CS7); */ if (TIMERS_ENABLED == TRUE) print_timers(); }
int main(int argc, char *argv[]) { int i; int iter; double total_time, mflops; logical verified; char Class; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } //--------------------------------------------------------------------- // Run the entire problem once to make sure all data is touched. // This reduces variable startup costs, which is important for such a // short benchmark. The other NPB 2 implementations are similar. //--------------------------------------------------------------------- for (i = 1; i <= T_max; i++) { timer_clear(i); } setup(); setup_opencl(argc, argv); init_ui(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]); compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]); compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]); fft_init(dims[0]); fft(1, &m_u1, &m_u0); //--------------------------------------------------------------------- // Start over from the beginning. Note that all operations must // be timed, in contrast to other benchmarks. //--------------------------------------------------------------------- for (i = 1; i <= T_max; i++) { timer_clear(i); } timer_start(T_total); if (timers_enabled) timer_start(T_setup); DTIMER_START(T_compute_im); compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]); DTIMER_STOP(T_compute_im); DTIMER_START(T_compute_ics); compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]); DTIMER_STOP(T_compute_ics); DTIMER_START(T_fft_init); fft_init(dims[0]); DTIMER_STOP(T_fft_init); if (timers_enabled) timer_stop(T_setup); if (timers_enabled) timer_start(T_fft); fft(1, &m_u1, &m_u0); if (timers_enabled) timer_stop(T_fft); for (iter = 1; iter <= niter; iter++) { if (timers_enabled) timer_start(T_evolve); evolve(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]); if (timers_enabled) timer_stop(T_evolve); if (timers_enabled) timer_start(T_fft); fft(-1, &m_u1, &m_u1); if (timers_enabled) timer_stop(T_fft); if (timers_enabled) timer_start(T_checksum); checksum(iter, &m_u1, dims[0], dims[1], dims[2]); if (timers_enabled) timer_stop(T_checksum); } verify(NX, NY, NZ, niter, &verified, &Class); timer_stop(T_total); total_time = timer_read(T_total); if (total_time != 0.0) { mflops = 1.0e-6 * (double)NTOTAL * (14.8157 + 7.19641 * log((double)NTOTAL) + (5.23518 + 7.21113 * log((double)NTOTAL)) * niter) / total_time; } else { mflops = 0.0; } c_print_results("FT", Class, NX, NY, NZ, niter, total_time, mflops, " floating point", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7, clu_GetDeviceTypeName(device_type), device_name); if (timers_enabled) print_timers(); release_opencl(); fflush(stdout); return 0; }
int main(int argc, char **argv) #endif { int i, niter, step; double mflops, t, tmax; logical verified; char class; double tsum[t_last+2], t1[t_last+2], tming[t_last+2], tmaxg[t_last+2]; char *t_recs[t_last+2] = { "total", "rhs", "xsolve", "ysolve", "zsolve", "bpack", "exch", "xcomm", "ycomm", "zcomm", " totcomp", " totcomm" }; //--------------------------------------------------------------------- // Root node reads input file (if it exists) else takes // defaults from parameters //--------------------------------------------------------------------- printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL-MD) - SP Benchmark\n\n"); FILE *fp; fp = fopen("timer.flag", "r"); timeron = false; if (fp != NULL) { timeron = true; fclose(fp); } if ((fp = fopen("inputsp.data", "r")) != NULL) { int result; printf(" Reading from input file inputsp.data\n"); result = fscanf(fp, "%d", &niter); while (fgetc(fp) != '\n'); result = fscanf(fp, "%*f"); while (fgetc(fp) != '\n'); result = fscanf(fp, "%d%d%d", &grid_points[0], &grid_points[1], &grid_points[2]); fclose(fp); } else { printf(" No input file inputsp.data. Using compiled defaults\n"); niter = NITER_DEFAULT; grid_points[0] = PROBLEM_SIZE; grid_points[1] = PROBLEM_SIZE; grid_points[2] = PROBLEM_SIZE; } setup_opencl(argc, argv); printf(" Size: %4dx%4dx%4d\n", grid_points[0], grid_points[1], grid_points[2]); printf(" Iterations: %4d", niter); if (num_devices != MAXCELLS*MAXCELLS) printf(" WARNING: compiled for %5d devices \n", MAXCELLS*MAXCELLS); printf(" Number of active devices: %5d\n\n", num_devices); make_set(); for (i = 0; i < t_last; i++) { timer_clear(i); } set_constants(); initialize(); lhsinit(); exact_rhs(); compute_buffer_size(5); set_kernel_args(); //--------------------------------------------------------------------- // do one time step to touch all code, and reinitialize //--------------------------------------------------------------------- #ifdef MINIMD_SNUCL_OPTIMIZATIONS // set cmd queue property for(i = 0; i < num_devices; i++) { clSetCommandQueueProperty(cmd_queue[i], CL_QUEUE_AUTO_DEVICE_SELECTION | //CL_QUEUE_ITERATIVE | CL_QUEUE_COMPUTE_INTENSIVE, true, NULL); } #endif adi(); #ifdef MINIMD_SNUCL_OPTIMIZATIONS for(i = 0; i < num_devices; i++) { clSetCommandQueueProperty(cmd_queue[i], 0, true, NULL); } #endif initialize(); //--------------------------------------------------------------------- // Synchronize before placing time stamp //--------------------------------------------------------------------- for (i = 0; i < t_last; i++) { timer_clear(i); } timer_clear(0); timer_start(0); for (step = 1; step <= niter; step++) { if ((step % 20) == 0 || step == 1) { printf(" Time step %4d\n", step); } adi(); } timer_stop(0); t = timer_read(0); verify(niter, &class, &verified); tmax = t; if( tmax != 0.0 ) { mflops = (881.174*(double)( PROBLEM_SIZE*PROBLEM_SIZE*PROBLEM_SIZE ) -4683.91*(double)( PROBLEM_SIZE*PROBLEM_SIZE ) +11484.5*(double)( PROBLEM_SIZE ) -19272.4) * (double)( niter ) / (tmax*1000000.0); } else { mflops = 0.0; } c_print_results("SP", class, grid_points[0], grid_points[1], grid_points[2], niter, tmax, mflops, " floating point", verified, NPBVERSION,COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7, clu_GetDeviceTypeName(device_type), device_name, num_devices); if (timeron) { /* for (i = 0; i < t_last; i++) { t1[i] = timer_read(i); } t1[t_xsolve] = t1[t_xsolve] - t1[t_xcomm]; t1[t_ysolve] = t1[t_ysolve] - t1[t_ycomm]; t1[t_zsolve] = t1[t_zsolve] - t1[t_zcomm]; t1[t_last+2] = t1[t_xcomm]+t1[t_ycomm]+t1[t_zcomm]+t1[t_exch]; t1[t_last+1] = t1[t_total] - t1[t_last+2]; MPI_Reduce(&t1, tsum, t_last+2, dp_type, MPI_SUM, 0, comm_setup); MPI_Reduce(&t1, tming, t_last+2, dp_type, MPI_MIN, 0, comm_setup); MPI_Reduce(&t1, tmaxg, t_last+2, dp_type, MPI_MAX, 0, comm_setup); if (node == 0) { printf(" nprocs =%6d minimum maximum average\n", total_nodes); for (i = 0; i < t_last+2; i++) { tsum[i] = tsum[i] / total_nodes; printf(" timer %2d(%8s) : %10.4f %10.4f %10.4f\n", i+1, t_recs[i], tming[i], tmaxg[i], tsum[i]); } } */ } release_opencl(); return 0; }
void hwtimer_arch_unset(short timer) { timer_clear(HW_TIMER, timer); }
int main(int argc,char **argv ){ int my_rank,comm_size; int i; DGraph *dg=NULL; int verified=0, featnum=0; double bytes_sent=2.0,tot_time=0.0; MPI_Init( &argc, &argv ); MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); MPI_Comm_size( MPI_COMM_WORLD, &comm_size ); TRACE_smpi_set_category ("begin"); if(argc!=2|| ( strncmp(argv[1],"BH",2)!=0 &&strncmp(argv[1],"WH",2)!=0 &&strncmp(argv[1],"SH",2)!=0 ) ){ if(my_rank==0){ fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n"); fprintf(stderr,"** Where \n - N is integer number of MPI processes\n"); fprintf(stderr," - S is the class S, W, or A \n"); fprintf(stderr," - GraphName is the communication graph name BH, WH, or SH.\n"); fprintf(stderr," - the number of MPI processes N should not be be less than \n"); fprintf(stderr," the number of nodes in the graph\n"); } MPI_Finalize(); exit(0); } if(strncmp(argv[1],"BH",2)==0){ dg=buildBH(CLASS); }else if(strncmp(argv[1],"WH",2)==0){ dg=buildWH(CLASS); }else if(strncmp(argv[1],"SH",2)==0){ dg=buildSH(CLASS); } if(timer_on&&dg->numNodes+1>timers_tot){ timer_on=0; if(my_rank==0) fprintf(stderr,"Not enough timers. Node timeing is off. \n"); } if(dg->numNodes>comm_size){ if(my_rank==0){ fprintf(stderr,"** The number of MPI processes should not be less than \n"); fprintf(stderr,"** the number of nodes in the graph\n"); fprintf(stderr,"** Number of MPI processes = %d\n",comm_size); fprintf(stderr,"** Number nodes in the graph = %d\n",dg->numNodes); } MPI_Finalize(); exit(0); } for(i=0;i<dg->numNodes;i++){ dg->node[i]->address=i; } if( my_rank == 0 ){ printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" ); graphShow(dg,0); timer_clear(0); timer_start(0); } verified=ProcessNodes(dg,my_rank); TRACE_smpi_set_category ("end"); featnum=NUM_SAMPLES*fielddim; bytes_sent=featnum*dg->numArcs; bytes_sent/=1048576; if(my_rank==0){ timer_stop(0); tot_time=timer_read(0); c_print_results( dg->name, CLASS, featnum, 0, 0, dg->numNodes, 0, comm_size, tot_time, bytes_sent/tot_time, "bytes transmitted", verified, NPBVERSION, COMPILETIME, MPICC, CLINK, CMPI_LIB, CMPI_INC, CFLAGS, CLINKFLAGS ); } MPI_Finalize(); return 1; }
int main( int argc, char **argv ) { int i, iteration; double timecounter; FILE *fp; cl_int ecode; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } /* Initialize timers */ timer_on = 0; if ((fp = fopen("timer.flag", "r")) != NULL) { fclose(fp); timer_on = 1; } timer_clear( 0 ); if (timer_on) { timer_clear( 1 ); timer_clear( 2 ); timer_clear( 3 ); } if (timer_on) timer_start( 3 ); /* Initialize the verification arrays if a valid class */ for( i=0; i<TEST_ARRAY_SIZE; i++ ) switch( CLASS ) { case 'S': test_index_array[i] = S_test_index_array[i]; test_rank_array[i] = S_test_rank_array[i]; break; case 'A': test_index_array[i] = A_test_index_array[i]; test_rank_array[i] = A_test_rank_array[i]; break; case 'W': test_index_array[i] = W_test_index_array[i]; test_rank_array[i] = W_test_rank_array[i]; break; case 'B': test_index_array[i] = B_test_index_array[i]; test_rank_array[i] = B_test_rank_array[i]; break; case 'C': test_index_array[i] = C_test_index_array[i]; test_rank_array[i] = C_test_rank_array[i]; break; case 'D': test_index_array[i] = D_test_index_array[i]; test_rank_array[i] = D_test_rank_array[i]; break; }; /* set up the OpenCL environment. */ setup_opencl(argc, argv); /* Printout initial NPB info */ printf( "\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - IS Benchmark\n\n" ); printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS, CLASS ); printf( " Iterations: %d\n", MAX_ITERATIONS ); if (timer_on) timer_start( 1 ); /* Generate random number sequence and subsequent keys on all procs */ create_seq( 314159265.00, /* Random number gen seed */ 1220703125.00 ); /* Random number gen mult */ if (timer_on) timer_stop( 1 ); /* Do one interation for free (i.e., untimed) to guarantee initialization of all data and code pages and respective tables */ rank( 1 ); /* Start verification counter */ passed_verification = 0; DTIMER_START(T_BUFFER_WRITE); ecode = clEnqueueWriteBuffer(cmd_queue, m_passed_verification, CL_TRUE, 0, sizeof(cl_int), &passed_verification, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_passed_verification"); DTIMER_STOP(T_BUFFER_WRITE); if( CLASS != 'S' ) printf( "\n iteration\n" ); /* Start timer */ timer_start( 0 ); /* This is the main iteration */ for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) { if( CLASS != 'S' ) printf( " %d\n", iteration ); rank( iteration ); } DTIMER_START(T_BUFFER_READ); ecode = clEnqueueReadBuffer(cmd_queue, m_passed_verification, CL_TRUE, 0, sizeof(cl_int), &passed_verification, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueReadBuffer() for m_passed_verification"); DTIMER_STOP(T_BUFFER_READ); /* End of timing, obtain maximum time of all processors */ timer_stop( 0 ); timecounter = timer_read( 0 ); /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */ if (timer_on) timer_start( 2 ); full_verify(); if (timer_on) timer_stop( 2 ); if (timer_on) timer_stop( 3 ); /* The final printout */ if( passed_verification != 5*MAX_ITERATIONS + 1 ) passed_verification = 0; c_print_results( "IS", CLASS, (int)(TOTAL_KEYS/64), 64, 0, MAX_ITERATIONS, timecounter, ((double) (MAX_ITERATIONS*TOTAL_KEYS)) /timecounter/1000000., "keys ranked", passed_verification, NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC, CFLAGS, CLINKFLAGS, "", clu_GetDeviceTypeName(device_type), device_name); /* Print additional timers */ if (timer_on) { double t_total, t_percent; t_total = timer_read( 3 ); printf("\nAdditional timers -\n"); printf(" Total execution: %8.3f\n", t_total); if (t_total == 0.0) t_total = 1.0; timecounter = timer_read(1); t_percent = timecounter/t_total * 100.; printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(0); t_percent = timecounter/t_total * 100.; printf(" Benchmarking : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(2); t_percent = timecounter/t_total * 100.; printf(" Sorting : %8.3f (%5.2f%%)\n", timecounter, t_percent); } release_opencl(); fflush(stdout); return 0; /**************************/ } /* E N D P R O G R A M */
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char *argv[]) { cl_int ecode; char *source_dir = "IS"; if (argc > 1) source_dir = argv[1]; #ifdef TIMER_DETAIL if (timer_on) { int i; for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i); } #endif DTIMER_START(T_OPENCL_API); // 1. Find the default device type and get a device for the device type device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // Device information ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_sizes), &work_item_sizes, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); // FIXME: The below values are experimental. if (max_work_group_size > 256) { max_work_group_size = 256; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > 256) { work_item_sizes[i] = 256; } } } // 2. Create a context for the specified device context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); // 3. Create a command queue cmd_queue = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); DTIMER_STOP(T_OPENCL_API); // 4. Build the program DTIMER_START(T_BUILD); char *source_file; char build_option[30]; if (device_type == CL_DEVICE_TYPE_CPU) { source_file = "is_cpu.cl"; sprintf(build_option, "-DCLASS=%d -I.", CLASS); CREATE_SEQ_GROUP_SIZE = 64; CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256; RANK_GROUP_SIZE = 1; RANK_GLOBAL_SIZE = RANK_GROUP_SIZE * 128; RANK1_GROUP_SIZE = 1; RANK1_GLOBAL_SIZE = RANK1_GROUP_SIZE * RANK_GLOBAL_SIZE;; RANK2_GROUP_SIZE = RANK_GROUP_SIZE; RANK2_GLOBAL_SIZE = RANK_GLOBAL_SIZE;; FV2_GROUP_SIZE = 64; FV2_GLOBAL_SIZE = FV2_GROUP_SIZE * 256; } else if (device_type == CL_DEVICE_TYPE_GPU) { source_file = "is_gpu.cl"; sprintf(build_option, "-DCLASS=\'%c\' -I.", CLASS); CREATE_SEQ_GROUP_SIZE = 64; CREATE_SEQ_GLOBAL_SIZE = CREATE_SEQ_GROUP_SIZE * 256; RANK1_GROUP_SIZE = work_item_sizes[0]; RANK1_GLOBAL_SIZE = MAX_KEY; RANK2_GROUP_SIZE = work_item_sizes[0]; RANK2_GLOBAL_SIZE = NUM_KEYS; FV2_GROUP_SIZE = work_item_sizes[0]; FV2_GLOBAL_SIZE = NUM_KEYS; } else { fprintf(stderr, "%s: not supported.", clu_GetDeviceTypeName(device_type)); exit(EXIT_FAILURE); } program = clu_MakeProgram(context, device, source_dir, source_file, build_option); DTIMER_STOP(T_BUILD); // 5. Create buffers DTIMER_START(T_BUFFER_CREATE); m_key_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * SIZE_OF_BUFFERS, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_array"); m_key_buff1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * MAX_KEY, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1"); m_key_buff2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * SIZE_OF_BUFFERS, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_buff2"); size_t test_array_size = sizeof(INT_TYPE) * TEST_ARRAY_SIZE; m_index_array = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, test_array_size, test_index_array, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_index_array"); m_rank_array = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, test_array_size, test_rank_array, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rank_array"); m_partial_vals = clCreateBuffer(context, CL_MEM_WRITE_ONLY, test_array_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_partial_vals"); m_passed_verification = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_passed_verification"); if (device_type == CL_DEVICE_TYPE_GPU) { m_key_scan = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * MAX_KEY, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_key_buff1_scan"); m_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(INT_TYPE) * work_item_sizes[0], NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_sum"); } else { size_t bs_size = RANK_GLOBAL_SIZE * sizeof(INT_TYPE) * NUM_BUCKETS; m_bucket_size = clCreateBuffer(context, CL_MEM_READ_WRITE, bs_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_bucket_size"); m_bucket_ptrs = clCreateBuffer(context, CL_MEM_READ_WRITE, bs_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_bucket_ptrs"); } DTIMER_STOP(T_BUFFER_CREATE); // 6. Create kernels DTIMER_START(T_OPENCL_API); k_rank0 = clCreateKernel(program, "rank0", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank0"); ecode = clSetKernelArg(k_rank0, 0, sizeof(cl_mem), (void*)&m_key_array); ecode |= clSetKernelArg(k_rank0, 1, sizeof(cl_mem), (void*)&m_partial_vals); ecode |= clSetKernelArg(k_rank0, 2, sizeof(cl_mem), (void*)&m_index_array); clu_CheckError(ecode, "clSetKernelArg() for rank0"); if (device_type == CL_DEVICE_TYPE_GPU) { k_rank1 = clCreateKernel(program, "rank1", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank1"); ecode = clSetKernelArg(k_rank1, 0, sizeof(cl_mem), (void*)&m_key_buff1); clu_CheckError(ecode, "clSetKernelArg() for rank1"); k_rank2 = clCreateKernel(program, "rank2", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank2"); ecode = clSetKernelArg(k_rank2, 0, sizeof(cl_mem), (void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem), (void*)&m_key_array); clu_CheckError(ecode, "clSetKernelArg() for rank2"); k_rank3_0 = clCreateKernel(program, "rank3_0", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3_0"); ecode = clSetKernelArg(k_rank3_0, 0, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3_0, 1, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3_0, 2, sizeof(cl_mem),(void*)&m_sum); ecode |= clSetKernelArg(k_rank3_0, 3, sizeof(INT_TYPE) * work_item_sizes[0] * 2, NULL); clu_CheckError(ecode, "clSetKernelArg() for rank3_0"); k_rank3_1 = clCreateKernel(program, "rank3_1", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3_1"); ecode = clSetKernelArg(k_rank3_1, 0, sizeof(cl_mem), (void*)&m_sum); ecode = clSetKernelArg(k_rank3_1, 1, sizeof(cl_mem), (void*)&m_sum); ecode |= clSetKernelArg(k_rank3_1, 2, sizeof(INT_TYPE) * work_item_sizes[0] * 2, NULL); clu_CheckError(ecode, "clSetKernelArg() for rank3_1"); k_rank3_2 = clCreateKernel(program, "rank3_2", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3_2"); ecode = clSetKernelArg(k_rank3_2, 0, sizeof(cl_mem),(void*)&m_key_buff1); ecode = clSetKernelArg(k_rank3_2, 1, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3_2, 2, sizeof(cl_mem),(void*)&m_sum); clu_CheckError(ecode, "clSetKernelArg() for rank3_2"); } else { k_rank1 = clCreateKernel(program, "rank1", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank1"); ecode = clSetKernelArg(k_rank1, 0, sizeof(cl_mem),(void*)&m_key_array); ecode |= clSetKernelArg(k_rank1, 1, sizeof(cl_mem),(void*)&m_bucket_size); clu_CheckError(ecode, "clSetKernelArg() for rank1"); k_rank2 = clCreateKernel(program, "rank2", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank2"); ecode = clSetKernelArg(k_rank2, 0, sizeof(cl_mem),(void*)&m_key_array); ecode |= clSetKernelArg(k_rank2, 1, sizeof(cl_mem),(void*)&m_bucket_size); ecode |= clSetKernelArg(k_rank2, 2, sizeof(cl_mem),(void*)&m_bucket_ptrs); ecode |= clSetKernelArg(k_rank2, 3, sizeof(cl_mem),(void*)&m_key_buff2); clu_CheckError(ecode, "clSetKernelArg() for rank2"); k_rank3 = clCreateKernel(program, "rank3", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank3"); ecode = clSetKernelArg(k_rank3, 0, sizeof(cl_mem),(void*)&m_bucket_size); ecode |= clSetKernelArg(k_rank3, 1, sizeof(cl_mem),(void*)&m_bucket_ptrs); ecode |= clSetKernelArg(k_rank3, 2, sizeof(cl_mem),(void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank3, 3, sizeof(cl_mem),(void*)&m_key_buff2); clu_CheckError(ecode, "clSetKernelArg() for rank3"); } k_rank4 = clCreateKernel(program, "rank4", &ecode); clu_CheckError(ecode, "clCreateKernel() for rank4"); ecode = clSetKernelArg(k_rank4, 0, sizeof(cl_mem), (void*)&m_partial_vals); ecode |= clSetKernelArg(k_rank4, 1, sizeof(cl_mem), (void*)&m_key_buff1); ecode |= clSetKernelArg(k_rank4, 2, sizeof(cl_mem), (void*)&m_rank_array); ecode |= clSetKernelArg(k_rank4, 3, sizeof(cl_mem), (void*)&m_passed_verification); clu_CheckError(ecode, "clSetKernelArg() for rank4"); DTIMER_STOP(T_OPENCL_API); }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- void setup_opencl(int argc, char *argv[]) { cl_int err_code; char *source_dir = "EP"; if (argc > 1) source_dir = argv[1]; #ifdef TIMER_DETAIL if (timers_enabled) { int i; for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i); } #endif DTIMER_START(T_OPENCL_API); // 1. Find the default device type and get a device for the device type device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // 2. Create a context for the specified device context = clCreateContext(NULL, 1, &device, NULL, NULL, &err_code); clu_CheckError(err_code, "clCreateContext()"); // 3. Create a command queue cmd_queue = clCreateCommandQueue(context, device, 0, &err_code); clu_CheckError(err_code, "clCreateCommandQueue()"); DTIMER_STOP(T_OPENCL_API); // 4. Build the program DTIMER_START(T_BUILD); char *source_file; char build_option[30]; sprintf(build_option, "-DM=%d -I.", M); if (device_type == CL_DEVICE_TYPE_CPU) { source_file = "ep_cpu.cl"; GROUP_SIZE = 16; } else { source_file = "ep_gpu.cl"; GROUP_SIZE = 64; } program = clu_MakeProgram(context, device, source_dir, source_file, build_option); DTIMER_STOP(T_BUILD); // 5. Create buffers DTIMER_START(T_BUFFER_CREATE); gq_size = np / GROUP_SIZE * NQ * sizeof(double); gsx_size = np / GROUP_SIZE * sizeof(double); gsy_size = np / GROUP_SIZE * sizeof(double); pgq = clCreateBuffer(context, CL_MEM_READ_WRITE, gq_size, NULL, &err_code); clu_CheckError(err_code, "clCreateBuffer() for pgq"); pgsx = clCreateBuffer(context, CL_MEM_READ_WRITE, gsx_size,NULL, &err_code); clu_CheckError(err_code, "clCreateBuffer() for pgsx"); pgsy = clCreateBuffer(context, CL_MEM_READ_WRITE, gsy_size,NULL, &err_code); clu_CheckError(err_code, "clCreateBuffer() for pgsy"); DTIMER_STOP(T_BUFFER_CREATE); // 6. Create a kernel DTIMER_START(T_OPENCL_API); kernel = clCreateKernel(program, "embar", &err_code); clu_CheckError(err_code, "clCreateKernel()"); DTIMER_STOP(T_OPENCL_API); }
int main (int argc, char **argv) { //auto double *_ppthd_x; auto double Mops; auto double t1; auto double t2; auto double t3; auto double t4; auto double x1; auto double x2; auto double sx; auto double sy; auto double tm; auto double an; auto double tt; auto double gc; auto double dum[3]; auto int np; auto int ierr; auto int node; auto int no_nodes; auto int i; auto int ik; auto int kk; auto int l; auto int k; auto int nit; auto int ierrcode; auto int no_large_nodes; auto int np_add; auto int k_offset; auto int j; auto int nthreads; auto int verified; auto char size[14]; int status = 0; _ompc_init(argc,argv); //(_ppthd_x) = (((double *) (_ompc_get_thdprv (&_thdprv_x, 1048576, x)))); (*(dum)) = (1.0); (*((dum) + (1))) = (1.0); (*((dum) + (2))) = (1.0); (nthreads) = (1); # 84 "ep.c" printf ("\012\012 NAS Parallel Benchmarks 2.3 OpenMP C version - EP Benchmark\012"); # 86 "ep.c" sprintf (size, "%12.0f", pow (2.0, (28) + (1))); # 87 "ep.c" for ((j) = (13); (j) >= (1); (j)--) { # 88 "ep.c" if ((((int) (*((size) + (j))))) == (46)) { (*((size) + (j))) = (((char) (32))); } } # 90 "ep.c" printf (" Number of random numbers generated: %13s\012", size); # 92 "ep.c" (verified) = (0); # 99 "ep.c" (np) = ((1) << ((28) - (16))); # 107 "ep.c" vranlc (0, (dum) + (0), *((dum) + (1)), (dum) + (2)); # 108 "ep.c" (*((dum) + (0))) = (randlc ((dum) + (1), *((dum) + (2)))); # 109 "ep.c" for ((i) = (0); (i) < ((2) * ((1) << (16))); (i)++) { x[i] = (-(1.0E99)); //(*((_ppthd_x) + (i))) = (-(1.0E99)); } # 110 "ep.c" (Mops) = (log (sqrt (fabs (((1.0) > (1.0)) ? (1.0) : (1.0))))); # 112 "ep.c" timer_clear (1); # 113 "ep.c" timer_clear (2); # 114 "ep.c" timer_clear (3); # 115 "ep.c" timer_start (1); # 117 "ep.c" vranlc (0, &(t1), 1.220703125E9, x); //vranlc (0, &(t1), 1.220703125E9, _ppthd_x); # 121 "ep.c" (t1) = (1.220703125E9); # 123 "ep.c" for ((i) = (1); (i) <= ((16) + (1)); (i)++) { # 124 "ep.c" (t2) = (randlc (&(t1), t1)); } # 127 "ep.c" (an) = (t1); # 128 "ep.c" (tt) = (2.71828183E8); # 129 "ep.c" (gc) = (0.0); # 130 "ep.c" (sx) = (0.0); # 131 "ep.c" (sy) = (0.0); # 133 "ep.c" for ((i) = (0); (i) <= ((10) - (1)); (i)++) { # 134 "ep.c" (*((q) + (i))) = (0.0); } # 142 "ep.c" (k_offset) = (-(1)); { auto void *__ompc_argv[6]; (*(__ompc_argv)) = (((void *) (&sx))); (*((__ompc_argv) + (1))) = (((void *) (&sy))); (*((__ompc_argv) + (2))) = (((void *) (&np))); (*((__ompc_argv) + (3))) = (((void *) (&k_offset))); (*((__ompc_argv) + (4))) = (((void *) (&an))); (*((__ompc_argv) + (5))) = (((void *) (&nthreads))); _ompc_do_parallel (__ompc_func_3, __ompc_argv); } # 207 "ep.c" for ((i) = (0); (i) <= ((10) - (1)); (i)++) { # 208 "ep.c" (gc) = ((gc) + (*((q) + (i)))); } # 211 "ep.c" timer_stop (1); # 212 "ep.c" (tm) = (timer_read (1)); # 214 "ep.c" (nit) = (0); # 215 "ep.c" if ((28) == (24)) { # 216 "ep.c" if (((fabs (((sx) - (-(3247.83465203474))) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(6958.407078382297))) / (sy))) <= (1.0E-8))) { # 218 "ep.c" (verified) = (1); } } else # 220 "ep.c" if ((28) == (25)) { # 221 "ep.c" if (((fabs (((sx) - (-(2863.319731645753))) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(6320.053679109499))) / (sy))) <= (1.0E-8))) { # 223 "ep.c" (verified) = (1); } } else # 225 "ep.c" if ((28) == (28)) { # 226 "ep.c" if (((fabs (((sx) - (-(4295.875165629892))) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(15807.32573678431))) / (sy))) <= (1.0E-8))) { # 228 "ep.c" (verified) = (1); printf("Debug:ompc_manual. 359, sx is:%f, sy is:%f\n",sx,sy); } } else # 230 "ep.c" if ((28) == (30)) { # 231 "ep.c" if (((fabs (((sx) - (40338.15542441498)) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(26606.69192809235))) / (sy))) <= (1.0E-8))) { # 233 "ep.c" (verified) = (1); } } else # 235 "ep.c" if ((28) == (32)) { # 236 "ep.c" if (((fabs (((sx) - (47643.67927995374)) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(80840.72988043731))) / (sy))) <= (1.0E-8))) { # 238 "ep.c" (verified) = (1); } } # 242 "ep.c" (Mops) = (((pow (2.0, (28) + (1))) / (tm)) / (1000000.0)); # 244 "ep.c" printf ("EP Benchmark Results: \012CPU Time = %10.4f\012N = 2^%5d\012No. Gaussian Pairs = %15.0f\012Sums = %25.15e %25.15e\012Counts:\012", tm, 28, gc, sx, sy); # 251 "ep.c" for ((i) = (0); (i) <= ((10) - (1)); (i)++) { # 252 "ep.c" printf ("%3d %15.0f\012", i, *((q) + (i))); } # 255 "ep.c" c_print_results ("EP", 65, (28) + (1), 0, 0, nit, nthreads, tm, Mops, "Random numbers generated", verified, "2.3", "07 Aug 2006", "omcc", "$(CC)", "(none)", "-I../common", "-t", "-lm", "randdp"); # 261 "ep.c" if ((0) == (1)) { # 262 "ep.c" printf ("Total time: %f", timer_read (1)); # 263 "ep.c" printf ("Gaussian pairs: %f", timer_read (2)); # 264 "ep.c" printf ("Random numbers: %f", timer_read (3)); } }
int main(int argc, char *argv[]) { double Mops, t1, t2; double tsx, tsy, tm, an, tt, gc; double sx_verify_value, sy_verify_value, sx_err, sy_err; int i, nit; int k_offset, j; logical verified; char size[16]; FILE *fp; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } if ((fp = fopen("timer.flag", "r")) == NULL) { timers_enabled = false; } else { timers_enabled = true; fclose(fp); } //-------------------------------------------------------------------- // Because the size of the problem is too large to store in a 32-bit // integer for some classes, we put it into a string (for printing). // Have to strip off the decimal point put in there by the floating // point print statement (internal file) //-------------------------------------------------------------------- sprintf(size, "%15.0lf", pow(2.0, M+1)); j = 14; if (size[j] == '.') j--; size[j+1] = '\0'; printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n"); printf("\n Number of random numbers generated: %15s\n", size); verified = false; //-------------------------------------------------------------------- // Compute the number of "batches" of random number pairs generated // per processor. Adjust if the number of processors does not evenly // divide the total number //-------------------------------------------------------------------- np = NN; setup_opencl(argc, argv); timer_clear(0); timer_start(0); //-------------------------------------------------------------------- // Compute AN = A ^ (2 * NK) (mod 2^46). //-------------------------------------------------------------------- t1 = A; for (i = 0; i < MK + 1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; //-------------------------------------------------------------------- // Each instance of this loop may be performed independently. We compute // the k offsets separately to take into account the fact that some nodes // have more numbers to generate than others //-------------------------------------------------------------------- k_offset = -1; DTIMER_START(T_KERNEL_EMBAR); // Launch the kernel int q_size = GROUP_SIZE * NQ * sizeof(cl_double); int sx_size = GROUP_SIZE * sizeof(cl_double); int sy_size = GROUP_SIZE * sizeof(cl_double); err_code = clSetKernelArg(kernel, 0, q_size, NULL); err_code |= clSetKernelArg(kernel, 1, sx_size, NULL); err_code |= clSetKernelArg(kernel, 2, sy_size, NULL); err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq); err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx); err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy); err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset); err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an); clu_CheckError(err_code, "clSetKernelArg()"); size_t localWorkSize[] = { GROUP_SIZE }; size_t globalWorkSize[] = { np }; err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_EMBAR); double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size); double *gsx = (double*)malloc(gsx_size); double *gsy = (double*)malloc(gsy_size); gc = 0.0; tsx = 0.0; tsy = 0.0; for (i = 0; i < NQ; i++) { q[i] = 0.0; } // 9. Get the result DTIMER_START(T_BUFFER_READ); err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, gq, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, gsx, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, gsy, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); DTIMER_STOP(T_BUFFER_READ); for (i = 0; i < np/localWorkSize[0]; i++) { for (j = 0; j < NQ; j++ ){ q[j] = q[j] + gq[i][j]; } tsx = tsx + gsx[i]; tsy = tsy + gsy[i]; } for (i = 0; i < NQ; i++) { gc = gc + q[i]; } timer_stop(0); tm = timer_read(0); nit = 0; verified = true; if (M == 24) { sx_verify_value = -3.247834652034740e+3; sy_verify_value = -6.958407078382297e+3; } else if (M == 25) { sx_verify_value = -2.863319731645753e+3; sy_verify_value = -6.320053679109499e+3; } else if (M == 28) { sx_verify_value = -4.295875165629892e+3; sy_verify_value = -1.580732573678431e+4; } else if (M == 30) { sx_verify_value = 4.033815542441498e+4; sy_verify_value = -2.660669192809235e+4; } else if (M == 32) { sx_verify_value = 4.764367927995374e+4; sy_verify_value = -8.084072988043731e+4; } else if (M == 36) { sx_verify_value = 1.982481200946593e+5; sy_verify_value = -1.020596636361769e+5; } else if (M == 40) { sx_verify_value = -5.319717441530e+05; sy_verify_value = -3.688834557731e+05; } else { verified = false; } if (verified) { sx_err = fabs((tsx - sx_verify_value) / sx_verify_value); sy_err = fabs((tsy - sy_verify_value) / sy_verify_value); verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON)); } Mops = pow(2.0, M+1) / tm / 1000000.0; printf("\nEP Benchmark Results:\n\n"); printf("CPU Time =%10.4lf\n", tm); printf("N = 2^%5d\n", M); printf("No. Gaussian Pairs = %15.0lf\n", gc); printf("Sums = %25.15lE %25.15lE\n", tsx, tsy); printf("Counts: \n"); for (i = 0; i < NQ; i++) { printf("%3d%15.0lf\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7, clu_GetDeviceTypeName(device_type), device_name); if (timers_enabled) { if (tm <= 0.0) tm = 1.0; tt = timer_read(0); printf("\nTotal time: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); } free(gq); free(gsx); free(gsy); release_opencl(); fflush(stdout); return 0; }
/* c This is the serial version of the APP Benchmark 1, c the "embarassingly parallel" benchmark. c c M is the Log_2 of the number of complex pairs of uniform (0, 1) random c numbers. MK is the Log_2 of the size of each batch of uniform random c numbers. MK can be set for convenience on a given system, since it does c not affect the results. */ int main(int argc, char **argv) { double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc; double dum[3] = { 1.0, 1.0, 1.0 }; int np, ierr, node, no_nodes, i, ik, kk, l, k, nit, ierrcode, no_large_nodes, np_add, k_offset, j; int nthreads = 1; boolean verified; char size[13+1]; /* character*13 */ /* c Because the size of the problem is too large to store in a 32-bit c integer for some classes, we put it into a string (for printing). c Have to strip off the decimal point put in there by the floating c point print statement (internal file) */ #ifndef POSIX #ifndef NOBOMP bomp_custom_init(); #endif #endif omp_set_num_threads(1); printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version" " - EP Benchmark\n"); sprintf(size, "%12.0f", pow(2.0, M+1)); for (j = 13; j >= 1; j--) { if (size[j] == '.') size[j] = ' '; } printf(" Number of random numbers generated: %13s\n", size); verified = FALSE; /* c Compute the number of "batches" of random number pairs generated c per processor. Adjust if the number of processors does not evenly c divide the total number */ np = NN; /* c Call the random number generator functions and initialize c the x-array to reduce the effects of paging on the timings. c Also, call all mathematical functions that are used. Make c sure these initializations cannot be eliminated as dead code. */ vranlc(0, &(dum[0]), dum[1], &(dum[2])); dum[0] = randlc(&(dum[1]), dum[2]); for (i = 0; i < 2*NK; i++) { x[i] = -1.0e99; } printf("Reached here "); Mops = log(sqrt(fabs(max(1.0, 1.0)))); timer_clear(1); timer_clear(2); timer_clear(3); timer_start(1); vranlc(0, &t1, A, x); /* Compute AN = A ^ (2 * NK) (mod 2^46). */ t1 = A; for ( i = 1; i <= MK+1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; gc = 0.0; sx = 0.0; sy = 0.0; for ( i = 0; i <= NQ - 1; i++) { q[i] = 0.0; } /* c Each instance of this loop may be performed independently. We compute c the k offsets separately to take into account the fact that some nodes c have more numbers to generate than others */ k_offset = -1; #pragma omp parallel copyin(x) { double t1, t2, t3, t4, x1, x2; int kk, i, ik, l; double qq[NQ]; /* private copy of q[0:NQ-1] */ for (i = 0; i < NQ; i++) qq[i] = 0.0; #pragma omp for reduction(+:sx,sy) schedule(static) for (k = 1; k <= np; k++) { kk = k_offset + k; t1 = S; t2 = an; /* Find starting seed t1 for this kk. */ for (i = 1; i <= 100; i++) { ik = kk / 2; if (2 * ik != kk) t3 = randlc(&t1, t2); if (ik == 0) break; t3 = randlc(&t2, t2); kk = ik; } /* Compute uniform pseudorandom numbers. */ if (TIMERS_ENABLED == TRUE) timer_start(3); vranlc(2*NK, &t1, A, x-1); if (TIMERS_ENABLED == TRUE) timer_stop(3); /* c Compute Gaussian deviates by acceptance-rejection method and c tally counts in concentric square annuli. This loop is not c vectorizable. */ if (TIMERS_ENABLED == TRUE) timer_start(2); for ( i = 0; i < NK; i++) { x1 = 2.0 * x[2*i] - 1.0; x2 = 2.0 * x[2*i+1] - 1.0; t1 = pow2(x1) + pow2(x2); if (t1 <= 1.0) { t2 = sqrt(-2.0 * log(t1) / t1); t3 = (x1 * t2); /* Xi */ t4 = (x2 * t2); /* Yi */ l = max(fabs(t3), fabs(t4)); qq[l] += 1.0; /* counts */ sx = sx + t3; /* sum of Xi */ sy = sy + t4; /* sum of Yi */ } } if (TIMERS_ENABLED == TRUE) timer_stop(2); } #pragma omp critical { for (i = 0; i <= NQ - 1; i++) q[i] += qq[i]; } #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* end of parallel region */ for (i = 0; i <= NQ-1; i++) { gc = gc + q[i]; } timer_stop(1); tm = timer_read(1); nit = 0; if (M == 24) { if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 25) { if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 28) { if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 30) { if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 32) { if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) && (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) { verified = TRUE; } } Mops = pow(2.0, M+1)/tm/1000000.0; printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n" "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy); for (i = 0; i <= NQ-1; i++) { printf("%3d %15.0f\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, nthreads, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); if (TIMERS_ENABLED == TRUE) { printf("Total time: %f", timer_read(1)); printf("Gaussian pairs: %f", timer_read(2)); printf("Random numbers: %f", timer_read(3)); } }
void appft(int niter, double *total_time, logical *verified) { int i, j, k, kt, n12, n22, n32, ii, jj, kk, ii2, ik2; double ap; dcomplex exp1[NX], exp2[NY], exp3[NZ]; for (i = 1; i <= 15; i++) { timer_clear(i); } timer_start(2); compute_initial_conditions(NX, NY, NZ, xnt); CompExp(NX, exp1); CompExp(NY, exp2); CompExp(NZ, exp3); fftXYZ(1, NX, NY, NZ, xnt, (dcomplex *)y, exp1, exp2, exp3); timer_stop(2); timer_start(1); if (timers_enabled) timer_start(13); n12 = NX / 2; n22 = NY / 2; n32 = NZ / 2; ap = -4.0 * ALPHA * (PI * PI); for (i = 0; i < NZ; i++) { ii = i - (i / n32) * NZ; ii2 = ii * ii; for (k = 0; k < NY; k++) { kk = k - (k / n22) * NY; ik2 = ii2 + kk*kk; for (j = 0; j < NX; j++) { jj = j - (j / n12) * NX; twiddle[i][k][j] = exp(ap*(double)(jj*jj + ik2)); } } } if (timers_enabled) timer_stop(13); if (timers_enabled) timer_start(12); compute_initial_conditions(NX, NY, NZ, xnt); if (timers_enabled) timer_stop(12); if (timers_enabled) timer_start(15); fftXYZ(1, NX, NY, NZ, xnt, (dcomplex *)y, exp1, exp2, exp3); if (timers_enabled) timer_stop(15); for (kt = 1; kt <= niter; kt++) { if (timers_enabled) timer_start(11); evolve(NX, NY, NZ, xnt, y, twiddle); if (timers_enabled) timer_stop(11); if (timers_enabled) timer_start(15); fftXYZ(-1, NX, NY, NZ, xnt, (dcomplex *)xnt, exp1, exp2, exp3); if (timers_enabled) timer_stop(15); if (timers_enabled) timer_start(10); CalculateChecksum(&sums[kt], kt, NX, NY, NZ, xnt); if (timers_enabled) timer_stop(10); } // Verification test. if (timers_enabled) timer_start(14); verify(NX, NY, NZ, niter, sums, verified); if (timers_enabled) timer_stop(14); timer_stop(1); *total_time = timer_read(1); if (!timers_enabled) return; printf(" FT subroutine timers \n"); printf(" %26s =%9.4f\n", "FT total ", timer_read(1)); printf(" %26s =%9.4f\n", "WarmUp time ", timer_read(2)); printf(" %26s =%9.4f\n", "fftXYZ body ", timer_read(3)); printf(" %26s =%9.4f\n", "Swarztrauber ", timer_read(4)); printf(" %26s =%9.4f\n", "X time ", timer_read(7)); printf(" %26s =%9.4f\n", "Y time ", timer_read(8)); printf(" %26s =%9.4f\n", "Z time ", timer_read(9)); printf(" %26s =%9.4f\n", "CalculateChecksum ", timer_read(10)); printf(" %26s =%9.4f\n", "evolve ", timer_read(11)); printf(" %26s =%9.4f\n", "compute_initial_conditions", timer_read(12)); printf(" %26s =%9.4f\n", "twiddle ", timer_read(13)); printf(" %26s =%9.4f\n", "verify ", timer_read(14)); printf(" %26s =%9.4f\n", "fftXYZ ", timer_read(15)); printf(" %26s =%9.4f\n", "Benchmark time ", *total_time); }
int main(int argc, char *argv[]) { int i, niter, step; double navg, mflops, n3; double tmax, t, trecs[t_last+1]; logical verified; char Class; char *t_names[t_last+1]; //--------------------------------------------------------------------- // Root node reads input file (if it exists) else takes // defaults from parameters //--------------------------------------------------------------------- FILE *fp; if ((fp = fopen("timer.flag", "r")) != NULL) { timeron = true; t_names[t_total] = "total"; t_names[t_rhsx] = "rhsx"; t_names[t_rhsy] = "rhsy"; t_names[t_rhsz] = "rhsz"; t_names[t_rhs] = "rhs"; t_names[t_xsolve] = "xsolve"; t_names[t_ysolve] = "ysolve"; t_names[t_zsolve] = "zsolve"; t_names[t_rdis1] = "redist1"; t_names[t_rdis2] = "redist2"; t_names[t_add] = "add"; fclose(fp); } else { timeron = false; } printf("\n\n NAS Parallel Benchmarks (NPB3.3-OMP-C) - BT Benchmark\n\n"); if ((fp = fopen("inputbt.data", "r")) != NULL) { int result; printf(" Reading from input file inputbt.data\n"); result = fscanf(fp, "%d", &niter); while (fgetc(fp) != '\n'); result = fscanf(fp, "%lf", &dt); while (fgetc(fp) != '\n'); result = fscanf(fp, "%d%d%d\n", &grid_points[0], &grid_points[1], &grid_points[2]); fclose(fp); } else { printf(" No input file inputbt.data. Using compiled defaults\n"); niter = NITER_DEFAULT; dt = DT_DEFAULT; grid_points[0] = PROBLEM_SIZE; grid_points[1] = PROBLEM_SIZE; grid_points[2] = PROBLEM_SIZE; } printf(" Size: %4dx%4dx%4d\n", grid_points[0], grid_points[1], grid_points[2]); printf(" Iterations: %4d dt: %11.7f\n", niter, dt); printf(" Number of available threads: %5d\n", omp_get_max_threads()); printf("\n"); if ( (grid_points[0] > IMAX) || (grid_points[1] > JMAX) || (grid_points[2] > KMAX) ) { printf(" %d, %d, %d\n", grid_points[0], grid_points[1], grid_points[2]); printf(" Problem size too big for compiled array sizes\n"); return 0; } set_constants(); for (i = 1; i <= t_last; i++) { timer_clear(i); } initialize(); exact_rhs(); //--------------------------------------------------------------------- // do one time step to touch all code, and reinitialize //--------------------------------------------------------------------- adi(); initialize(); for (i = 1; i <= t_last; i++) { timer_clear(i); } timer_start(1); // Do not do inlining, to avoid huge loops, scops are kept separate and are // distributed among files. //#pragma scop for (step = 1; step <= niter; step++) { adi(); } //#pragma endscop timer_stop(1); tmax = timer_read(1); verify(niter, &Class, &verified); n3 = 1.0*grid_points[0]*grid_points[1]*grid_points[2]; navg = (grid_points[0]+grid_points[1]+grid_points[2])/3.0; if(tmax != 0.0) { mflops = 1.0e-6 * (double)niter * (3478.8 * n3 - 17655.7 * (navg*navg) + 28023.7 * navg) / tmax; } else { mflops = 0.0; } print_results("BT", Class, grid_points[0], grid_points[1], grid_points[2], niter, tmax, mflops, " floating point", verified, NPBVERSION,COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, "(none)"); //--------------------------------------------------------------------- // More timers //--------------------------------------------------------------------- if (timeron) { for (i = 1; i <= t_last; i++) { trecs[i] = timer_read(i); } if (tmax == 0.0) tmax = 1.0; printf(" SECTION Time (secs)\n"); for (i = 1; i <= t_last; i++) { printf(" %-8s:%9.3f (%6.2f%%)\n", t_names[i], trecs[i], trecs[i]*100./tmax); if (i == t_rhs) { t = trecs[t_rhsx] + trecs[t_rhsy] + trecs[t_rhsz]; printf(" --> %8s:%9.3f (%6.2f%%)\n", "sub-rhs", t, t*100./tmax); t = trecs[t_rhs] - t; printf(" --> %8s:%9.3f (%6.2f%%)\n", "rest-rhs", t, t*100./tmax); } else if (i==t_zsolve) { t = trecs[t_zsolve] - trecs[t_rdis1] - trecs[t_rdis2]; printf(" --> %8s:%9.3f (%6.2f%%)\n", "sub-zsol", t, t*100./tmax); } else if (i==t_rdis2) { t = trecs[t_rdis1] + trecs[t_rdis2]; printf(" --> %8s:%9.3f (%6.2f%%)\n", "redist", t, t*100./tmax); } } } return 0; }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char *argv[]) { int i; size_t temp, wg_num; cl_int ecode; char *source_dir = "LU"; if (timeron) { timer_clear(TIMER_OPENCL); timer_clear(TIMER_BUILD); timer_clear(TIMER_BUFFER); timer_clear(TIMER_RELEASE); timer_start(TIMER_OPENCL); } if (argc > 1) source_dir = argv[1]; //----------------------------------------------------------------------- // 1. Find the default device type and get a device for the device type //----------------------------------------------------------------------- device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // Device information ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_sizes), &work_item_sizes, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); //////////////////////////////////////////////////////////////////////// // FIXME: The below values are experimental. size_t default_wg_size = 64; if (device_type == CL_DEVICE_TYPE_CPU) { if (CLASS == 'B') default_wg_size = 128; } else { if (CLASS == 'B') default_wg_size = 32; } if (max_work_group_size > default_wg_size) { max_work_group_size = default_wg_size; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > default_wg_size) { work_item_sizes[i] = default_wg_size; } } } if (device_type == CL_DEVICE_TYPE_CPU) { SETBV1_DIM = SETBV1_DIM_CPU; SETBV2_DIM = SETBV2_DIM_CPU; SETBV3_DIM = SETBV3_DIM_CPU; SETIV_DIM = SETIV_DIM_CPU; ERHS1_DIM = ERHS1_DIM_CPU; ERHS2_DIM = ERHS2_DIM_CPU; ERHS3_DIM = ERHS3_DIM_CPU; ERHS4_DIM = ERHS4_DIM_CPU; PINTGR1_DIM = PINTGR1_DIM_CPU; PINTGR2_DIM = PINTGR2_DIM_CPU; PINTGR3_DIM = PINTGR3_DIM_CPU; RHS_DIM = RHS_DIM_CPU; RHSX_DIM = RHSX_DIM_CPU; RHSY_DIM = RHSY_DIM_CPU; RHSZ_DIM = RHSZ_DIM_CPU; SSOR2_DIM = SSOR2_DIM_CPU; SSOR3_DIM = SSOR3_DIM_CPU; } else { SETBV1_DIM = SETBV1_DIM_GPU; SETBV2_DIM = SETBV2_DIM_GPU; SETBV3_DIM = SETBV3_DIM_GPU; SETIV_DIM = SETIV_DIM_GPU; ERHS1_DIM = ERHS1_DIM_GPU; ERHS2_DIM = ERHS2_DIM_GPU; ERHS3_DIM = ERHS3_DIM_GPU; ERHS4_DIM = ERHS4_DIM_GPU; PINTGR1_DIM = PINTGR1_DIM_GPU; PINTGR2_DIM = PINTGR2_DIM_GPU; PINTGR3_DIM = PINTGR3_DIM_GPU; RHS_DIM = RHS_DIM_GPU; RHSX_DIM = RHSX_DIM_GPU; RHSY_DIM = RHSY_DIM_GPU; RHSZ_DIM = RHSZ_DIM_GPU; SSOR2_DIM = SSOR2_DIM_GPU; SSOR3_DIM = SSOR3_DIM_GPU; } //////////////////////////////////////////////////////////////////////// //----------------------------------------------------------------------- // 2. Create a context for the specified device //----------------------------------------------------------------------- context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); //----------------------------------------------------------------------- // 3. Create command queues //----------------------------------------------------------------------- cmd_queue = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); max_pipeline = (jend-jst) < max_compute_units ? (jend-jst) : max_compute_units; pipe_queue = (cl_command_queue *)malloc(sizeof(cl_command_queue) * max_pipeline); for (i = 0; i < max_pipeline; i++) { pipe_queue[i] = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); } //----------------------------------------------------------------------- // 4. Build programs //----------------------------------------------------------------------- if (timeron) timer_start(TIMER_BUILD); char build_option[100]; if (device_type == CL_DEVICE_TYPE_CPU) { sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS); } else { sprintf(build_option, "-I. -DCLASS=\'%c\'", CLASS); } p_pre = clu_MakeProgram(context, device, source_dir, "kernel_pre.cl", build_option); p_main = clu_MakeProgram(context, device, source_dir, (device_type == CL_DEVICE_TYPE_CPU ? "kernel_main_cpu.cl" : "kernel_main_gpu.cl"), build_option); p_post = clu_MakeProgram(context, device, source_dir, "kernel_post.cl", build_option); if (timeron) timer_stop(TIMER_BUILD); //----------------------------------------------------------------------- // 5. Create buffers //----------------------------------------------------------------------- if (timeron) timer_start(TIMER_BUFFER); m_ce = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*5*13, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ce"); m_u = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u"); m_rsd = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rsd"); m_frct = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_frct"); m_qs = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_qs"); m_rho_i = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*(ISIZ3)*(ISIZ2/2*2+1)*(ISIZ1/2*2+1), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rho_i"); // workspace for work-items size_t max_work_items; if (ERHS2_DIM == 1 && ERHS3_DIM == 1 && ERHS4_DIM == 1) { max_work_items = ISIZ3; } else { max_work_items = ISIZ3*ISIZ2; } m_flux = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*ISIZ1*5 * max_work_items, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_flux"); if (RHSZ_DIM == 1) { max_work_items = ISIZ2; } else { max_work_items = ISIZ2*ISIZ1; } if (device_type == CL_DEVICE_TYPE_CPU) { m_utmp = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*ISIZ3*6 * max_work_items, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_utmp"); m_rtmp = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*ISIZ3*5 * max_work_items, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rtmp"); } temp = (nz0-2) / max_compute_units; l2norm_lws[0] = temp == 0 ? 1 : temp; l2norm_gws[0] = clu_RoundWorkSize((size_t)(nz0-2), l2norm_lws[0]); wg_num = l2norm_gws[0] / l2norm_lws[0]; sum_size = sizeof(double) * 5 * wg_num; m_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, sum_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer()"); if (timeron) timer_stop(TIMER_BUFFER); //----------------------------------------------------------------------- // 6. Create kernels //----------------------------------------------------------------------- k_setbv1 = clCreateKernel(p_pre, "setbv1", &ecode); clu_CheckError(ecode, "clCreateKernel() for setbv1"); ecode = clSetKernelArg(k_setbv1, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setbv1, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setbv1, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setbv1, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setbv1, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETBV1_DIM == 3) { setbv1_lws[0] = 5; temp = max_work_group_size / setbv1_lws[0]; setbv1_lws[1] = nx < temp ? nx : temp; temp = temp / setbv1_lws[1]; setbv1_lws[2] = ny < temp ? ny : temp; setbv1_gws[0] = clu_RoundWorkSize((size_t)5, setbv1_lws[0]); setbv1_gws[1] = clu_RoundWorkSize((size_t)nx, setbv1_lws[1]); setbv1_gws[2] = clu_RoundWorkSize((size_t)ny, setbv1_lws[2]); } else if (SETBV1_DIM == 2) { setbv1_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0]; temp = max_work_group_size / setbv1_lws[0]; setbv1_lws[1] = ny < temp ? ny : temp; setbv1_gws[0] = clu_RoundWorkSize((size_t)nx, setbv1_lws[0]); setbv1_gws[1] = clu_RoundWorkSize((size_t)ny, setbv1_lws[1]); } else { temp = ny / max_compute_units; setbv1_lws[0] = temp == 0 ? 1 : temp; setbv1_gws[0] = clu_RoundWorkSize((size_t)ny, setbv1_lws[0]); } k_setbv2 = clCreateKernel(p_pre, "setbv2", &ecode); clu_CheckError(ecode, "clCreateKernel() for setbv2"); ecode = clSetKernelArg(k_setbv2, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setbv2, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setbv2, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setbv2, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setbv2, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETBV2_DIM == 3) { setbv2_lws[0] = 5; temp = max_work_group_size / setbv2_lws[0]; setbv2_lws[1] = nx < temp ? nx : temp; temp = temp / setbv2_lws[1]; setbv2_lws[2] = nz < temp ? nz : temp; setbv2_gws[0] = clu_RoundWorkSize((size_t)5, setbv2_lws[0]); setbv2_gws[1] = clu_RoundWorkSize((size_t)nx, setbv2_lws[1]); setbv2_gws[2] = clu_RoundWorkSize((size_t)nz, setbv2_lws[2]); } else if (SETBV2_DIM == 2) { setbv2_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0]; temp = max_work_group_size / setbv2_lws[0]; setbv2_lws[1] = nz < temp ? nz : temp; setbv2_gws[0] = clu_RoundWorkSize((size_t)nx, setbv2_lws[0]); setbv2_gws[1] = clu_RoundWorkSize((size_t)nz, setbv2_lws[1]); } else { temp = nz / max_compute_units; setbv2_lws[0] = temp == 0 ? 1 : temp; setbv2_gws[0] = clu_RoundWorkSize((size_t)nz, setbv2_lws[0]); } k_setbv3 = clCreateKernel(p_pre, "setbv3", &ecode); clu_CheckError(ecode, "clCreateKernel() for setbv3"); ecode = clSetKernelArg(k_setbv3, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setbv3, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setbv3, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setbv3, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setbv3, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETBV3_DIM == 3) { setbv3_lws[0] = 5; temp = max_work_group_size / setbv3_lws[0]; setbv3_lws[1] = ny < temp ? ny : temp; temp = temp / setbv3_lws[1]; setbv3_lws[2] = nz < temp ? nz : temp; setbv3_gws[0] = clu_RoundWorkSize((size_t)5, setbv3_lws[0]); setbv3_gws[1] = clu_RoundWorkSize((size_t)ny, setbv3_lws[1]); setbv3_gws[2] = clu_RoundWorkSize((size_t)nz, setbv3_lws[2]); } else if (SETBV3_DIM == 2) { setbv3_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0]; temp = max_work_group_size / setbv3_lws[0]; setbv3_lws[1] = nz < temp ? nz : temp; setbv3_gws[0] = clu_RoundWorkSize((size_t)ny, setbv3_lws[0]); setbv3_gws[1] = clu_RoundWorkSize((size_t)nz, setbv3_lws[1]); } else { temp = nz / max_compute_units; setbv3_lws[0] = temp == 0 ? 1 : temp; setbv3_gws[0] = clu_RoundWorkSize((size_t)nz, setbv3_lws[0]); } k_setiv = clCreateKernel(p_pre, "setiv", &ecode); clu_CheckError(ecode, "clCreateKernel() for setiv"); ecode = clSetKernelArg(k_setiv, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_setiv, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_setiv, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_setiv, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_setiv, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SETIV_DIM == 3) { setiv_lws[0] = (nx-2) < work_item_sizes[0] ? (nx-2) : work_item_sizes[0]; temp = max_work_group_size / setiv_lws[0]; setiv_lws[1] = (ny-2) < temp ? (ny-2) : temp; temp = temp / setiv_lws[1]; setiv_lws[2] = (nz-2) < temp ? (nz-2) : temp; setiv_gws[0] = clu_RoundWorkSize((size_t)(nx-2), setiv_lws[0]); setiv_gws[1] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[1]); setiv_gws[2] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[2]); } else if (SETIV_DIM == 2) { setiv_lws[0] = (ny-2) < work_item_sizes[0] ? (ny-2) : work_item_sizes[0]; temp = max_work_group_size / setiv_lws[0]; setiv_lws[1] = (nz-2) < temp ? (nz-2) : temp; setiv_gws[0] = clu_RoundWorkSize((size_t)(ny-2), setiv_lws[0]); setiv_gws[1] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[1]); } else { temp = (nz-2) / max_compute_units; setiv_lws[0] = temp == 0 ? 1 : temp; setiv_gws[0] = clu_RoundWorkSize((size_t)(nz-2), setiv_lws[0]); } k_l2norm = clCreateKernel(p_main, "l2norm", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_l2norm, 1, sizeof(cl_mem), &m_sum); ecode |= clSetKernelArg(k_l2norm, 2, sizeof(double)*5*l2norm_lws[0], NULL); clu_CheckError(ecode, "clSetKernelArg()"); k_rhs = clCreateKernel(p_main, "rhs", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhs"); ecode = clSetKernelArg(k_rhs, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_rhs, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_rhs, 2, sizeof(cl_mem), &m_frct); ecode |= clSetKernelArg(k_rhs, 3, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_rhs, 4, sizeof(cl_mem), &m_rho_i); ecode |= clSetKernelArg(k_rhs, 5, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhs, 6, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhs, 7, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (RHS_DIM == 3) { rhs_lws[0] = nx < work_item_sizes[0] ? nx : work_item_sizes[0]; temp = max_work_group_size / rhs_lws[0]; rhs_lws[1] = ny < temp ? ny : temp; temp = temp / rhs_lws[1]; rhs_lws[2] = nz < temp ? nz : temp; rhs_gws[0] = clu_RoundWorkSize((size_t)nx, rhs_lws[0]); rhs_gws[1] = clu_RoundWorkSize((size_t)ny, rhs_lws[1]); rhs_gws[2] = clu_RoundWorkSize((size_t)nz, rhs_lws[2]); } else if (RHS_DIM == 2) { rhs_lws[0] = ny < work_item_sizes[0] ? ny : work_item_sizes[0]; temp = max_work_group_size / rhs_lws[0]; rhs_lws[1] = nz < temp ? nz : temp; rhs_gws[0] = clu_RoundWorkSize((size_t)ny, rhs_lws[0]); rhs_gws[1] = clu_RoundWorkSize((size_t)nz, rhs_lws[1]); } else { //temp = nz / max_compute_units; temp = 1; rhs_lws[0] = temp == 0 ? 1 : temp; rhs_gws[0] = clu_RoundWorkSize((size_t)nz, rhs_lws[0]); } k_rhsx = clCreateKernel(p_main, "rhsx", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhsx"); ecode = clSetKernelArg(k_rhsx, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_rhsx, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_rhsx, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_rhsx, 3, sizeof(cl_mem), &m_rho_i); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_rhsx, 4, sizeof(cl_mem), &m_flux); ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsx, 7, sizeof(int), &nz); } else { ecode |= clSetKernelArg(k_rhsx, 4, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsx, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsx, 6, sizeof(int), &nz); } clu_CheckError(ecode, "clSetKernelArg()"); if (RHSX_DIM == 2) { rhsx_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / rhsx_lws[0]; rhsx_lws[1] = (nz-2) < temp ? (nz-2) : temp; rhsx_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsx_lws[0]); rhsx_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[1]); } else { //temp = (nz-2) / max_compute_units; temp = 1; rhsx_lws[0] = temp == 0 ? 1 : temp; rhsx_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsx_lws[0]); } k_rhsy = clCreateKernel(p_main, "rhsy", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhsy"); ecode = clSetKernelArg(k_rhsy, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_rhsy, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_rhsy, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_rhsy, 3, sizeof(cl_mem), &m_rho_i); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_rhsy, 4, sizeof(cl_mem), &m_flux); ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsy, 7, sizeof(int), &nz); } else { ecode |= clSetKernelArg(k_rhsy, 4, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsy, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsy, 6, sizeof(int), &nz); } clu_CheckError(ecode, "clSetKernelArg()"); if (RHSY_DIM == 2) { rhsy_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0]; temp = max_work_group_size / rhsy_lws[0]; rhsy_lws[1] = (nz-2) < temp ? (nz-2) : temp; rhsy_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsy_lws[0]); rhsy_gws[1] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[1]); } else { //temp = (nz-2) / max_compute_units; temp = 1; rhsy_lws[0] = temp == 0 ? 1 : temp; rhsy_gws[0] = clu_RoundWorkSize((size_t)(nz-2), rhsy_lws[0]); } k_rhsz = clCreateKernel(p_main, "rhsz", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhsz"); ecode = clSetKernelArg(k_rhsz, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_rhsz, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_rhsz, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_rhsz, 3, sizeof(cl_mem), &m_rho_i); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_rhsz, 4, sizeof(cl_mem), &m_flux); ecode |= clSetKernelArg(k_rhsz, 5, sizeof(cl_mem), &m_utmp); ecode |= clSetKernelArg(k_rhsz, 6, sizeof(cl_mem), &m_rtmp); ecode |= clSetKernelArg(k_rhsz, 7, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsz, 8, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsz, 9, sizeof(int), &nz); } else { ecode |= clSetKernelArg(k_rhsz, 4, sizeof(int), &nx); ecode |= clSetKernelArg(k_rhsz, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_rhsz, 6, sizeof(int), &nz); } clu_CheckError(ecode, "clSetKernelArg()"); if (RHSZ_DIM == 2) { rhsz_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0]; temp = max_work_group_size / rhsz_lws[0]; rhsz_lws[1] = (jend-jst) < temp ? (jend-jst) : temp; rhsz_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), rhsz_lws[0]); rhsz_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[1]); } else { //temp = (jend-jst) / max_compute_units; temp = 1; rhsz_lws[0] = temp == 0 ? 1 : temp; rhsz_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), rhsz_lws[0]); } k_ssor2 = clCreateKernel(p_main, "ssor2", &ecode); clu_CheckError(ecode, "clCreateKernel() for ssor2"); ecode = clSetKernelArg(k_ssor2, 0, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_ssor2, 2, sizeof(int), &nx); ecode |= clSetKernelArg(k_ssor2, 3, sizeof(int), &ny); ecode |= clSetKernelArg(k_ssor2, 4, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SSOR2_DIM == 3) { ssor2_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0]; temp = max_work_group_size / ssor2_lws[0]; ssor2_lws[1] = (jend-jst) < temp ? (jend-jst) : temp; temp = temp / ssor2_lws[1]; ssor2_lws[2] = (nz-2) < temp ? (nz-2) : temp; ssor2_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor2_lws[0]); ssor2_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[1]); ssor2_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[2]); } else if (SSOR2_DIM == 2) { ssor2_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / ssor2_lws[0]; ssor2_lws[1] = (nz-2) < temp ? (nz-2) : temp; ssor2_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor2_lws[0]); ssor2_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[1]); } else { //temp = (nz-2) / max_compute_units; temp = 1; ssor2_lws[0] = temp == 0 ? 1 : temp; ssor2_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor2_lws[0]); } k_ssor3 = clCreateKernel(p_main, "ssor3", &ecode); clu_CheckError(ecode, "clCreateKernel() for ssor3"); ecode = clSetKernelArg(k_ssor3, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_ssor3, 1, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_ssor3, 3, sizeof(int), &nx); ecode |= clSetKernelArg(k_ssor3, 4, sizeof(int), &ny); ecode |= clSetKernelArg(k_ssor3, 5, sizeof(int), &nz); clu_CheckError(ecode, "clSetKernelArg()"); if (SSOR3_DIM == 3) { ssor3_lws[0] = (iend-ist) < work_item_sizes[0] ? (iend-ist) : work_item_sizes[0]; temp = max_work_group_size / ssor3_lws[0]; ssor3_lws[1] = (jend-jst) < temp ? (jend-jst) : temp; temp = temp / ssor3_lws[1]; ssor3_lws[2] = (nz-2) < temp ? (nz-2) : temp; ssor3_gws[0] = clu_RoundWorkSize((size_t)(iend-ist), ssor3_lws[0]); ssor3_gws[1] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[1]); ssor3_gws[2] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[2]); } else if (SSOR3_DIM == 2) { ssor3_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / ssor3_lws[0]; ssor3_lws[1] = (nz-2) < temp ? (nz-2) : temp; ssor3_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), ssor3_lws[0]); ssor3_gws[1] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[1]); } else { //temp = (nz-2) / max_compute_units; temp = 1; ssor3_lws[0] = temp == 0 ? 1 : temp; ssor3_gws[0] = clu_RoundWorkSize((size_t)(nz-2), ssor3_lws[0]); } k_blts = clCreateKernel(p_main, "blts", &ecode); clu_CheckError(ecode, "clCreateKernel() for blts"); ecode = clSetKernelArg(k_blts, 0, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_blts, 1, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_blts, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_blts, 3, sizeof(cl_mem), &m_rho_i); ecode |= clSetKernelArg(k_blts, 4, sizeof(int), &nz); ecode |= clSetKernelArg(k_blts, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_blts, 6, sizeof(int), &nx); clu_CheckError(ecode, "clSetKernelArg()"); blts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / blts_lws[0]; blts_lws[1] = (nz-2) < temp ? (nz-2) : temp; blts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), blts_lws[0]); blts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), blts_lws[1]); k_buts = clCreateKernel(p_main, "buts", &ecode); clu_CheckError(ecode, "clCreateKernel() for buts"); ecode = clSetKernelArg(k_buts, 0, sizeof(cl_mem), &m_rsd); ecode |= clSetKernelArg(k_buts, 1, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_buts, 2, sizeof(cl_mem), &m_qs); ecode |= clSetKernelArg(k_buts, 3, sizeof(cl_mem), &m_rho_i); ecode |= clSetKernelArg(k_buts, 4, sizeof(int), &nz); ecode |= clSetKernelArg(k_buts, 5, sizeof(int), &ny); ecode |= clSetKernelArg(k_buts, 6, sizeof(int), &nx); clu_CheckError(ecode, "clSetKernelArg()"); buts_lws[0] = (jend-jst) < work_item_sizes[0] ? (jend-jst) : work_item_sizes[0]; temp = max_work_group_size / buts_lws[0]; buts_lws[1] = (nz-2) < temp ? (nz-2) : temp; buts_gws[0] = clu_RoundWorkSize((size_t)(jend-jst), buts_lws[0]); buts_gws[1] = clu_RoundWorkSize((size_t)(nz-2), buts_lws[1]); if (timeron) timer_stop(TIMER_OPENCL); }
/***** Task Led(Toggle) *****/ void ledTask() { static uint8 mode = 0; if (timer_isfired(ON_WTD_TIMER_ID)) { wdt_reset(); MDP_NRFSendDimmingReqToMDP(1, 1); timer_clear(ON_WTD_TIMER_ID); timer_set(ON_WTD_TIMER_ID, 100); } if (timer_isfired(ON_TEST_TIMER_ID)) { if (rotary_GetValue() == 0x00) { if (mode == 0) { mode++; lastSendDimmingLevel[0] = 0x34; lastSendDimmingLevel[1] = 0x34; lastSendDimmingLevel[2] = 0x34; lastSendDimmingLevel[3] = 0x34; MDP_SendDimmingReqToMDP(1, 1); // MDP_TESTSendDimmingReqToMDP(0xff, 0x33); } else if (mode == 1) { mode++; lastSendDimmingLevel[0] = 0x67; lastSendDimmingLevel[1] = 0x67; lastSendDimmingLevel[2] = 0x67; lastSendDimmingLevel[3] = 0x67; MDP_SendDimmingReqToMDP(1, 1); // MDP_TESTSendDimmingReqToMDP(0xff, 0x66); } else if (mode == 2) { mode++; lastSendDimmingLevel[0] = 0x9A; lastSendDimmingLevel[1] = 0x9A; lastSendDimmingLevel[2] = 0x9A; lastSendDimmingLevel[3] = 0x9A; MDP_SendDimmingReqToMDP(1, 1); // MDP_TESTSendDimmingReqToMDP(0xff, 0x99); } else if (mode == 3) { mode++; lastSendDimmingLevel[0] = 0xCD; lastSendDimmingLevel[1] = 0xCD; lastSendDimmingLevel[2] = 0xCD; lastSendDimmingLevel[3] = 0xCD; MDP_SendDimmingReqToMDP(1, 1); // MDP_TESTSendDimmingReqToMDP(0xff, 0xcc); } else if (mode == 4) { mode = 0; lastSendDimmingLevel[0] = 0xFE; lastSendDimmingLevel[1] = 0xFE; lastSendDimmingLevel[2] = 0xFE; lastSendDimmingLevel[3] = 0xFE; MDP_SendDimmingReqToMDP(1, 1); // MDP_TESTSendDimmingReqToMDP(0xff, 0xFE); } else { mode = 0; } // MDP_SendSetWatchdogReqToMDP(0); } else { MDP_SendDimmingReqToMDP(1, 1); } timer_clear(ON_TEST_TIMER_ID); timer_set(ON_TEST_TIMER_ID, 3000); } }
/* c This is the serial version of the APP Benchmark 1, c the "embarassingly parallel" benchmark. c c M is the Log_2 of the number of complex pairs of uniform (0, 1) random c numbers. MK is the Log_2 of the size of each batch of uniform random c numbers. MK can be set for convenience on a given system, since it does c not affect the results. */ int main(int argc, char **argv) { double *x, **xx, *q, **qq; double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc; double dum[3] = { 1.0, 1.0, 1.0 }; const int TRANSFER_X = 1; int np, nn, ierr, node, no_nodes, i, l, k, nit, ierrcode, no_large_nodes, np_add, k_offset, j; double loc_x,loc_t1,loc_t2,loc_t3,loc_t4; double loc_a1,loc_a2,loc_x1,loc_x2,loc_z; boolean verified; char size[13+1]; /* character*13 */ /* Allocate working memory */ x = (double*) malloc(sizeof(double) * 2*NK); xx = (double**) malloc(sizeof(double*) * NN); xx[0] = (double*) malloc(sizeof(double) * NN * 2*NK); for (i = 1; i < NN; i++) xx[i] = xx[i-1] + (2*NK); q = (double*) malloc(sizeof(double) * NQ); qq = (double**) malloc(sizeof(double*) * NN); qq[0] = (double*) malloc(sizeof(double) * NN * NQ); for (i = 1; i < NN; i++) qq[i] = qq[i-1] + NQ; /* c Because the size of the problem is too large to store in a 32-bit c integer for some classes, we put it into a string (for printing). c Have to strip off the decimal point put in there by the floating c point print statement (internal file) */ printf("\n\n NAS Parallel Benchmarks 2.3 OpenACC C version" " - EP Benchmark\n"); sprintf(size, "%12.0f", pow(2.0, M+1)); for (j = 13; j >= 1; j--) { if (size[j] == '.') size[j] = ' '; } printf(" Number of random numbers generated: %13s\n", size); verified = FALSE; /* c Compute the number of "batches" of random number pairs generated c per processor. Adjust if the number of processors does not evenly c divide the total number */ np = NN; /* c Call the random number generator functions and initialize c the x-array to reduce the effects of paging on the timings. c Also, call all mathematical functions that are used. Make c sure these initializations cannot be eliminated as dead code. */ #pragma acc data create(qq[0:NN][0:NQ],x[0:2*NK],xx[0:NN][0:2*NK]) \ copyout(q[0:NQ]) { vranlc(0, &(dum[0]), dum[1], &(dum[2])); dum[0] = randlc(&(dum[1]), dum[2]); for (i = 0; i < 2*NK; i++) x[i] = -1.0e99; Mops = log(sqrt(fabs(max(1.0, 1.0)))); timer_clear(1); timer_clear(2); timer_clear(3); timer_start(1); vranlc(0, &t1, A, x); #pragma acc update device(x[0:2*NK]) /* Compute AN = A ^ (2 * NK) (mod 2^46). */ t1 = A; for ( i = 1; i <= MK+1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; gc = 0.0; sx = 0.0; sy = 0.0; #pragma acc parallel loop for (k = 0; k < np; k++) { /* Initialize private q (qq) */ #pragma acc loop for (i = 0; i < NQ; i++) qq[k][i] = 0.0; /* Initialize private x (xx) */ #pragma acc loop for (i = 0; i < 2*NK; i++) xx[k][i] = x[i]; } /* c Each instance of this loop may be performed independently. We compute c the k offsets separately to take into account the fact that some nodes c have more numbers to generate than others */ k_offset = -1; double t1, t2, t3, t4, x1, x2; int kk, i, ik, l; double psx, psy; #pragma acc parallel loop reduction(+:sx,sy) for (k = 1; k <= np; k++) { kk = k_offset + k; t1 = S; t2 = an; /* Find starting seed t1 for this kk. */ #pragma acc loop seq for (i = 1; i <= 100; i++) { ik = kk / 2; if (2 * ik != kk) t3 = RANDLC(&t1, t2); if (ik == 0) break; t3 = RANDLC(&t2, t2); kk = ik; } /* Compute uniform pseudorandom numbers. */ loc_t1 = r23 * A; loc_a1 = (int)loc_t1; loc_a2 = A - t23 * loc_a1; loc_x = t1; #pragma acc loop seq for (i = 1; i <= 2*NK; i++) { loc_t1 = r23 * loc_x; loc_x1 = (int)loc_t1; loc_x2 = loc_x - t23 * loc_x1; loc_t1 = loc_a1 * loc_x2 + loc_a2 * loc_x1; loc_t2 = (int)(r23 * loc_t1); loc_z = loc_t1 - t23 * loc_t2; loc_t3 = t23 * loc_z + loc_a2 * loc_x2; loc_t4 = (int)(r46 * loc_t3); loc_x = loc_t3 - t46 * loc_t4; xx[k-1][i-1] = r46 * loc_x; } t1 = loc_x; /* c Compute Gaussian deviates by acceptance-rejection method and c tally counts in concentric square annuli. This loop is not c vectorizable. */ psx = psy = 0.0; #pragma acc loop reduction(+:psx,psy) for ( i = 0; i < NK; i++) { x1 = 2.0 * xx[k-1][2*i] - 1.0; x2 = 2.0 * xx[k-1][2*i+1] - 1.0; t1 = pow2(x1) + pow2(x2); if (t1 <= 1.0) { t2 = sqrt(-2.0 * log(t1) / t1); t3 = (x1 * t2); /* Xi */ t4 = (x2 * t2); /* Yi */ l = max(fabs(t3), fabs(t4)); qq[k-1][l] += 1.0; /* counts */ psx = psx + t3; /* sum of Xi */ psy = psy + t4; /* sum of Yi */ } } sx += psx; sy += psy; } /* Reduce private qq to q */ #pragma acc parallel loop reduction(+:gc) for ( i = 0; i < NQ; i++ ) { double sumq = 0.0; #pragma acc loop reduction(+:sumq) for (k = 0; k < np; k++) sumq = sumq + qq[k][i]; q[i] = sumq; gc += sumq; } } /* end acc data */ timer_stop(1); tm = timer_read(1); nit = 0; if (M == 24) { if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 25) { if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 28) { if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 30) { if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 32) { if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) && (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) { verified = TRUE; } } Mops = pow(2.0, M+1)/tm/1000000.0; printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n" "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy); for (i = 0; i <= NQ-1; i++) { printf("%3d %15.0f\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); return 0; }
int main( int argc, char **argv ) { int i, iteration, timer_on; double timecounter; FILE *fp; /* Initialize timers */ timer_on = 0; if ((fp = fopen("timer.flag", "r")) != NULL) { fclose(fp); timer_on = 1; } timer_clear( 0 ); if (timer_on) { timer_clear( 1 ); timer_clear( 2 ); timer_clear( 3 ); } if (timer_on) timer_start( 3 ); /* Initialize the verification arrays if a valid class */ for( i=0; i<TEST_ARRAY_SIZE; i++ ) switch( CLASS ) { case 'S': test_index_array[i] = S_test_index_array[i]; test_rank_array[i] = S_test_rank_array[i]; break; case 'A': test_index_array[i] = A_test_index_array[i]; test_rank_array[i] = A_test_rank_array[i]; break; case 'W': test_index_array[i] = W_test_index_array[i]; test_rank_array[i] = W_test_rank_array[i]; break; case 'B': test_index_array[i] = B_test_index_array[i]; test_rank_array[i] = B_test_rank_array[i]; break; case 'C': test_index_array[i] = C_test_index_array[i]; test_rank_array[i] = C_test_rank_array[i]; break; case 'D': test_index_array[i] = D_test_index_array[i]; test_rank_array[i] = D_test_rank_array[i]; break; }; /* Printout initial NPB info */ printf ( "\n\n NAS Parallel Benchmarks (NPB3.3-SER) - IS Benchmark\n\n" ); printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS, CLASS ); printf( " Iterations: %d\n", MAX_ITERATIONS ); if (timer_on) timer_start( 1 ); /* Generate random number sequence and subsequent keys on all procs */ create_seq( 314159265.00, /* Random number gen seed */ 1220703125.00 ); /* Random number gen mult */ if (timer_on) timer_stop( 1 ); /* Do one interation for free (i.e., untimed) to guarantee initialization of all data and code pages and respective tables */ rank( 1 ); /* Start verification counter */ passed_verification = 0; if( CLASS != 'S' ) printf( "\n iteration\n" ); /* Start timer */ timer_start( 0 ); /* This is the main iteration */ for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) { if( CLASS != 'S' ) printf( " %d\n", iteration ); rank( iteration ); } /* End of timing, obtain maximum time of all processors */ timer_stop( 0 ); timecounter = timer_read( 0 ); /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */ if (timer_on) timer_start( 2 ); full_verify(); if (timer_on) timer_stop( 2 ); if (timer_on) timer_stop( 3 ); /* The final printout */ if( passed_verification != 5*MAX_ITERATIONS + 1 ) passed_verification = 0; c_print_results( "IS", CLASS, (int)(TOTAL_KEYS/64), 64, 0, MAX_ITERATIONS, timecounter, ((double) (MAX_ITERATIONS*TOTAL_KEYS)) /timecounter/1000000., "keys ranked", passed_verification, NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC, CFLAGS, CLINKFLAGS ); /* Print additional timers */ if (timer_on) { double t_total, t_percent; t_total = timer_read( 3 ); printf("\nAdditional timers -\n"); printf(" Total execution: %8.3f\n", t_total); if (t_total == 0.0) t_total = 1.0; timecounter = timer_read(1); t_percent = timecounter/t_total * 100.; printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(0); t_percent = timecounter/t_total * 100.; printf(" Benchmarking : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(2); t_percent = timecounter/t_total * 100.; printf(" Sorting : %8.3f (%5.2f%%)\n", timecounter, t_percent); } return 0; /**************************/ } /* E N D P R O G R A M */
int main() { double Mops, t1, t2, t3, t4, x1, x2; double sx, sy, tm, an, tt, gc; double sx_verify_value, sy_verify_value, sx_err, sy_err; int np; int i, ik, kk, l, k, nit; int k_offset, j; logical verified, timers_enabled; double dum[3] = {1.0, 1.0, 1.0}; char size[16]; FILE *fp; if ((fp = fopen("timer.flag", "r")) == NULL) { timers_enabled = false; } else { timers_enabled = true; fclose(fp); } //-------------------------------------------------------------------- // Because the size of the problem is too large to store in a 32-bit // integer for some classes, we put it into a string (for printing). // Have to strip off the decimal point put in there by the floating // point print statement (internal file) //-------------------------------------------------------------------- sprintf(size, "%15.0lf", pow(2.0, M+1)); j = 14; if (size[j] == '.') j--; size[j+1] = '\0'; printf("\n\n NAS Parallel Benchmarks (NPB3.3-SER-C) - EP Benchmark\n"); printf("\n Number of random numbers generated: %15s\n", size); verified = false; //-------------------------------------------------------------------- // Compute the number of "batches" of random number pairs generated // per processor. Adjust if the number of processors does not evenly // divide the total number //-------------------------------------------------------------------- np = NN; //-------------------------------------------------------------------- // Call the random number generator functions and initialize // the x-array to reduce the effects of paging on the timings. // Also, call all mathematical functions that are used. Make // sure these initializations cannot be eliminated as dead code. //-------------------------------------------------------------------- vranlc(0, &dum[0], dum[1], &dum[2]); dum[0] = randlc(&dum[1], dum[2]); for (i = 0; i < 2 * NK; i++) { x[i] = -1.0e99; } Mops = log(sqrt(fabs(MAX(1.0, 1.0)))); timer_clear(0); timer_clear(1); timer_clear(2); timer_start(0); t1 = A; vranlc(0, &t1, A, x); //-------------------------------------------------------------------- // Compute AN = A ^ (2 * NK) (mod 2^46). //-------------------------------------------------------------------- t1 = A; for (i = 0; i < MK + 1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; gc = 0.0; sx = 0.0; sy = 0.0; for (i = 0; i < NQ; i++) { q[i] = 0.0; } //-------------------------------------------------------------------- // Each instance of this loop may be performed independently. We compute // the k offsets separately to take into account the fact that some nodes // have more numbers to generate than others //-------------------------------------------------------------------- k_offset = -1; for (k = 1; k <= np; k++) { kk = k_offset + k; t1 = S; t2 = an; // Find starting seed t1 for this kk. for (i = 1; i <= 100; i++) { ik = kk / 2; if ((2 * ik) != kk) t3 = randlc(&t1, t2); if (ik == 0) break; t3 = randlc(&t2, t2); kk = ik; } //-------------------------------------------------------------------- // Compute uniform pseudorandom numbers. //-------------------------------------------------------------------- if (timers_enabled) timer_start(2); vranlc(2 * NK, &t1, A, x); if (timers_enabled) timer_stop(2); //-------------------------------------------------------------------- // Compute Gaussian deviates by acceptance-rejection method and // tally counts in concentri//square annuli. This loop is not // vectorizable. //-------------------------------------------------------------------- if (timers_enabled) timer_start(1); for (i = 0; i < NK; i++) { x1 = 2.0 * x[2*i] - 1.0; x2 = 2.0 * x[2*i+1] - 1.0; t1 = x1 * x1 + x2 * x2; if (t1 <= 1.0) { t2 = sqrt(-2.0 * log(t1) / t1); t3 = (x1 * t2); t4 = (x2 * t2); l = MAX(fabs(t3), fabs(t4)); q[l] = q[l] + 1.0; sx = sx + t3; sy = sy + t4; } } if (timers_enabled) timer_stop(1); } for (i = 0; i < NQ; i++) { gc = gc + q[i]; } timer_stop(0); tm = timer_read(0); nit = 0; verified = true; if (M == 24) { sx_verify_value = -3.247834652034740e+3; sy_verify_value = -6.958407078382297e+3; } else if (M == 25) { sx_verify_value = -2.863319731645753e+3; sy_verify_value = -6.320053679109499e+3; } else if (M == 28) { sx_verify_value = -4.295875165629892e+3; sy_verify_value = -1.580732573678431e+4; } else if (M == 30) { sx_verify_value = 4.033815542441498e+4; sy_verify_value = -2.660669192809235e+4; } else if (M == 32) { sx_verify_value = 4.764367927995374e+4; sy_verify_value = -8.084072988043731e+4; } else if (M == 36) { sx_verify_value = 1.982481200946593e+5; sy_verify_value = -1.020596636361769e+5; } else if (M == 40) { sx_verify_value = -5.319717441530e+05; sy_verify_value = -3.688834557731e+05; } else { verified = false; } if (verified) { sx_err = fabs((sx - sx_verify_value) / sx_verify_value); sy_err = fabs((sy - sy_verify_value) / sy_verify_value); verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON)); } Mops = pow(2.0, M+1) / tm / 1000000.0; printf("\nEP Benchmark Results:\n\n"); printf("CPU Time =%10.4lf\n", tm); printf("N = 2^%5d\n", M); printf("No. Gaussian Pairs = %15.0lf\n", gc); printf("Sums = %25.15lE %25.15lE\n", sx, sy); printf("Counts: \n"); for (i = 0; i < NQ; i++) { printf("%3d%15.0lf\n", i, q[i]); } print_results("EP", CLASS, M+1, 0, 0, nit, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); if (timers_enabled) { if (tm <= 0.0) tm = 1.0; tt = timer_read(0); printf("\nTotal time: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); tt = timer_read(1); printf("Gaussian pairs: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); tt = timer_read(2); printf("Random numbers: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); } return 0; }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char *argv[]) { size_t temp; cl_int ecode; char *source_dir = "FT"; if (argc > 1) source_dir = argv[1]; #ifdef TIMER_DETAIL if (timers_enabled) { int i; for (i = T_OPENCL_API; i < T_END; i++) timer_clear(i); } #endif DTIMER_START(T_OPENCL_API); // 1. Find the default device type and get a device for the device type device_type = clu_GetDefaultDeviceType(); device = clu_GetAvailableDevice(device_type); device_name = clu_GetDeviceName(device); // Device information ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_sizes), &work_item_sizes, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); // FIXME: The below values are experimental. if (max_work_group_size > 64) { max_work_group_size = 64; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > 64) { work_item_sizes[i] = 64; } } } ecode = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); clu_CheckError(ecode, "clGetDiviceInfo()"); // 2. Create a context for the specified device context = clCreateContext(NULL, 1, &device, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); // 3. Create a command queue cmd_queue = clCreateCommandQueue(context, device, 0, &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); DTIMER_STOP(T_OPENCL_API); // 4. Build the program DTIMER_START(T_BUILD); char *source_file; char build_option[50]; if (device_type == CL_DEVICE_TYPE_CPU) { source_file = "ft_cpu.cl"; sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU", CLASS); COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_CPU; EVOLVE_DIM = EVOLVE_DIM_CPU; CFFTS_DIM = CFFTS_DIM_CPU; } else if (device_type == CL_DEVICE_TYPE_GPU) { char vendor[50]; ecode = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 50, vendor, NULL); clu_CheckError(ecode, "clGetDeviceInfo()"); if (strncmp(vendor, DEV_VENDOR_NVIDIA, strlen(DEV_VENDOR_NVIDIA)) == 0) { source_file = "ft_gpu_nvidia.cl"; CFFTS_LSIZE = 32; } else { source_file = "ft_gpu.cl"; CFFTS_LSIZE = 64; } sprintf(build_option, "-I. -DCLASS=\'%c\' -DLSIZE=%lu", CLASS, CFFTS_LSIZE); COMPUTE_IMAP_DIM = COMPUTE_IMAP_DIM_GPU; EVOLVE_DIM = EVOLVE_DIM_GPU; CFFTS_DIM = CFFTS_DIM_GPU; } else { fprintf(stderr, "Set the environment variable OPENCL_DEVICE_TYPE!\n"); exit(EXIT_FAILURE); } program = clu_MakeProgram(context, device, source_dir, source_file, build_option); DTIMER_STOP(T_BUILD); // 5. Create buffers DTIMER_START(T_BUFFER_CREATE); m_u = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(dcomplex) * NXP, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u"); m_u0 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(dcomplex) * NTOTALP, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u0"); m_u1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(dcomplex) * NTOTALP, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u1"); m_twiddle = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double) * NTOTALP, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_twiddle"); if (device_type == CL_DEVICE_TYPE_CPU) { size_t ty1_size, ty2_size; if (CFFTS_DIM == 2) { ty1_size = sizeof(dcomplex) * NX * NY * NZ; ty2_size = sizeof(dcomplex) * NX * NY * NZ; } else { fprintf(stderr, "Wrong CFFTS_DIM: %u\n", CFFTS_DIM); exit(EXIT_FAILURE); } m_ty1 = clCreateBuffer(context, CL_MEM_READ_WRITE, ty1_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ty1"); m_ty2 = clCreateBuffer(context, CL_MEM_READ_WRITE, ty2_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ty2"); } if (device_type == CL_DEVICE_TYPE_CPU) { temp = 1024 / max_compute_units; checksum_local_ws = temp == 0 ? 1 : temp; checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws); } else if (device_type == CL_DEVICE_TYPE_GPU) { checksum_local_ws = 32; checksum_global_ws = clu_RoundWorkSize((size_t)1024, checksum_local_ws); } checksum_wg_num = checksum_global_ws / checksum_local_ws; m_chk = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(dcomplex) * checksum_wg_num, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_chk"); g_chk = (dcomplex *)malloc(sizeof(dcomplex) * checksum_wg_num); DTIMER_STOP(T_BUFFER_CREATE); // 6. Create kernels DTIMER_START(T_OPENCL_API); double ap = -4.0 * ALPHA * PI * PI; int d1 = dims[0]; int d2 = dims[1]; int d3 = dims[2]; k_compute_indexmap = clCreateKernel(program, "compute_indexmap", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_indexmap"); ecode = clSetKernelArg(k_compute_indexmap, 0, sizeof(cl_mem), &m_twiddle); ecode |= clSetKernelArg(k_compute_indexmap, 1, sizeof(int), &d1); ecode |= clSetKernelArg(k_compute_indexmap, 2, sizeof(int), &d2); ecode |= clSetKernelArg(k_compute_indexmap, 3, sizeof(int), &d3); ecode |= clSetKernelArg(k_compute_indexmap, 4, sizeof(double), &ap); clu_CheckError(ecode, "clSetKernelArg() for compute_indexmap"); if (COMPUTE_IMAP_DIM == 3) { cimap_lws[0] = d1 < work_item_sizes[0] ? d1 : work_item_sizes[0]; temp = max_work_group_size / cimap_lws[0]; cimap_lws[1] = d2 < temp ? d2 : temp; temp = temp / cimap_lws[1]; cimap_lws[2] = d3 < temp ? d3 : temp; cimap_gws[0] = clu_RoundWorkSize((size_t)d1, cimap_lws[0]); cimap_gws[1] = clu_RoundWorkSize((size_t)d2, cimap_lws[1]); cimap_gws[2] = clu_RoundWorkSize((size_t)d3, cimap_lws[2]); } else if (COMPUTE_IMAP_DIM == 2) { cimap_lws[0] = d2 < work_item_sizes[0] ? d2 : work_item_sizes[0]; temp = max_work_group_size / cimap_lws[0]; cimap_lws[1] = d3 < temp ? d3 : temp; cimap_gws[0] = clu_RoundWorkSize((size_t)d2, cimap_lws[0]); cimap_gws[1] = clu_RoundWorkSize((size_t)d3, cimap_lws[1]); } else { //temp = d3 / max_compute_units; temp = 1; cimap_lws[0] = temp == 0 ? 1 : temp; cimap_gws[0] = clu_RoundWorkSize((size_t)d3, cimap_lws[0]); } k_compute_ics = clCreateKernel(program, "compute_initial_conditions", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_initial_conditions"); ecode = clSetKernelArg(k_compute_ics, 2, sizeof(int), &d1); ecode |= clSetKernelArg(k_compute_ics, 3, sizeof(int), &d2); ecode |= clSetKernelArg(k_compute_ics, 4, sizeof(int), &d3); clu_CheckError(ecode, "clSetKernelArg() for compute_initial_conditions"); k_cffts1 = clCreateKernel(program, "cffts1", &ecode); clu_CheckError(ecode, "clCreateKernel() for cffts1"); ecode = clSetKernelArg(k_cffts1, 2, sizeof(cl_mem), &m_u); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_cffts1, 8, sizeof(cl_mem), &m_ty1); ecode |= clSetKernelArg(k_cffts1, 9, sizeof(cl_mem), &m_ty2); } clu_CheckError(ecode, "clSetKernelArg() for k_cffts1"); k_cffts2 = clCreateKernel(program, "cffts2", &ecode); clu_CheckError(ecode, "clCreateKernel() for cffts2"); ecode = clSetKernelArg(k_cffts2, 2, sizeof(cl_mem), &m_u); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_cffts2, 8, sizeof(cl_mem), &m_ty1); ecode |= clSetKernelArg(k_cffts2, 9, sizeof(cl_mem), &m_ty2); } clu_CheckError(ecode, "clSetKernelArg() for k_cffts2"); k_cffts3 = clCreateKernel(program, "cffts3", &ecode); clu_CheckError(ecode, "clCreateKernel() for cffts3"); ecode = clSetKernelArg(k_cffts3, 2, sizeof(cl_mem), &m_u); if (device_type == CL_DEVICE_TYPE_CPU) { ecode |= clSetKernelArg(k_cffts3, 8, sizeof(cl_mem), &m_ty1); ecode |= clSetKernelArg(k_cffts3, 9, sizeof(cl_mem), &m_ty2); } clu_CheckError(ecode, "clSetKernelArg() for k_cffts3"); k_evolve = clCreateKernel(program, "evolve", &ecode); clu_CheckError(ecode, "clCreateKernel() for evolve"); k_checksum = clCreateKernel(program, "checksum", &ecode); clu_CheckError(ecode, "clCreateKernel() for checksum"); ecode = clSetKernelArg(k_checksum, 1, sizeof(cl_mem), &m_chk); ecode |= clSetKernelArg(k_checksum, 2, sizeof(dcomplex)*checksum_local_ws, NULL); ecode |= clSetKernelArg(k_checksum, 3, sizeof(int), &dims[0]); ecode |= clSetKernelArg(k_checksum, 4, sizeof(int), &dims[1]); clu_CheckError(ecode, "clSetKernelArg() for checksum"); DTIMER_STOP(T_OPENCL_API); }
int main(int argc, char** argv ) { int i, iteration, itemp; int nthreads = 1; double timecounter, maxtime; /* Initialize the verification arrays if a valid class */ for( i=0; i<TEST_ARRAY_SIZE; i++ ) switch( CLASS ) { case 'S': test_index_array[i] = S_test_index_array[i]; test_rank_array[i] = S_test_rank_array[i]; break; case 'A': test_index_array[i] = A_test_index_array[i]; test_rank_array[i] = A_test_rank_array[i]; break; case 'W': test_index_array[i] = W_test_index_array[i]; test_rank_array[i] = W_test_rank_array[i]; break; case 'B': test_index_array[i] = B_test_index_array[i]; test_rank_array[i] = B_test_rank_array[i]; break; case 'C': test_index_array[i] = C_test_index_array[i]; test_rank_array[i] = C_test_rank_array[i]; break; }; /* Printout initial NPB info */ printf( "\n\n NAS Parallel Benchmarks 2.3 OpenMP C version" " - IS Benchmark\n\n" ); printf( " Size: %d (class %c)\n", TOTAL_KEYS, CLASS ); printf( " Iterations: %d\n", MAX_ITERATIONS ); /* Initialize timer */ timer_clear( 0 ); /* Generate random number sequence and subsequent keys on all procs */ create_seq( 314159265.00, /* Random number gen seed */ 1220703125.00 ); /* Random number gen mult */ /* Do one interation for free (i.e., untimed) to guarantee initialization of all data and code pages and respective tables */ #pragma omp parallel rank( 1 ); /* Start verification counter */ passed_verification = 0; if( CLASS != 'S' ) printf( "\n iteration\n" ); /* Start timer */ timer_start( 0 ); /* This is the main iteration */ #pragma omp parallel private(iteration) for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) { #pragma omp master if( CLASS != 'S' ) printf( " %d\n", iteration ); rank( iteration ); #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* End of timing, obtain maximum time of all processors */ timer_stop( 0 ); timecounter = timer_read( 0 ); /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */ full_verify(); /* The final printout */ if( passed_verification != 5*MAX_ITERATIONS + 1 ) passed_verification = 0; c_print_results( "IS", CLASS, TOTAL_KEYS, 0, 0, MAX_ITERATIONS, nthreads, timecounter, ((double) (MAX_ITERATIONS*TOTAL_KEYS)) /timecounter/1000000., "keys ranked", passed_verification, NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC, CFLAGS, CLINKFLAGS, "randlc2"); return 0; /**************************/ } /* E N D P R O G R A M */
int main( int argc, char **argv ) { MPI_Init(&argc,&argv); INT_TYPE chunk; int ini, fim; int i, j, iteration, timer_on; double timecounter; FILE *fp; int myrank; MPI_Status st; MPI_Comm_rank(MPI_COMM_WORLD,&myrank); MPI_Comm_size(MPI_COMM_WORLD,&NUM_THREADS); if (myrank == 0) { /* Initialize timers */ timer_on = 0; if ((fp = fopen("timer.flag", "r")) != NULL) { fclose(fp); timer_on = 1; } timer_clear( 0 ); if (timer_on) { timer_clear( 1 ); timer_clear( 2 ); timer_clear( 3 ); } if (timer_on) timer_start( 3 ); /* Initialize the verification arrays if a valid class */ for( i=0; i<TEST_ARRAY_SIZE; i++ ) switch( CLASS ) { case 'S': test_index_array[i] = S_test_index_array[i]; test_rank_array[i] = S_test_rank_array[i]; break; case 'A': test_index_array[i] = A_test_index_array[i]; test_rank_array[i] = A_test_rank_array[i]; break; case 'W': test_index_array[i] = W_test_index_array[i]; test_rank_array[i] = W_test_rank_array[i]; break; case 'B': test_index_array[i] = B_test_index_array[i]; test_rank_array[i] = B_test_rank_array[i]; break; case 'C': test_index_array[i] = C_test_index_array[i]; test_rank_array[i] = C_test_rank_array[i]; break; case 'D': test_index_array[i] = D_test_index_array[i]; test_rank_array[i] = D_test_rank_array[i]; break; }; /* Printout initial NPB info */ printf ( "\n\n NAS Parallel Benchmarks (NPB3.3-SER) - IS Benchmark\n\n" ); printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS, CLASS ); printf( " Number of available threads: %d\n", NUM_THREADS ); printf( " Iterations: %d\n", MAX_ITERATIONS ); if (timer_on) timer_start( 1 ); } R23 = pow(2, -23); T23 = pow(2, 23); R46 = pow(2, -46); T46 = pow(2, 46); /* Generate random number sequence and subsequent keys on all procs */ create_seq(myrank); if (myrank == 0) { // sincronizar resultados for (i = 1; i < NUM_THREADS; i++) { chunk = (NUM_KEYS + NUM_THREADS - 1) / NUM_THREADS; ini = chunk * i; fim = ini + chunk; if ( fim > NUM_KEYS ) { fim = NUM_KEYS; } MPI_Recv( &aux_key_array[ini], (fim - ini), MPI_INT, i, 0, MPI_COMM_WORLD, &st ); for (j = ini; j < fim; j++) { key_array[j] = aux_key_array[j]; } } } else { chunk = (NUM_KEYS + NUM_THREADS - 1) / NUM_THREADS; ini = chunk * myrank; fim = ini + chunk; if ( fim > NUM_KEYS ) { fim = NUM_KEYS; } // enviar resultados MPI_Send( &key_array[ini], (fim - ini), MPI_INT, 0, 0, MPI_COMM_WORLD ); } if (myrank == 0) { if (timer_on) { timer_stop( 1 ); } /* Do one interation for free (i.e., untimed) to guarantee initialization of all data and code pages and respective tables */ rank( 1 ); /* Start verification counter */ passed_verification = 0; if( CLASS != 'S' ) printf( "\n iteration\n" ); /* Start timer */ timer_start( 0 ); /* This is the main iteration */ for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) { if( CLASS != 'S' ) printf( " %d\n", iteration ); rank( iteration ); } /* End of timing, obtain maximum time of all processors */ timer_stop( 0 ); timecounter = timer_read( 0 ); /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */ if (timer_on) timer_start( 2 ); full_verify(); if (timer_on) timer_stop( 2 ); if (timer_on) timer_stop( 3 ); /* The final printout */ if( passed_verification != 5*MAX_ITERATIONS + 1 ) passed_verification = 0; c_print_results( "IS", CLASS, (int)(TOTAL_KEYS/64), 64, 0, MAX_ITERATIONS, timecounter, ((double) (MAX_ITERATIONS*TOTAL_KEYS)) /timecounter/1000000., "keys ranked", passed_verification, NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC, CFLAGS, CLINKFLAGS ); /* Print additional timers */ if (timer_on) { double t_total, t_percent; t_total = timer_read( 3 ); printf("\nAdditional timers -\n"); printf(" Total execution: %8.3f\n", t_total); if (t_total == 0.0) t_total = 1.0; timecounter = timer_read(1); t_percent = timecounter/t_total * 100.; printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(0); t_percent = timecounter/t_total * 100.; printf(" Benchmarking : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(2); t_percent = timecounter/t_total * 100.; printf(" Sorting : %8.3f (%5.2f%%)\n", timecounter, t_percent); } } MPI_Finalize(); return 0; /**************************/ } /* E N D P R O G R A M */
int main(int argc, char *argv[]) { int step, ie, iside, i, j, k; double mflops, tmax, nelt_tot = 0.0; char Class; logical ifmortar = false, verified; double t2, trecs[t_last+1]; char *t_names[t_last+1]; //-------------------------------------------------------------------- // Initialize NUMA control //-------------------------------------------------------------------- numa_initialize_env(NUMA_MIGRATE_EXISTING); //--------------------------------------------------------------------- // Read input file (if it exists), else take // defaults from parameters //--------------------------------------------------------------------- FILE *fp; if ((fp = fopen("timer.flag", "r")) != NULL) { timeron = true; t_names[t_total] = "total"; t_names[t_init] = "init"; t_names[t_convect] = "convect"; t_names[t_transfb_c] = "transfb_c"; t_names[t_diffusion] = "diffusion"; t_names[t_transf] = "transf"; t_names[t_transfb] = "transfb"; t_names[t_adaptation] = "adaptation"; t_names[t_transf2] = "transf+b"; t_names[t_add2] = "add2"; fclose(fp); } else { timeron = false; } printf("\n\n NAS Parallel Benchmarks (NPB3.3-OMP-C) - UA Benchmark\n\n"); if ((fp = fopen("inputua.data", "r")) != NULL) { int result; printf(" Reading from input file inputua.data\n"); result = fscanf(fp, "%d", &fre); while (fgetc(fp) != '\n'); result = fscanf(fp, "%d", &niter); while (fgetc(fp) != '\n'); result = fscanf(fp, "%d", &nmxh); while (fgetc(fp) != '\n'); result = fscanf(fp, "%lf", &alpha); Class = 'U'; fclose(fp); } else { printf(" No input file inputua.data. Using compiled defaults\n"); fre = FRE_DEFAULT; niter = NITER_DEFAULT; nmxh = NMXH_DEFAULT; alpha = ALPHA_DEFAULT; Class = CLASS_DEFAULT; } dlmin = pow(0.5, REFINE_MAX); dtime = 0.04*dlmin; printf(" Levels of refinement: %8d\n", REFINE_MAX); printf(" Adaptation frequency: %8d\n", fre); printf(" Time steps: %8d dt: %15.6E\n", niter, dtime); printf(" CG iterations: %8d\n", nmxh); printf(" Heat source radius: %8.4f\n", alpha); printf(" Number of available threads: %8d\n", omp_get_max_threads()); printf("\n"); top_constants(); for (i = 1; i <= t_last; i++) { timer_clear(i); } if (timeron) timer_start(t_init); // set up initial mesh (single element) and solution (all zero) create_initial_grid(); r_init_omp((double *)ta1, ntot, 0.0); nr_init_omp((int *)sje, 4*6*nelt, -1); init_locks(); // compute tables of coefficients and weights coef(); geom1(); // compute the discrete laplacian operators setdef(); // prepare for the preconditioner setpcmo_pre(); // refine initial mesh and do some preliminary work time = 0.0; mortar(); prepwork(); adaptation(&ifmortar, 0); if (timeron) timer_stop(t_init); timer_clear(1); time = 0.0; for (step = 0; step <= niter; step++) { if (step == 1) { // reset the solution and start the timer, keep track of total no elms r_init((double *)ta1, ntot, 0.0); time = 0.0; nelt_tot = 0.0; for (i = 1; i <= t_last; i++) { if (i != t_init) timer_clear(i); } timer_start(1); } // advance the convection step convect(ifmortar); if (timeron) timer_start(t_transf2); // prepare the intital guess for cg transf(tmort, (double *)ta1); // compute residual for diffusion term based on intital guess // compute the left hand side of equation, lapacian t #pragma omp parallel default(shared) private(ie,k,j,i) { #pragma omp for for (ie = 0; ie < nelt; ie++) { laplacian(ta2[ie], ta1[ie], size_e[ie]); } // compute the residual #pragma omp for for (ie = 0; ie < nelt; ie++) { for (k = 0; k < LX1; k++) { for (j = 0; j < LX1; j++) { for (i = 0; i < LX1; i++) { trhs[ie][k][j][i] = trhs[ie][k][j][i] - ta2[ie][k][j][i]; } } } } } //end parallel // get the residual on mortar transfb(rmor, (double *)trhs); if (timeron) timer_stop(t_transf2); // apply boundary condition: zero out the residual on domain boundaries // apply boundary conidtion to trhs #pragma omp parallel for default(shared) private(ie,iside) for (ie = 0; ie < nelt; ie++) { for (iside = 0; iside < NSIDES; iside++) { if (cbc[ie][iside] == 0) { facev(trhs[ie], iside, 0.0); } } } // apply boundary condition to rmor col2(rmor, tmmor, nmor); // call the conjugate gradient iterative solver diffusion(ifmortar); // add convection and diffusion if (timeron) timer_start(t_add2); add2((double *)ta1, (double *)t, ntot); if (timeron) timer_stop(t_add2); // perform mesh adaptation time = time + dtime; if ((step != 0) && (step/fre*fre == step)) { if (step != niter) { adaptation(&ifmortar, step); } } else { ifmortar = false; } nelt_tot = nelt_tot + (double)(nelt); } timer_stop(1); tmax = timer_read(1); verify(&Class, &verified); // compute millions of collocation points advanced per second. // diffusion: nmxh advancements, convection: 1 advancement mflops = nelt_tot*(double)(LX1*LX1*LX1*(nmxh+1))/(tmax*1.e6); print_results("UA", Class, REFINE_MAX, 0, 0, niter, tmax, mflops, " coll. point advanced", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, "(none)"); //--------------------------------------------------------------------- // More timers //--------------------------------------------------------------------- if (timeron) { for (i = 1; i <= t_last; i++) { trecs[i] = timer_read(i); } if (tmax == 0.0) tmax = 1.0; printf(" SECTION Time (secs)\n"); for (i = 1; i <= t_last; i++) { printf(" %-10s:%9.3f (%6.2f%%)\n", t_names[i], trecs[i], trecs[i]*100./tmax); if (i == t_transfb_c) { t2 = trecs[t_convect] - trecs[t_transfb_c]; printf(" --> %11s:%9.3f (%6.2f%%)\n", "sub-convect", t2, t2*100./tmax); } else if (i == t_transfb) { t2 = trecs[t_diffusion] - trecs[t_transf] - trecs[t_transfb]; printf(" --> %11s:%9.3f (%6.2f%%)\n", "sub-diffuse", t2, t2*100./tmax); } } } //-------------------------------------------------------------------- // Teardown NUMA control //-------------------------------------------------------------------- numa_shutdown(); return 0; }
//--------------------------------------------------------------------- // Set up the OpenCL environment. //--------------------------------------------------------------------- static void setup_opencl(int argc, char **argv) { int i, c; // size_t temp; cl_int ecode = 0; char *source_dir = "."; //FIXME int num_subs = DEFAULT_NUM_SUBS; int num_cus; int sqrt_num_command_queues; if (argc > 1) source_dir = argv[1]; devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_subs); if (timeron) { timer_clear(TIMER_OPENCL); timer_clear(TIMER_BUILD); timer_clear(TIMER_BUFFER); timer_clear(TIMER_RELEASE); timer_start(TIMER_OPENCL); } // 1. Find the default device type and get a device for the device type // Then, create sub-devices from the parent device. //device_type = CL_DEVICE_TYPE_CPU; device_type = CL_DEVICE_TYPE_ALL; //device_type = CL_DEVICE_TYPE_GPU; if(argc <= 2) { printf("Device type argument missing!\n"); exit(-1); } char *device_type_str = argv[2]; if(strcmp(device_type_str, "CPU") == 0 || strcmp(device_type_str, "cpu") == 0) { device_type = CL_DEVICE_TYPE_CPU; } else if(strcmp(device_type_str, "GPU") == 0 || strcmp(device_type_str, "gpu") == 0) { device_type = CL_DEVICE_TYPE_GPU; } else if(strcmp(device_type_str, "ALL") == 0 || strcmp(device_type_str, "all") == 0) { device_type = CL_DEVICE_TYPE_ALL; } else { printf("Unsupported device type!\n"); exit(-1); } cl_uint num_command_queues = 4; char *num_command_queues_str = getenv("SNU_NPB_COMMAND_QUEUES"); if(num_command_queues_str != NULL) num_command_queues = atoi(num_command_queues_str); cl_platform_id platform; ecode = clGetPlatformIDs(1, &platform, NULL); clu_CheckError(ecode, "clGetPlatformIDs()"); ecode = clGetDeviceIDs(platform, device_type, 0, NULL, &num_devices); clu_CheckError(ecode, "clGetDeviceIDs()"); //num_devices = 2; ecode = clGetDeviceIDs(platform, device_type, num_devices, devices, NULL); clu_CheckError(ecode, "clGetDeviceIDs()"); cl_device_id tmp_dev; work_item_sizes[0] = work_item_sizes[1] = work_item_sizes[2] = 1024; max_work_group_size = 1024; max_compute_units = 22; sqrt_num_command_queues = (int)(sqrt((double)(num_command_queues) + 0.00001)); if (num_command_queues != sqrt_num_command_queues * sqrt_num_command_queues) { fprintf(stderr, "Number of devices is not a square of some integer\n"); exit(EXIT_FAILURE); } ncells = (int)(sqrt((double)(num_command_queues) + 0.00001)); MAX_CELL_DIM = ((PROBLEM_SIZE/ncells)+1); IMAX = MAX_CELL_DIM; JMAX = MAX_CELL_DIM; KMAX = MAX_CELL_DIM; IMAXP = (IMAX/2*2+1); JMAXP = (JMAX/2*2+1); //--------------------------------------------------------------------- // +1 at end to avoid zero length arrays for 1 node //--------------------------------------------------------------------- BUF_SIZE = (MAX_CELL_DIM*MAX_CELL_DIM*(MAXCELLS-1)*60*2+1); // FIXME if (max_work_group_size > 64) { max_work_group_size = 64; int i; for (i = 0; i < 3; i++) { if (work_item_sizes[i] > 64) { work_item_sizes[i] = 64; } } } // 2. Create a context for devices #ifdef MINIMD_SNUCL_OPTIMIZATIONS cl_context_properties props[5] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, CL_CONTEXT_SCHEDULER, CL_CONTEXT_SCHEDULER_CODE_SEGMENTED_PERF_MODEL, //CL_CONTEXT_SCHEDULER_PERF_MODEL, //CL_CONTEXT_SCHEDULER_FIRST_EPOCH_BASED_PERF_MODEL, //CL_CONTEXT_SCHEDULER_ALL_EPOCH_BASED_PERF_MODEL, 0 }; context = clCreateContext(props, #elif defined(SOCL_OPTIMIZATIONS) cl_context_properties props[5] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, CL_CONTEXT_SCHEDULER_SOCL, "dmda", //"random", 0 }; context = clCreateContext(props, #else context = clCreateContext(NULL, #endif num_devices, devices, NULL, NULL, &ecode); clu_CheckError(ecode, "clCreateContext()"); // 3. Create a command queue cmd_queue = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_command_queues*3); for (i = 0; i < num_command_queues * 2; i++) { //cmd_queue[i] = clCreateCommandQueue(context, devices[(i / 2) % num_devices], #ifdef SOCL_OPTIMIZATIONS cmd_queue[i] = clCreateCommandQueue(context, NULL, #else cmd_queue[i] = clCreateCommandQueue(context, devices[num_devices - 1 - ((i / 2) % num_devices)], #endif // cmd_queue[i] = clCreateCommandQueue(context, devices[0], #ifdef MINIMD_SNUCL_OPTIMIZATIONS 0, // CL_QUEUE_AUTO_DEVICE_SELECTION | // CL_QUEUE_ITERATIVE, //CL_QUEUE_COMPUTE_INTENSIVE, #else 0, #endif &ecode); clu_CheckError(ecode, "clCreateCommandQueue()"); } // 4. Build the program if (timeron) timer_start(TIMER_BUILD); char *source_file = "sp_kernel.cl"; //p_program = clu_MakeProgram(context, devices, source_dir, source_file, build_option); p_program = clu_CreateProgram(context, source_dir, source_file); for(i = 0; i < num_devices; i++) { char build_option[200] = {0}; cl_device_type cur_device_type; cl_int err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &cur_device_type, NULL); clu_CheckError(err, "clGetDeviceInfo()"); if (cur_device_type == CL_DEVICE_TYPE_CPU) { sprintf(build_option, "-I. -DCLASS=%d -DUSE_CPU -DMAX_CELL_DIM=%d -DIMAX=%d -DJMAX=%d -DKMAX=%d -DIMAXP=%d -DJMAXP=%d", CLASS, MAX_CELL_DIM, IMAX, JMAX, KMAX, IMAXP, JMAXP); } else { sprintf(build_option, "-I. -DCLASS=%d -DUSE_GPU -DMAX_CELL_DIM=%d -DIMAX=%d -DJMAX=%d -DKMAX=%d -DIMAXP=%d -DJMAXP=%d", CLASS, MAX_CELL_DIM, IMAX, JMAX, KMAX, IMAXP, JMAXP); } clu_MakeProgram(p_program, 1, &devices[i], source_dir, build_option); //clu_MakeProgram(p_program, num_devices, devices, source_dir, build_option); } num_devices = num_command_queues; program = (cl_program *)malloc(sizeof(cl_program) * num_devices); for (i = 0; i < num_devices; i++) { program[i] = p_program; } if (timeron) timer_stop(TIMER_BUILD); // 5. Create kernels size_t asize = sizeof(cl_kernel) * num_devices; k_initialize1 = (cl_kernel *)malloc(asize); k_initialize2 = (cl_kernel *)malloc(asize); k_initialize3 = (cl_kernel *)malloc(asize); k_initialize4 = (cl_kernel *)malloc(asize); k_initialize5 = (cl_kernel *)malloc(asize); k_initialize6 = (cl_kernel *)malloc(asize); k_initialize7 = (cl_kernel *)malloc(asize); k_initialize8 = (cl_kernel *)malloc(asize); k_lhsinit = (cl_kernel *)malloc(asize); k_exact_rhs1 = (cl_kernel *)malloc(asize); k_exact_rhs2 = (cl_kernel *)malloc(asize); k_exact_rhs3 = (cl_kernel *)malloc(asize); k_exact_rhs4 = (cl_kernel *)malloc(asize); k_exact_rhs5 = (cl_kernel *)malloc(asize); k_copy_faces1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_copy_faces6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_compute_rhs6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_txinvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_lhsx = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_ninvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_x_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_lhsy = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_pinvr = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_y_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_lhsz = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_tzetar = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve1 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve2 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve3 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve4 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve5 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_z_solve6 = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_add = (cl_kernel (*)[MAXCELLS])malloc(asize*MAXCELLS); k_error_norm = (cl_kernel *)malloc(asize); k_rhs_norm = (cl_kernel *)malloc(asize); for (i = 0; i < num_devices; i++) { k_initialize1[i] = clCreateKernel(program[i], "initialize1", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize1"); k_initialize2[i] = clCreateKernel(program[i], "initialize2", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize2"); k_initialize3[i] = clCreateKernel(program[i], "initialize3", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize3"); k_initialize4[i] = clCreateKernel(program[i], "initialize4", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize4"); k_initialize5[i] = clCreateKernel(program[i], "initialize5", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize5"); k_initialize6[i] = clCreateKernel(program[i], "initialize6", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize6"); k_initialize7[i] = clCreateKernel(program[i], "initialize7", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize7"); k_initialize8[i] = clCreateKernel(program[i], "initialize8", &ecode); clu_CheckError(ecode, "clCreateKernel() for initialize8"); k_lhsinit[i] = clCreateKernel(program[i], "lhsinit", &ecode); clu_CheckError(ecode, "clCreateKernel() for lhsinit"); k_exact_rhs1[i] = clCreateKernel(program[i], "exact_rhs1", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs1"); k_exact_rhs2[i] = clCreateKernel(program[i], "exact_rhs2", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs2"); k_exact_rhs3[i] = clCreateKernel(program[i], "exact_rhs3", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs3"); k_exact_rhs4[i] = clCreateKernel(program[i], "exact_rhs4", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs4"); k_exact_rhs5[i] = clCreateKernel(program[i], "exact_rhs5", &ecode); clu_CheckError(ecode, "clCreateKernel() for exact_rhs5"); for (c = 0; c < MAXCELLS; c++) { k_copy_faces1[i][c] = clCreateKernel(program[i], "copy_faces1", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces1"); k_copy_faces2[i][c] = clCreateKernel(program[i], "copy_faces2", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces2"); k_copy_faces3[i][c] = clCreateKernel(program[i], "copy_faces3", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces3"); k_copy_faces4[i][c] = clCreateKernel(program[i], "copy_faces4", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces4"); k_copy_faces5[i][c] = clCreateKernel(program[i], "copy_faces5", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces5"); k_copy_faces6[i][c] = clCreateKernel(program[i], "copy_faces6", &ecode); clu_CheckError(ecode, "clCreateKernel() for copy_faces6"); k_compute_rhs1[i][c] = clCreateKernel(program[i], "compute_rhs1", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs1"); k_compute_rhs2[i][c] = clCreateKernel(program[i], "compute_rhs2", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs2"); k_compute_rhs3[i][c] = clCreateKernel(program[i], "compute_rhs3", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs3"); k_compute_rhs4[i][c] = clCreateKernel(program[i], "compute_rhs4", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs4"); k_compute_rhs5[i][c] = clCreateKernel(program[i], "compute_rhs5", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs5"); k_compute_rhs6[i][c] = clCreateKernel(program[i], "compute_rhs6", &ecode); clu_CheckError(ecode, "clCreateKernel() for compute_rhs6"); k_txinvr[i][c] = clCreateKernel(program[i], "txinvr", &ecode); clu_CheckError(ecode, "clCreateKernel() for txinvr"); k_lhsx[i][c] = clCreateKernel(program[i], "lhsx", &ecode); clu_CheckError(ecode, "clCreateKernel() for lhsx"); k_ninvr[i][c] = clCreateKernel(program[i], "ninvr", &ecode); clu_CheckError(ecode, "clCreateKernel() for ninvr"); k_x_solve1[i][c] = clCreateKernel(program[i], "x_solve1", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve1"); k_x_solve2[i][c] = clCreateKernel(program[i], "x_solve2", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve2"); k_x_solve3[i][c] = clCreateKernel(program[i], "x_solve3", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve3"); k_x_solve4[i][c] = clCreateKernel(program[i], "x_solve4", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve4"); k_x_solve5[i][c] = clCreateKernel(program[i], "x_solve5", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve5"); k_x_solve6[i][c] = clCreateKernel(program[i], "x_solve6", &ecode); clu_CheckError(ecode, "clCreateKernel() for x_solve6"); k_lhsy[i][c] = clCreateKernel(program[i], "lhsy", &ecode); clu_CheckError(ecode, "clCreateKernel() for lhsy"); k_pinvr[i][c] = clCreateKernel(program[i], "pinvr", &ecode); clu_CheckError(ecode, "clCreateKernel() for pinvr"); k_y_solve1[i][c] = clCreateKernel(program[i], "y_solve1", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve1"); k_y_solve2[i][c] = clCreateKernel(program[i], "y_solve2", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve2"); k_y_solve3[i][c] = clCreateKernel(program[i], "y_solve3", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve3"); k_y_solve4[i][c] = clCreateKernel(program[i], "y_solve4", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve4"); k_y_solve5[i][c] = clCreateKernel(program[i], "y_solve5", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve5"); k_y_solve6[i][c] = clCreateKernel(program[i], "y_solve6", &ecode); clu_CheckError(ecode, "clCreateKernel() for y_solve6"); k_lhsz[i][c] = clCreateKernel(program[i], "lhsz", &ecode); clu_CheckError(ecode, "clCreateKernel() for lhsz"); k_tzetar[i][c] = clCreateKernel(program[i], "tzetar", &ecode); clu_CheckError(ecode, "clCreateKernel() for tzetar"); k_z_solve1[i][c] = clCreateKernel(program[i], "z_solve1", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve1"); k_z_solve2[i][c] = clCreateKernel(program[i], "z_solve2", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve2"); k_z_solve3[i][c] = clCreateKernel(program[i], "z_solve3", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve3"); k_z_solve4[i][c] = clCreateKernel(program[i], "z_solve4", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve4"); k_z_solve5[i][c] = clCreateKernel(program[i], "z_solve5", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve5"); k_z_solve6[i][c] = clCreateKernel(program[i], "z_solve6", &ecode); clu_CheckError(ecode, "clCreateKernel() for z_solve6"); k_add[i][c] = clCreateKernel(program[i], "add", &ecode); clu_CheckError(ecode, "clCreateKernel() for add"); } k_error_norm[i] = clCreateKernel(program[i], "error_norm", &ecode); clu_CheckError(ecode, "clCreateKernel() for error_norm"); k_rhs_norm[i] = clCreateKernel(program[i], "rhs_norm", &ecode); clu_CheckError(ecode, "clCreateKernel() for rhs_norm"); } // 6. Create buffers if (timeron) timer_start(TIMER_BUFFER); asize = sizeof(cl_mem) * num_devices; m_u = (cl_mem *)malloc(asize); m_us = (cl_mem *)malloc(asize); m_vs = (cl_mem *)malloc(asize); m_ws = (cl_mem *)malloc(asize); m_qs = (cl_mem *)malloc(asize); m_ainv = (cl_mem *)malloc(asize); m_rho_i = (cl_mem *)malloc(asize); m_speed = (cl_mem *)malloc(asize); m_square = (cl_mem *)malloc(asize); m_rhs = (cl_mem *)malloc(asize); m_forcing = (cl_mem *)malloc(asize); m_lhs = (cl_mem *)malloc(asize); m_in_buffer = (cl_mem *)malloc(asize); m_out_buffer = (cl_mem *)malloc(asize); m_ce = (cl_mem *)malloc(asize); for (i = 0; i < num_devices; i++) { m_u[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+4)*(JMAXP+4)*(IMAXP+4)*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_u"); m_us[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_us"); m_vs[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_vs"); m_ws[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ws"); m_qs[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_qs"); m_ainv[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ainv"); m_rho_i[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rho_i"); m_speed[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_speed"); m_square[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*(KMAX+2)*(JMAX+2)*(IMAX+2), NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_square"); m_rhs[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_rhs"); m_forcing[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*5, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_forcing"); m_lhs[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*MAXCELLS*KMAX*JMAXP*IMAXP*15, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_lhs"); m_in_buffer[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*BUF_SIZE, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_in_buffer"); m_out_buffer[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double)*BUF_SIZE, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_out_buffer"); m_ce[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*5*13, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer() for m_ce"); } if (timeron) timer_stop(TIMER_BUFFER); if (timeron) timer_stop(TIMER_OPENCL); }
int main(int argc, char **argv) { int i, j, k, it; int nthreads = 1; double zeta; double rnorm; double norm_temp11; double norm_temp12; double t, mflops; char cclass; boolean verified; double zeta_verify_value, epsilon; firstrow = 1; lastrow = NA; firstcol = 1; lastcol = NA; if (NA == 1400 && NONZER == 7 && NITER == 15 && SHIFT == 10.0) { cclass = 'S'; zeta_verify_value = 8.5971775078648; } else if (NA == 7000 && NONZER == 8 && NITER == 15 && SHIFT == 12.0) { cclass = 'W'; zeta_verify_value = 10.362595087124; } else if (NA == 14000 && NONZER == 11 && NITER == 15 && SHIFT == 20.0) { cclass = 'A'; zeta_verify_value = 17.130235054029; } else if (NA == 75000 && NONZER == 13 && NITER == 75 && SHIFT == 60.0) { cclass = 'B'; zeta_verify_value = 22.712745482631; } else if (NA == 150000 && NONZER == 15 && NITER == 75 && SHIFT == 110.0) { cclass = 'C'; zeta_verify_value = 28.973605592845; } else { cclass = 'U'; } printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version" " - CG Benchmark\n"); printf(" Size: %10d\n", NA); printf(" Iterations: %5d\n", NITER); naa = NA; nzz = NZ; /*-------------------------------------------------------------------- c Initialize random number generator c-------------------------------------------------------------------*/ tran = 314159265.0; amult = 1220703125.0; zeta = randlc( &tran, amult ); /*-------------------------------------------------------------------- c c-------------------------------------------------------------------*/ makea(naa, nzz, a, colidx, rowstr, NONZER, firstrow, lastrow, firstcol, lastcol, RCOND, arow, acol, aelt, v, iv, SHIFT); /*--------------------------------------------------------------------- c Note: as a result of the above call to makea: c values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1 c values of colidx which are col indexes go from firstcol --> lastcol c So: c Shift the col index vals from actual (firstcol --> lastcol ) c to local, i.e., (1 --> lastcol-firstcol+1) c---------------------------------------------------------------------*/ #pragma omp parallel private(it,i,j,k) { #pragma omp for nowait for (j = 1; j <= lastrow - firstrow + 1; j++) { for (k = rowstr[j]; k < rowstr[j+1]; k++) { colidx[k] = colidx[k] - firstcol + 1; } } /*-------------------------------------------------------------------- c set starting vector to (1, 1, .... 1) c-------------------------------------------------------------------*/ #pragma omp for nowait for (i = 1; i <= NA+1; i++) { x[i] = 1.0; } #pragma omp single zeta = 0.0; /*------------------------------------------------------------------- c----> c Do one iteration untimed to init all code and data page tables c----> (then reinit, start timing, to niter its) c-------------------------------------------------------------------*/ for (it = 1; it <= 1; it++) { /*-------------------------------------------------------------------- c The call to the conjugate gradient routine: c-------------------------------------------------------------------*/ conj_grad (colidx, rowstr, x, z, a, p, q, r, w, &rnorm); /*-------------------------------------------------------------------- c zeta = shift + 1/(x.z) c So, first: (x.z) c Also, find norm of z c So, first: (z.z) c-------------------------------------------------------------------*/ #pragma omp single { norm_temp11 = 0.0; norm_temp12 = 0.0; } /* end single */ #pragma omp for reduction(+:norm_temp11,norm_temp12) for (j = 1; j <= lastcol-firstcol+1; j++) { norm_temp11 = norm_temp11 + x[j]*z[j]; norm_temp12 = norm_temp12 + z[j]*z[j]; } #pragma omp single norm_temp12 = 1.0 / sqrt( norm_temp12 ); /*-------------------------------------------------------------------- c Normalize z to obtain x c-------------------------------------------------------------------*/ #pragma omp for for (j = 1; j <= lastcol-firstcol+1; j++) { x[j] = norm_temp12*z[j]; } } /* end of do one iteration untimed */ /*-------------------------------------------------------------------- c set starting vector to (1, 1, .... 1) c-------------------------------------------------------------------*/ #pragma omp for nowait for (i = 1; i <= NA+1; i++) { x[i] = 1.0; } #pragma omp single zeta = 0.0; } /* end parallel */ timer_clear( 1 ); timer_start( 1 ); /*-------------------------------------------------------------------- c----> c Main Iteration for inverse power method c----> c-------------------------------------------------------------------*/ #pragma omp parallel private(it,i,j,k) { for (it = 1; it <= NITER; it++) { /*-------------------------------------------------------------------- c The call to the conjugate gradient routine: c-------------------------------------------------------------------*/ conj_grad(colidx, rowstr, x, z, a, p, q, r, w, &rnorm); /*-------------------------------------------------------------------- c zeta = shift + 1/(x.z) c So, first: (x.z) c Also, find norm of z c So, first: (z.z) c-------------------------------------------------------------------*/ #pragma omp single { norm_temp11 = 0.0; norm_temp12 = 0.0; } /* end single */ #pragma omp for reduction(+:norm_temp11,norm_temp12) for (j = 1; j <= lastcol-firstcol+1; j++) { norm_temp11 = norm_temp11 + x[j]*z[j]; norm_temp12 = norm_temp12 + z[j]*z[j]; } #pragma omp single { norm_temp12 = 1.0 / sqrt( norm_temp12 ); zeta = SHIFT + 1.0 / norm_temp11; } /* end single */ #pragma omp master { if( it == 1 ) { printf(" iteration ||r|| zeta\n"); } printf(" %5d %20.14e%20.13e\n", it, rnorm, zeta); } /* end master */ /*-------------------------------------------------------------------- c Normalize z to obtain x c-------------------------------------------------------------------*/ #pragma omp for for (j = 1; j <= lastcol-firstcol+1; j++) { x[j] = norm_temp12*z[j]; } } /* end of main iter inv pow meth */ #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* end parallel */ timer_stop( 1 ); /*-------------------------------------------------------------------- c End of timed section c-------------------------------------------------------------------*/ t = timer_read( 1 ); printf(" Benchmark completed\n"); epsilon = 1.0e-10; if (cclass != 'U') { if (fabs(zeta - zeta_verify_value) <= epsilon) { verified = TRUE; printf(" VERIFICATION SUCCESSFUL\n"); printf(" Zeta is %20.12e\n", zeta); printf(" Error is %20.12e\n", zeta - zeta_verify_value); } else { verified = FALSE; printf(" VERIFICATION FAILED\n"); printf(" Zeta %20.12e\n", zeta); printf(" The correct zeta is %20.12e\n", zeta_verify_value); } } else { verified = FALSE; printf(" Problem size unknown\n"); printf(" NO VERIFICATION PERFORMED\n"); } if ( t != 0.0 ) { mflops = (2.0*NITER*NA) * (3.0+(NONZER*(NONZER+1)) + 25.0*(5.0+(NONZER*(NONZER+1))) + 3.0 ) / t / 1000000.0; } else { mflops = 0.0; } c_print_results("CG", cclass, NA, 0, 0, NITER, nthreads, t, mflops, " floating point", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); }