int main(int argc, char *argv[]) { int i; int iter; double total_time, mflops; logical verified; char Class; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } //--------------------------------------------------------------------- // Run the entire problem once to make sure all data is touched. // This reduces variable startup costs, which is important for such a // short benchmark. The other NPB 2 implementations are similar. //--------------------------------------------------------------------- for (i = 1; i <= T_max; i++) { timer_clear(i); } setup(); setup_opencl(argc, argv); init_ui(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]); compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]); compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]); fft_init(dims[0]); fft(1, &m_u1, &m_u0); //--------------------------------------------------------------------- // Start over from the beginning. Note that all operations must // be timed, in contrast to other benchmarks. //--------------------------------------------------------------------- for (i = 1; i <= T_max; i++) { timer_clear(i); } timer_start(T_total); if (timers_enabled) timer_start(T_setup); DTIMER_START(T_compute_im); compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]); DTIMER_STOP(T_compute_im); DTIMER_START(T_compute_ics); compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]); DTIMER_STOP(T_compute_ics); DTIMER_START(T_fft_init); fft_init(dims[0]); DTIMER_STOP(T_fft_init); if (timers_enabled) timer_stop(T_setup); if (timers_enabled) timer_start(T_fft); fft(1, &m_u1, &m_u0); if (timers_enabled) timer_stop(T_fft); for (iter = 1; iter <= niter; iter++) { if (timers_enabled) timer_start(T_evolve); evolve(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]); if (timers_enabled) timer_stop(T_evolve); if (timers_enabled) timer_start(T_fft); fft(-1, &m_u1, &m_u1); if (timers_enabled) timer_stop(T_fft); if (timers_enabled) timer_start(T_checksum); checksum(iter, &m_u1, dims[0], dims[1], dims[2]); if (timers_enabled) timer_stop(T_checksum); } verify(NX, NY, NZ, niter, &verified, &Class); timer_stop(T_total); total_time = timer_read(T_total); if (total_time != 0.0) { mflops = 1.0e-6 * (double)NTOTAL * (14.8157 + 7.19641 * log((double)NTOTAL) + (5.23518 + 7.21113 * log((double)NTOTAL)) * niter) / total_time; } else { mflops = 0.0; } c_print_results("FT", Class, NX, NY, NZ, niter, total_time, mflops, " floating point", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7, clu_GetDeviceTypeName(device_type), device_name); if (timers_enabled) print_timers(); release_opencl(); fflush(stdout); return 0; }
int main(int argc, char **argv) #endif { int i, niter, step; double mflops, t, tmax; logical verified; char class; double tsum[t_last+2], t1[t_last+2], tming[t_last+2], tmaxg[t_last+2]; char *t_recs[t_last+2] = { "total", "rhs", "xsolve", "ysolve", "zsolve", "bpack", "exch", "xcomm", "ycomm", "zcomm", " totcomp", " totcomm" }; //--------------------------------------------------------------------- // Root node reads input file (if it exists) else takes // defaults from parameters //--------------------------------------------------------------------- printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL-MD) - SP Benchmark\n\n"); FILE *fp; fp = fopen("timer.flag", "r"); timeron = false; if (fp != NULL) { timeron = true; fclose(fp); } if ((fp = fopen("inputsp.data", "r")) != NULL) { int result; printf(" Reading from input file inputsp.data\n"); result = fscanf(fp, "%d", &niter); while (fgetc(fp) != '\n'); result = fscanf(fp, "%*f"); while (fgetc(fp) != '\n'); result = fscanf(fp, "%d%d%d", &grid_points[0], &grid_points[1], &grid_points[2]); fclose(fp); } else { printf(" No input file inputsp.data. Using compiled defaults\n"); niter = NITER_DEFAULT; grid_points[0] = PROBLEM_SIZE; grid_points[1] = PROBLEM_SIZE; grid_points[2] = PROBLEM_SIZE; } setup_opencl(argc, argv); printf(" Size: %4dx%4dx%4d\n", grid_points[0], grid_points[1], grid_points[2]); printf(" Iterations: %4d", niter); if (num_devices != MAXCELLS*MAXCELLS) printf(" WARNING: compiled for %5d devices \n", MAXCELLS*MAXCELLS); printf(" Number of active devices: %5d\n\n", num_devices); make_set(); for (i = 0; i < t_last; i++) { timer_clear(i); } set_constants(); initialize(); lhsinit(); exact_rhs(); compute_buffer_size(5); set_kernel_args(); //--------------------------------------------------------------------- // do one time step to touch all code, and reinitialize //--------------------------------------------------------------------- #ifdef MINIMD_SNUCL_OPTIMIZATIONS // set cmd queue property for(i = 0; i < num_devices; i++) { clSetCommandQueueProperty(cmd_queue[i], CL_QUEUE_AUTO_DEVICE_SELECTION | //CL_QUEUE_ITERATIVE | CL_QUEUE_COMPUTE_INTENSIVE, true, NULL); } #endif adi(); #ifdef MINIMD_SNUCL_OPTIMIZATIONS for(i = 0; i < num_devices; i++) { clSetCommandQueueProperty(cmd_queue[i], 0, true, NULL); } #endif initialize(); //--------------------------------------------------------------------- // Synchronize before placing time stamp //--------------------------------------------------------------------- for (i = 0; i < t_last; i++) { timer_clear(i); } timer_clear(0); timer_start(0); for (step = 1; step <= niter; step++) { if ((step % 20) == 0 || step == 1) { printf(" Time step %4d\n", step); } adi(); } timer_stop(0); t = timer_read(0); verify(niter, &class, &verified); tmax = t; if( tmax != 0.0 ) { mflops = (881.174*(double)( PROBLEM_SIZE*PROBLEM_SIZE*PROBLEM_SIZE ) -4683.91*(double)( PROBLEM_SIZE*PROBLEM_SIZE ) +11484.5*(double)( PROBLEM_SIZE ) -19272.4) * (double)( niter ) / (tmax*1000000.0); } else { mflops = 0.0; } c_print_results("SP", class, grid_points[0], grid_points[1], grid_points[2], niter, tmax, mflops, " floating point", verified, NPBVERSION,COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7, clu_GetDeviceTypeName(device_type), device_name, num_devices); if (timeron) { /* for (i = 0; i < t_last; i++) { t1[i] = timer_read(i); } t1[t_xsolve] = t1[t_xsolve] - t1[t_xcomm]; t1[t_ysolve] = t1[t_ysolve] - t1[t_ycomm]; t1[t_zsolve] = t1[t_zsolve] - t1[t_zcomm]; t1[t_last+2] = t1[t_xcomm]+t1[t_ycomm]+t1[t_zcomm]+t1[t_exch]; t1[t_last+1] = t1[t_total] - t1[t_last+2]; MPI_Reduce(&t1, tsum, t_last+2, dp_type, MPI_SUM, 0, comm_setup); MPI_Reduce(&t1, tming, t_last+2, dp_type, MPI_MIN, 0, comm_setup); MPI_Reduce(&t1, tmaxg, t_last+2, dp_type, MPI_MAX, 0, comm_setup); if (node == 0) { printf(" nprocs =%6d minimum maximum average\n", total_nodes); for (i = 0; i < t_last+2; i++) { tsum[i] = tsum[i] / total_nodes; printf(" timer %2d(%8s) : %10.4f %10.4f %10.4f\n", i+1, t_recs[i], tming[i], tmaxg[i], tsum[i]); } } */ } release_opencl(); return 0; }
int main( int argc, char **argv ) { int i, iteration; double timecounter; FILE *fp; cl_int ecode; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } /* Initialize timers */ timer_on = 0; if ((fp = fopen("timer.flag", "r")) != NULL) { fclose(fp); timer_on = 1; } timer_clear( 0 ); if (timer_on) { timer_clear( 1 ); timer_clear( 2 ); timer_clear( 3 ); } if (timer_on) timer_start( 3 ); /* Initialize the verification arrays if a valid class */ for( i=0; i<TEST_ARRAY_SIZE; i++ ) switch( CLASS ) { case 'S': test_index_array[i] = S_test_index_array[i]; test_rank_array[i] = S_test_rank_array[i]; break; case 'A': test_index_array[i] = A_test_index_array[i]; test_rank_array[i] = A_test_rank_array[i]; break; case 'W': test_index_array[i] = W_test_index_array[i]; test_rank_array[i] = W_test_rank_array[i]; break; case 'B': test_index_array[i] = B_test_index_array[i]; test_rank_array[i] = B_test_rank_array[i]; break; case 'C': test_index_array[i] = C_test_index_array[i]; test_rank_array[i] = C_test_rank_array[i]; break; case 'D': test_index_array[i] = D_test_index_array[i]; test_rank_array[i] = D_test_rank_array[i]; break; }; /* set up the OpenCL environment. */ setup_opencl(argc, argv); /* Printout initial NPB info */ printf( "\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - IS Benchmark\n\n" ); printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS, CLASS ); printf( " Iterations: %d\n", MAX_ITERATIONS ); if (timer_on) timer_start( 1 ); /* Generate random number sequence and subsequent keys on all procs */ create_seq( 314159265.00, /* Random number gen seed */ 1220703125.00 ); /* Random number gen mult */ if (timer_on) timer_stop( 1 ); /* Do one interation for free (i.e., untimed) to guarantee initialization of all data and code pages and respective tables */ rank( 1 ); /* Start verification counter */ passed_verification = 0; DTIMER_START(T_BUFFER_WRITE); ecode = clEnqueueWriteBuffer(cmd_queue, m_passed_verification, CL_TRUE, 0, sizeof(cl_int), &passed_verification, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_passed_verification"); DTIMER_STOP(T_BUFFER_WRITE); if( CLASS != 'S' ) printf( "\n iteration\n" ); /* Start timer */ timer_start( 0 ); /* This is the main iteration */ for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) { if( CLASS != 'S' ) printf( " %d\n", iteration ); rank( iteration ); } DTIMER_START(T_BUFFER_READ); ecode = clEnqueueReadBuffer(cmd_queue, m_passed_verification, CL_TRUE, 0, sizeof(cl_int), &passed_verification, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueReadBuffer() for m_passed_verification"); DTIMER_STOP(T_BUFFER_READ); /* End of timing, obtain maximum time of all processors */ timer_stop( 0 ); timecounter = timer_read( 0 ); /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */ if (timer_on) timer_start( 2 ); full_verify(); if (timer_on) timer_stop( 2 ); if (timer_on) timer_stop( 3 ); /* The final printout */ if( passed_verification != 5*MAX_ITERATIONS + 1 ) passed_verification = 0; c_print_results( "IS", CLASS, (int)(TOTAL_KEYS/64), 64, 0, MAX_ITERATIONS, timecounter, ((double) (MAX_ITERATIONS*TOTAL_KEYS)) /timecounter/1000000., "keys ranked", passed_verification, NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC, CFLAGS, CLINKFLAGS, "", clu_GetDeviceTypeName(device_type), device_name); /* Print additional timers */ if (timer_on) { double t_total, t_percent; t_total = timer_read( 3 ); printf("\nAdditional timers -\n"); printf(" Total execution: %8.3f\n", t_total); if (t_total == 0.0) t_total = 1.0; timecounter = timer_read(1); t_percent = timecounter/t_total * 100.; printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(0); t_percent = timecounter/t_total * 100.; printf(" Benchmarking : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(2); t_percent = timecounter/t_total * 100.; printf(" Sorting : %8.3f (%5.2f%%)\n", timecounter, t_percent); } release_opencl(); fflush(stdout); return 0; /**************************/ } /* E N D P R O G R A M */
int main(int argc, char *argv[]) { char Class; logical verified; double mflops; double t, tmax, trecs[t_last+1]; int i; char *t_names[t_last+1]; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } //--------------------------------------------------------------------- // Setup info for timers //--------------------------------------------------------------------- FILE *fp; if ((fp = fopen("timer.flag", "r")) != NULL) { timeron = true; t_names[t_total] = "total"; t_names[t_rhsx] = "rhsx"; t_names[t_rhsy] = "rhsy"; t_names[t_rhsz] = "rhsz"; t_names[t_rhs] = "rhs"; t_names[t_jacld] = "jacld"; t_names[t_blts] = "blts"; t_names[t_jacu] = "jacu"; t_names[t_buts] = "buts"; t_names[t_add] = "add"; t_names[t_l2norm] = "l2norm"; t_names[t_setbv] = "setbv"; t_names[t_setiv] = "setiv"; t_names[t_erhs] = "erhs"; t_names[t_error] = "error"; t_names[t_pintgr] = "pintgr"; t_names[t_blts1] = "blts1"; t_names[t_buts1] = "buts1"; fclose(fp); } else { timeron = false; } //--------------------------------------------------------------------- // read input data //--------------------------------------------------------------------- read_input(); //--------------------------------------------------------------------- // set up domain sizes //--------------------------------------------------------------------- domain(); //--------------------------------------------------------------------- // set up OpenCL environment //--------------------------------------------------------------------- setup_opencl(argc, argv); //--------------------------------------------------------------------- // set up coefficients //--------------------------------------------------------------------- setcoeff(); //--------------------------------------------------------------------- // set the boundary values for dependent variables //--------------------------------------------------------------------- setbv(); //--------------------------------------------------------------------- // set the initial values for dependent variables //--------------------------------------------------------------------- setiv(); //--------------------------------------------------------------------- // compute the forcing term based on prescribed exact solution //--------------------------------------------------------------------- erhs(); //--------------------------------------------------------------------- // perform one SSOR iteration to touch all data pages //--------------------------------------------------------------------- ssor(1); //--------------------------------------------------------------------- // reset the boundary and initial values //--------------------------------------------------------------------- setbv(); setiv(); //--------------------------------------------------------------------- // perform the SSOR iterations //--------------------------------------------------------------------- ssor(itmax); //--------------------------------------------------------------------- // compute the solution error //--------------------------------------------------------------------- error(); //--------------------------------------------------------------------- // compute the surface integral //--------------------------------------------------------------------- pintgr(); //--------------------------------------------------------------------- // verification test //--------------------------------------------------------------------- verify ( rsdnm, errnm, frc, &Class, &verified ); mflops = (double)itmax * (1984.77 * (double)nx0 * (double)ny0 * (double)nz0 - 10923.3 * pow(((double)(nx0+ny0+nz0)/3.0), 2.0) + 27770.9 * (double)(nx0+ny0+nz0)/3.0 - 144010.0) / (maxtime*1000000.0); c_print_results("LU", Class, nx0, ny0, nz0, itmax, maxtime, mflops, " floating point", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, "(none)", clu_GetDeviceTypeName(device_type), device_name); //--------------------------------------------------------------------- // More timers //--------------------------------------------------------------------- if (timeron) { for (i = 1; i <= t_last; i++) { trecs[i] = timer_read(i); } tmax = maxtime; if (tmax == 0.0) tmax = 1.0; printf(" SECTION Time (secs)\n"); for (i = 1; i <= t_last; i++) { printf(" %-8s:%9.4f (%6.2f%%)\n", t_names[i], trecs[i], trecs[i]*100./tmax); if (i == t_rhs) { t = trecs[t_rhsx] + trecs[t_rhsy] + trecs[t_rhsz]; printf(" --> %8s:%9.3f (%6.2f%%)\n", "sub-rhs", t, t*100./tmax); t = trecs[i] - t; printf(" --> %8s:%9.3f (%6.2f%%)\n", "rest-rhs", t, t*100./tmax); } } } release_opencl(); fflush(stdout); return 0; }
int main(int argc, char *argv[]) { double Mops, t1, t2; double tsx, tsy, tm, an, tt, gc; double sx_verify_value, sy_verify_value, sx_err, sy_err; int i, nit; int k_offset, j; logical verified; char size[16]; FILE *fp; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } if ((fp = fopen("timer.flag", "r")) == NULL) { timers_enabled = false; } else { timers_enabled = true; fclose(fp); } //-------------------------------------------------------------------- // Because the size of the problem is too large to store in a 32-bit // integer for some classes, we put it into a string (for printing). // Have to strip off the decimal point put in there by the floating // point print statement (internal file) //-------------------------------------------------------------------- sprintf(size, "%15.0lf", pow(2.0, M+1)); j = 14; if (size[j] == '.') j--; size[j+1] = '\0'; printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n"); printf("\n Number of random numbers generated: %15s\n", size); verified = false; //-------------------------------------------------------------------- // Compute the number of "batches" of random number pairs generated // per processor. Adjust if the number of processors does not evenly // divide the total number //-------------------------------------------------------------------- np = NN; setup_opencl(argc, argv); timer_clear(0); timer_start(0); //-------------------------------------------------------------------- // Compute AN = A ^ (2 * NK) (mod 2^46). //-------------------------------------------------------------------- t1 = A; for (i = 0; i < MK + 1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; //-------------------------------------------------------------------- // Each instance of this loop may be performed independently. We compute // the k offsets separately to take into account the fact that some nodes // have more numbers to generate than others //-------------------------------------------------------------------- k_offset = -1; DTIMER_START(T_KERNEL_EMBAR); // Launch the kernel int q_size = GROUP_SIZE * NQ * sizeof(cl_double); int sx_size = GROUP_SIZE * sizeof(cl_double); int sy_size = GROUP_SIZE * sizeof(cl_double); err_code = clSetKernelArg(kernel, 0, q_size, NULL); err_code |= clSetKernelArg(kernel, 1, sx_size, NULL); err_code |= clSetKernelArg(kernel, 2, sy_size, NULL); err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq); err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx); err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy); err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset); err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an); clu_CheckError(err_code, "clSetKernelArg()"); size_t localWorkSize[] = { GROUP_SIZE }; size_t globalWorkSize[] = { np }; err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_EMBAR); double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size); double *gsx = (double*)malloc(gsx_size); double *gsy = (double*)malloc(gsy_size); gc = 0.0; tsx = 0.0; tsy = 0.0; for (i = 0; i < NQ; i++) { q[i] = 0.0; } // 9. Get the result DTIMER_START(T_BUFFER_READ); err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, gq, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, gsx, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, gsy, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); DTIMER_STOP(T_BUFFER_READ); for (i = 0; i < np/localWorkSize[0]; i++) { for (j = 0; j < NQ; j++ ){ q[j] = q[j] + gq[i][j]; } tsx = tsx + gsx[i]; tsy = tsy + gsy[i]; } for (i = 0; i < NQ; i++) { gc = gc + q[i]; } timer_stop(0); tm = timer_read(0); nit = 0; verified = true; if (M == 24) { sx_verify_value = -3.247834652034740e+3; sy_verify_value = -6.958407078382297e+3; } else if (M == 25) { sx_verify_value = -2.863319731645753e+3; sy_verify_value = -6.320053679109499e+3; } else if (M == 28) { sx_verify_value = -4.295875165629892e+3; sy_verify_value = -1.580732573678431e+4; } else if (M == 30) { sx_verify_value = 4.033815542441498e+4; sy_verify_value = -2.660669192809235e+4; } else if (M == 32) { sx_verify_value = 4.764367927995374e+4; sy_verify_value = -8.084072988043731e+4; } else if (M == 36) { sx_verify_value = 1.982481200946593e+5; sy_verify_value = -1.020596636361769e+5; } else if (M == 40) { sx_verify_value = -5.319717441530e+05; sy_verify_value = -3.688834557731e+05; } else { verified = false; } if (verified) { sx_err = fabs((tsx - sx_verify_value) / sx_verify_value); sy_err = fabs((tsy - sy_verify_value) / sy_verify_value); verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON)); } Mops = pow(2.0, M+1) / tm / 1000000.0; printf("\nEP Benchmark Results:\n\n"); printf("CPU Time =%10.4lf\n", tm); printf("N = 2^%5d\n", M); printf("No. Gaussian Pairs = %15.0lf\n", gc); printf("Sums = %25.15lE %25.15lE\n", tsx, tsy); printf("Counts: \n"); for (i = 0; i < NQ; i++) { printf("%3d%15.0lf\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7, clu_GetDeviceTypeName(device_type), device_name); if (timers_enabled) { if (tm <= 0.0) tm = 1.0; tt = timer_read(0); printf("\nTotal time: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); } free(gq); free(gsx); free(gsy); release_opencl(); fflush(stdout); return 0; }