static int realmain(void *carg) { unsigned arg = (uintptr_t)carg; /*c------------------------------------------------------------------- c-------------------------------------------------------------------*/ int i, ierr; /*------------------------------------------------------------------ c u0, u1, u2 are the main arrays in the problem. c Depending on the decomposition, these arrays will have different c dimensions. To accomodate all possibilities, we allocate them as c one-dimensional arrays and pass them to subroutines for different c views c - u0 contains the initial (transformed) initial condition c - u1 and u2 are working arrays c - indexmap maps i,j,k of u0 to the correct i^2+j^2+k^2 for the c time evolution operator. c-----------------------------------------------------------------*/ /*-------------------------------------------------------------------- c Large arrays are in common so that they are allocated on the c heap rather than the stack. This common block is not c referenced directly anywhere else. Padding is to avoid accidental c cache problems, since all array sizes are powers of two. c-------------------------------------------------------------------*/ static dcomplex u0[NZ][NY][NX]; static dcomplex pad1[3]; static dcomplex u1[NZ][NY][NX]; static dcomplex pad2[3]; static dcomplex u2[NZ][NY][NX]; static dcomplex pad3[3]; static int indexmap[NZ][NY][NX]; int iter; int nthreads = 1; double total_time, mflops; boolean verified; char class; omp_set_num_threads(arg); /*-------------------------------------------------------------------- c Run the entire problem once to make sure all data is touched. c This reduces variable startup costs, which is important for such a c short benchmark. The other NPB 2 implementations are similar. 
c-------------------------------------------------------------------*/ for (i = 0; i < T_MAX; i++) { timer_clear(i); } setup(); #pragma omp parallel { compute_indexmap(indexmap, dims[2]); #pragma omp single { compute_initial_conditions(u1, dims[0]); fft_init (dims[0][0]); } fft(1, u1, u0); } /* end parallel */ /*-------------------------------------------------------------------- c Start over from the beginning. Note that all operations must c be timed, in contrast to other benchmarks. c-------------------------------------------------------------------*/ for (i = 0; i < T_MAX; i++) { timer_clear(i); } timer_start(T_TOTAL); if (TIMERS_ENABLED == TRUE) timer_start(T_SETUP); #pragma omp parallel private(iter) firstprivate(niter) { compute_indexmap(indexmap, dims[2]); #pragma omp single { compute_initial_conditions(u1, dims[0]); fft_init (dims[0][0]); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_SETUP); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_FFT); } fft(1, u1, u0); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_FFT); } for (iter = 1; iter <= niter; iter++) { if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_EVOLVE); } evolve(u0, u1, iter, indexmap, dims[0]); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_EVOLVE); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_FFT); } fft(-1, u1, u2); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_FFT); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_CHECKSUM); } checksum(iter, u2, dims[0]); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_CHECKSUM); } } #pragma omp single verify(NX, NY, NZ, niter, &verified, &class); #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* end parallel */ timer_stop(T_TOTAL); total_time = timer_read(T_TOTAL); if( total_time != 0.0) { mflops = 1.0e-6*(double)(NTOTAL) * (14.8157+7.19641*log((double)(NTOTAL)) + 
(5.23518+7.21113*log((double)(NTOTAL)))*niter) /total_time; } else { mflops = 0.0; } #ifdef BOMP backend_create_time(arg); #endif printf("Computetime %d %f\n", arg, total_time); printf("client done\n"); /* c_print_results("FT", class, NX, NY, NZ, niter, nthreads, */ /* total_time, mflops, " floating point", verified, */ /* NPBVERSION, COMPILETIME, */ /* CS1, CS2, CS3, CS4, CS5, CS6, CS7); */ if (TIMERS_ENABLED == TRUE) print_timers(); }
int main(int argc, char *argv[]) { int i; int iter; double total_time, mflops; logical verified; char Class; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } //--------------------------------------------------------------------- // Run the entire problem once to make sure all data is touched. // This reduces variable startup costs, which is important for such a // short benchmark. The other NPB 2 implementations are similar. //--------------------------------------------------------------------- for (i = 1; i <= T_max; i++) { timer_clear(i); } setup(); setup_opencl(argc, argv); init_ui(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]); compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]); compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]); fft_init(dims[0]); fft(1, &m_u1, &m_u0); //--------------------------------------------------------------------- // Start over from the beginning. Note that all operations must // be timed, in contrast to other benchmarks. 
//--------------------------------------------------------------------- for (i = 1; i <= T_max; i++) { timer_clear(i); } timer_start(T_total); if (timers_enabled) timer_start(T_setup); DTIMER_START(T_compute_im); compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]); DTIMER_STOP(T_compute_im); DTIMER_START(T_compute_ics); compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]); DTIMER_STOP(T_compute_ics); DTIMER_START(T_fft_init); fft_init(dims[0]); DTIMER_STOP(T_fft_init); if (timers_enabled) timer_stop(T_setup); if (timers_enabled) timer_start(T_fft); fft(1, &m_u1, &m_u0); if (timers_enabled) timer_stop(T_fft); for (iter = 1; iter <= niter; iter++) { if (timers_enabled) timer_start(T_evolve); evolve(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]); if (timers_enabled) timer_stop(T_evolve); if (timers_enabled) timer_start(T_fft); fft(-1, &m_u1, &m_u1); if (timers_enabled) timer_stop(T_fft); if (timers_enabled) timer_start(T_checksum); checksum(iter, &m_u1, dims[0], dims[1], dims[2]); if (timers_enabled) timer_stop(T_checksum); } verify(NX, NY, NZ, niter, &verified, &Class); timer_stop(T_total); total_time = timer_read(T_total); if (total_time != 0.0) { mflops = 1.0e-6 * (double)NTOTAL * (14.8157 + 7.19641 * log((double)NTOTAL) + (5.23518 + 7.21113 * log((double)NTOTAL)) * niter) / total_time; } else { mflops = 0.0; } c_print_results("FT", Class, NX, NY, NZ, niter, total_time, mflops, " floating point", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7, clu_GetDeviceTypeName(device_type), device_name); if (timers_enabled) print_timers(); release_opencl(); fflush(stdout); return 0; }
/*
 * Runs the FT application: an untimed warm-up transform, then the timed
 * twiddle setup, initial conditions, forward transform, niter
 * evolve / inverse-transform / checksum steps, and verification.
 *
 * niter:      number of time steps to run.
 * total_time: receives the reading of timer 1, which spans the timed part.
 * verified:   passed through to verify().
 *
 * When timers_enabled is set, a per-phase timer report is printed to stdout.
 */
void appft(int niter, double *total_time, logical *verified)
{
    /* Report rows in print order: label and the timer slot it reads. */
    static const struct {
        const char *label;
        int slot;
    } report[] = {
        { "FT total ",                  1 },
        { "WarmUp time ",               2 },
        { "fftXYZ body ",               3 },
        { "Swarztrauber ",              4 },
        { "X time ",                    7 },
        { "Y time ",                    8 },
        { "Z time ",                    9 },
        { "CalculateChecksum ",        10 },
        { "evolve ",                   11 },
        { "compute_initial_conditions", 12 },
        { "twiddle ",                  13 },
        { "verify ",                   14 },
        { "fftXYZ ",                   15 },
    };

    int t, row, step;
    int iz, iy, ix;              /* loop indices over NZ, NY, NX */
    int half_x, half_y, half_z;
    int fz, fy, fx;              /* shifted indices */
    int fz_sq, fzy_sq;           /* partial sums of squares */
    double scale;
    dcomplex exp1[NX], exp2[NY], exp3[NZ];

    for (t = 1; t <= 15; ++t) {
        timer_clear(t);
    }

    /* Warm-up, accounted to timer 2. */
    timer_start(2);
    compute_initial_conditions(NX, NY, NZ, xnt);

    CompExp(NX, exp1);
    CompExp(NY, exp2);
    CompExp(NZ, exp3);
    fftXYZ(1, NX, NY, NZ, xnt, (dcomplex *)y, exp1, exp2, exp3);
    timer_stop(2);

    /* Timed section, accounted to timer 1. */
    timer_start(1);
    if (timers_enabled) {
        timer_start(13);
    }

    half_x = NX / 2;
    half_y = NY / 2;
    half_z = NZ / 2;
    scale = -4.0 * ALPHA * (PI * PI);

    /*
     * Fill the twiddle table. Each index is reduced by the dimension size
     * times (index / half size) before squaring; for an even dimension that
     * shifts indices at or above the halfway point down by the full size.
     */
    for (iz = 0; iz < NZ; ++iz) {
        fz = iz - (iz / half_z) * NZ;
        fz_sq = fz * fz;
        for (iy = 0; iy < NY; ++iy) {
            fy = iy - (iy / half_y) * NY;
            fzy_sq = fz_sq + fy*fy;
            for (ix = 0; ix < NX; ++ix) {
                fx = ix - (ix / half_x) * NX;
                twiddle[iz][iy][ix] = exp(scale*(double)(fx*fx + fzy_sq));
            }
        }
    }

    if (timers_enabled) {
        timer_stop(13);
        timer_start(12);
    }
    compute_initial_conditions(NX, NY, NZ, xnt);

    if (timers_enabled) {
        timer_stop(12);
        timer_start(15);
    }
    fftXYZ(1, NX, NY, NZ, xnt, (dcomplex *)y, exp1, exp2, exp3);
    if (timers_enabled) {
        timer_stop(15);
    }

    for (step = 1; step <= niter; ++step) {
        if (timers_enabled) {
            timer_start(11);
        }
        evolve(NX, NY, NZ, xnt, y, twiddle);

        if (timers_enabled) {
            timer_stop(11);
            timer_start(15);
        }
        fftXYZ(-1, NX, NY, NZ, xnt, (dcomplex *)xnt, exp1, exp2, exp3);

        if (timers_enabled) {
            timer_stop(15);
            timer_start(10);
        }
        CalculateChecksum(&sums[step], step, NX, NY, NZ, xnt);
        if (timers_enabled) {
            timer_stop(10);
        }
    }

    /* Verification test. */
    if (timers_enabled) {
        timer_start(14);
    }
    verify(NX, NY, NZ, niter, sums, verified);
    if (timers_enabled) {
        timer_stop(14);
    }
    timer_stop(1);

    *total_time = timer_read(1);
    if (!timers_enabled) {
        return;
    }

    printf(" FT subroutine timers \n");
    for (row = 0; row < (int)(sizeof report / sizeof report[0]); ++row) {
        printf(" %26s =%9.4f\n", report[row].label,
               timer_read(report[row].slot));
    }
    printf(" %26s =%9.4f\n", "Benchmark time ", *total_time);
}