int main( int argc, char **argv ) { int i, iteration, timer_on; double timecounter; FILE *fp; /* Initialize timers */ timer_on = 0; if ((fp = fopen("timer.flag", "r")) != NULL) { fclose(fp); timer_on = 1; } timer_clear( 0 ); if (timer_on) { timer_clear( 1 ); timer_clear( 2 ); timer_clear( 3 ); } if (timer_on) timer_start( 3 ); /* Initialize the verification arrays if a valid class */ for( i=0; i<TEST_ARRAY_SIZE; i++ ) switch( CLASS ) { case 'S': test_index_array[i] = S_test_index_array[i]; test_rank_array[i] = S_test_rank_array[i]; break; case 'A': test_index_array[i] = A_test_index_array[i]; test_rank_array[i] = A_test_rank_array[i]; break; case 'W': test_index_array[i] = W_test_index_array[i]; test_rank_array[i] = W_test_rank_array[i]; break; case 'B': test_index_array[i] = B_test_index_array[i]; test_rank_array[i] = B_test_rank_array[i]; break; case 'C': test_index_array[i] = C_test_index_array[i]; test_rank_array[i] = C_test_rank_array[i]; break; case 'D': test_index_array[i] = D_test_index_array[i]; test_rank_array[i] = D_test_rank_array[i]; break; }; /* Printout initial NPB info */ printf ( "\n\n NAS Parallel Benchmarks (NPB3.3-OMP) - IS Benchmark\n\n" ); printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS, CLASS ); printf( " Iterations: %d\n", MAX_ITERATIONS ); #ifdef _OPENMP printf( " Number of available threads: %d\n", omp_get_max_threads() ); #endif printf( "\n" ); if (timer_on) timer_start( 1 ); /* Generate random number sequence and subsequent keys on all procs */ create_seq( 314159265.00, /* Random number gen seed */ 1220703125.00 ); /* Random number gen mult */ alloc_key_buff(); if (timer_on) timer_stop( 1 ); /* Do one interation for free (i.e., untimed) to guarantee initialization of all data and code pages and respective tables */ rank( 1 ); /* Start verification counter */ passed_verification = 0; if( CLASS != 'S' ) printf( "\n iteration\n" ); /* Start timer */ timer_start( 0 ); /* This is the main iteration */ for( iteration=1; 
iteration<=MAX_ITERATIONS; iteration++ ) { if( CLASS != 'S' ) printf( " %d\n", iteration ); rank( iteration ); } /* End of timing, obtain maximum time of all processors */ timer_stop( 0 ); timecounter = timer_read( 0 ); /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */ if (timer_on) timer_start( 2 ); full_verify(); if (timer_on) timer_stop( 2 ); if (timer_on) timer_stop( 3 ); /* The final printout */ if( passed_verification != 5*MAX_ITERATIONS + 1 ) passed_verification = 0; c_print_results( "IS", CLASS, (int)(TOTAL_KEYS/64), 64, 0, MAX_ITERATIONS, timecounter, ((double) (MAX_ITERATIONS*TOTAL_KEYS)) /timecounter/1000000., "keys ranked", passed_verification, NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC, CFLAGS, CLINKFLAGS ); /* Print additional timers */ if (timer_on) { double t_total, t_percent; t_total = timer_read( 3 ); printf("\nAdditional timers -\n"); printf(" Total execution: %8.3f\n", t_total); if (t_total == 0.0) t_total = 1.0; timecounter = timer_read(1); t_percent = timecounter/t_total * 100.; printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(0); t_percent = timecounter/t_total * 100.; printf(" Benchmarking : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(2); t_percent = timecounter/t_total * 100.; printf(" Sorting : %8.3f (%5.2f%%)\n", timecounter, t_percent); } return 0; /**************************/ } /* E N D P R O G R A M */
int main(int argc, char *argv[]) { /*------------------------------------------------------------------------- c k is the current level. It is passed down through subroutine args c and is NOT global. it is the current iteration c------------------------------------------------------------------------*/ int k, it; double t, tinit, mflops; int nthreads = 1; /*------------------------------------------------------------------------- c These arrays are in common because they are quite large c and probably shouldn''t be allocated on the stack. They c are always passed as subroutine args. c------------------------------------------------------------------------*/ double **u, *v, **r; double a[4], c[4]; double rnm2, rnmu; double epsilon = 1.0e-8; int n1, n2, n3, nit; double verify_value; boolean verified; int i, j, l; FILE *fp; timer_clear(T_BENCH); timer_clear(T_INIT); timer_start(T_INIT); /*---------------------------------------------------------------------- c Read in and broadcast input data c---------------------------------------------------------------------*/ printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version" " - MG Benchmark\n\n"); fp = fopen("mg.input", "r"); if (fp != NULL) { printf(" Reading from input file mg.input\n"); fscanf(fp, "%d", <); while(fgetc(fp) != '\n'); fscanf(fp, "%d%d%d", &nx[lt], &ny[lt], &nz[lt]); while(fgetc(fp) != '\n'); fscanf(fp, "%d", &nit); while(fgetc(fp) != '\n'); for (i = 0; i <= 7; i++) { fscanf(fp, "%d", &debug_vec[i]); } fclose(fp); } else { printf(" No input file. 
Using compiled defaults\n"); lt = LT_DEFAULT; nit = NIT_DEFAULT; nx[lt] = NX_DEFAULT; ny[lt] = NY_DEFAULT; nz[lt] = NZ_DEFAULT; for (i = 0; i <= 7; i++) { debug_vec[i] = DEBUG_DEFAULT; } } if ( (nx[lt] != ny[lt]) || (nx[lt] != nz[lt]) ) { Class = 'U'; } else if( nx[lt] == 32 && nit == 4 ) { Class = 'S'; } else if( nx[lt] == 64 && nit == 40 ) { Class = 'W'; } else if( nx[lt] == 256 && nit == 20 ) { Class = 'B'; } else if( nx[lt] == 512 && nit == 20 ) { Class = 'C'; } else if( nx[lt] == 256 && nit == 4 ) { Class = 'A'; } else { Class = 'U'; } /*-------------------------------------------------------------------- c Use these for debug info: c--------------------------------------------------------------------- c debug_vec(0) = 1 !=> report all norms c debug_vec(1) = 1 !=> some setup information c debug_vec(1) = 2 !=> more setup information c debug_vec(2) = k => at level k or below, show result of resid c debug_vec(3) = k => at level k or below, show result of psinv c debug_vec(4) = k => at level k or below, show result of rprj c debug_vec(5) = k => at level k or below, show result of interp c debug_vec(6) = 1 => (unused) c debug_vec(7) = 1 => (unused) c-------------------------------------------------------------------*/ a[0] = -8.0/3.0; a[1] = 0.0; a[2] = 1.0/6.0; a[3] = 1.0/12.0; if (Class == 'A' || Class == 'S' || Class =='W') { /*-------------------------------------------------------------------- c Coefficients for the S(a) smoother c-------------------------------------------------------------------*/ c[0] = -3.0/8.0; c[1] = 1.0/32.0; c[2] = -1.0/64.0; c[3] = 0.0; } else { /*-------------------------------------------------------------------- c Coefficients for the S(b) smoother c-------------------------------------------------------------------*/ c[0] = -3.0/17.0; c[1] = 1.0/33.0; c[2] = -1.0/61.0; c[3] = 0.0; } lb = 1; setup(&n1,&n2,&n3,lt); /* Allocate the data arrays * 3d arrays are flattened and allocated as a contiguous block * 4d arrays are allocated as 
separate 3d blocks */ u = (double **)malloc((lt+1)*sizeof(double *)); for (l=lt; l >=1; l--) u[l] = (double *)malloc(m3[l]*m2[l]*m1[l]*sizeof(double)); v = (double *)malloc(m3[lt]*m2[lt]*m1[lt]*sizeof(double)); r = (double **)malloc((lt+1)*sizeof(double *)); for (l=lt; l >=1; l--) r[l] = (double *)malloc(m3[l]*m2[l]*m1[l]*sizeof(double)); // Array v can be treated using a standard OpenACC data region #pragma acc data create(v[0:m3[lt]*m2[lt]*m1[lt]]) copyin(a[0:4],c[0:4]) { #ifdef _OPENACC //**************************************************************** /* Now manually deep-create arrays u,r on the GPU using the Cray extended * runtime API, instead of using a data region */ double **acc_u = (double **)cray_acc_create(u,(lt+1)*sizeof(double *)); for (l=lt; l >=1; l--) { double *acc_ul = (double *)cray_acc_create(u[l],m3[l]*m2[l]*m1[l]*sizeof(double)); SET_ACC_PTR(acc_u[l], acc_ul); } double **acc_r = (double **)cray_acc_create(r,(lt+1)*sizeof(double *)); for (l=lt; l >=1; l--) { double *acc_rl = (double *)cray_acc_create(r[l],m3[l]*m2[l]*m1[l]*sizeof(double)); SET_ACC_PTR(acc_r[l], acc_rl); } //**************************************************************** #endif /* _OPENACC */ #pragma omp parallel { zero3(u[lt],n1,n2,n3); } zran3(v,n1,n2,n3,nx[lt],ny[lt],lt); #pragma omp parallel { norm2u3(v,n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]); #pragma omp single { /* printf("\n norms of random v are\n"); printf(" %4d%19.12e%19.12e\n", 0, rnm2, rnmu); printf(" about to evaluate resid, k= %d\n", lt);*/ printf(" Size: %3dx%3dx%3d (class %1c)\n", nx[lt], ny[lt], nz[lt], Class); printf(" Iterations: %3d\n", nit); } resid(u[lt],v,r[lt],n1,n2,n3,a,lt); norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]); /*c--------------------------------------------------------------------- c One iteration for startup c---------------------------------------------------------------------*/ mg3P(u,v,r,a,c,n1,n2,n3,lt); resid(u[lt],v,r[lt],n1,n2,n3,a,lt); #pragma omp single 
setup(&n1,&n2,&n3,lt); zero3(u[lt],n1,n2,n3); } /* pragma omp parallel */ zran3(v,n1,n2,n3,nx[lt],ny[lt],lt); timer_stop(T_INIT); timer_start(T_BENCH); #pragma omp parallel firstprivate(nit) private(it) { resid(u[lt],v,r[lt],n1,n2,n3,a,lt); norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]); for ( it = 1; it <= nit; it++) { mg3P(u,v,r,a,c,n1,n2,n3,lt); resid(u[lt],v,r[lt],n1,n2,n3,a,lt); } norm2u3(r[lt],n1,n2,n3,&rnm2,&rnmu,nx[lt],ny[lt],nz[lt]); #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif } /* pragma omp parallel */ timer_stop(T_BENCH); t = timer_read(T_BENCH); tinit = timer_read(T_INIT); verified = FALSE; verify_value = 0.0; printf(" Initialization time: %15.3f seconds\n", tinit); printf(" Benchmark completed\n"); if (Class != 'U') { if (Class == 'S') { verify_value = 0.530770700573e-04; } else if (Class == 'W') { verify_value = 0.250391406439e-17; /* 40 iterations*/ /* 0.183103168997d-044 iterations*/ } else if (Class == 'A') { verify_value = 0.2433365309e-5; } else if (Class == 'B') { verify_value = 0.180056440132e-5; } else if (Class == 'C') { verify_value = 0.570674826298e-06; } if ( fabs( rnm2 - verify_value ) <= epsilon ) { verified = TRUE; printf(" VERIFICATION SUCCESSFUL\n"); printf(" L2 Norm is %20.12e\n", rnm2); printf(" Error is %20.12e\n", rnm2 - verify_value); } else { verified = FALSE; printf(" VERIFICATION FAILED\n"); printf(" L2 Norm is %20.12e\n", rnm2); printf(" The correct L2 Norm is %20.12e\n", verify_value); } } else { verified = FALSE; printf(" Problem size unknown\n"); printf(" NO VERIFICATION PERFORMED\n"); } if ( t != 0.0 ) { int nn = nx[lt]*ny[lt]*nz[lt]; mflops = 58.*nit*nn*1.0e-6 / t; } else { mflops = 0.0; } c_print_results("MG", Class, nx[lt], ny[lt], nz[lt], nit, nthreads, t, mflops, " floating point", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); // I should probably deep-free the manually deep-created accelerator data here } //acc end data }
int main(int argc, char *argv[]) { int i; int iter; double total_time, mflops; logical verified; char Class; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } //--------------------------------------------------------------------- // Run the entire problem once to make sure all data is touched. // This reduces variable startup costs, which is important for such a // short benchmark. The other NPB 2 implementations are similar. //--------------------------------------------------------------------- for (i = 1; i <= T_max; i++) { timer_clear(i); } setup(); setup_opencl(argc, argv); init_ui(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]); compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]); compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]); fft_init(dims[0]); fft(1, &m_u1, &m_u0); //--------------------------------------------------------------------- // Start over from the beginning. Note that all operations must // be timed, in contrast to other benchmarks. 
//--------------------------------------------------------------------- for (i = 1; i <= T_max; i++) { timer_clear(i); } timer_start(T_total); if (timers_enabled) timer_start(T_setup); DTIMER_START(T_compute_im); compute_indexmap(&m_twiddle, dims[0], dims[1], dims[2]); DTIMER_STOP(T_compute_im); DTIMER_START(T_compute_ics); compute_initial_conditions(&m_u1, dims[0], dims[1], dims[2]); DTIMER_STOP(T_compute_ics); DTIMER_START(T_fft_init); fft_init(dims[0]); DTIMER_STOP(T_fft_init); if (timers_enabled) timer_stop(T_setup); if (timers_enabled) timer_start(T_fft); fft(1, &m_u1, &m_u0); if (timers_enabled) timer_stop(T_fft); for (iter = 1; iter <= niter; iter++) { if (timers_enabled) timer_start(T_evolve); evolve(&m_u0, &m_u1, &m_twiddle, dims[0], dims[1], dims[2]); if (timers_enabled) timer_stop(T_evolve); if (timers_enabled) timer_start(T_fft); fft(-1, &m_u1, &m_u1); if (timers_enabled) timer_stop(T_fft); if (timers_enabled) timer_start(T_checksum); checksum(iter, &m_u1, dims[0], dims[1], dims[2]); if (timers_enabled) timer_stop(T_checksum); } verify(NX, NY, NZ, niter, &verified, &Class); timer_stop(T_total); total_time = timer_read(T_total); if (total_time != 0.0) { mflops = 1.0e-6 * (double)NTOTAL * (14.8157 + 7.19641 * log((double)NTOTAL) + (5.23518 + 7.21113 * log((double)NTOTAL)) * niter) / total_time; } else { mflops = 0.0; } c_print_results("FT", Class, NX, NY, NZ, niter, total_time, mflops, " floating point", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7, clu_GetDeviceTypeName(device_type), device_name); if (timers_enabled) print_timers(); release_opencl(); fflush(stdout); return 0; }
/* c This is the serial version of the APP Benchmark 1, c the "embarassingly parallel" benchmark. c c M is the Log_2 of the number of complex pairs of uniform (0, 1) random c numbers. MK is the Log_2 of the size of each batch of uniform random c numbers. MK can be set for convenience on a given system, since it does c not affect the results. */ int main(int argc, char **argv) { double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc; double dum[3] = { 1.0, 1.0, 1.0 }; int np, ierr, node, no_nodes, i, ik, kk, l, k, nit, ierrcode, no_large_nodes, np_add, k_offset, j; int nthreads = 1; boolean verified; char size[13+1]; /* character*13 */ /* c Because the size of the problem is too large to store in a 32-bit c integer for some classes, we put it into a string (for printing). c Have to strip off the decimal point put in there by the floating c point print statement (internal file) */ printf("\n\n NAS Parallel Benchmarks 3.0 structured OpenMP C version" " - EP Benchmark\n"); sprintf(size, "%12.0f", pow(2.0, M+1)); for (j = 13; j >= 1; j--) { if (size[j] == '.') size[j] = ' '; } printf(" Number of random numbers generated: %13s\n", size); verified = FALSE; /* c Compute the number of "batches" of random number pairs generated c per processor. Adjust if the number of processors does not evenly c divide the total number */ np = NN; /* c Call the random number generator functions and initialize c the x-array to reduce the effects of paging on the timings. c Also, call all mathematical functions that are used. Make c sure these initializations cannot be eliminated as dead code. */ vranlc(0, &(dum[0]), dum[1], &(dum[2])); dum[0] = randlc(&(dum[1]), dum[2]); #pragma omp parallel for default(shared) private(i) for (i = 0; i < 2*NK; i++) x[i] = -1.0e99; Mops = log(sqrt(fabs(max(1.0, 1.0)))); timer_clear(1); timer_clear(2); timer_clear(3); timer_start(1); vranlc(0, &t1, A, x); /* Compute AN = A ^ (2 * NK) (mod 2^46). 
*/ t1 = A; for ( i = 1; i <= MK+1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; gc = 0.0; sx = 0.0; sy = 0.0; for ( i = 0; i <= NQ - 1; i++) { q[i] = 0.0; } /* c Each instance of this loop may be performed independently. We compute c the k offsets separately to take into account the fact that some nodes c have more numbers to generate than others */ k_offset = -1; #pragma omp parallel copyin(x) { double t1, t2, t3, t4, x1, x2; int kk, i, ik, l; double qq[NQ]; /* private copy of q[0:NQ-1] */ for (i = 0; i < NQ; i++) qq[i] = 0.0; #pragma omp for reduction(+:sx,sy) schedule(static) for (k = 1; k <= np; k++) { kk = k_offset + k; t1 = S; t2 = an; /* Find starting seed t1 for this kk. */ for (i = 1; i <= 100; i++) { ik = kk / 2; if (2 * ik != kk) t3 = randlc(&t1, t2); if (ik == 0) break; t3 = randlc(&t2, t2); kk = ik; } /* Compute uniform pseudorandom numbers. */ if (TIMERS_ENABLED == TRUE) timer_start(3); vranlc(2*NK, &t1, A, x-1); if (TIMERS_ENABLED == TRUE) timer_stop(3); /* c Compute Gaussian deviates by acceptance-rejection method and c tally counts in concentric square annuli. This loop is not c vectorizable. 
*/ if (TIMERS_ENABLED == TRUE) timer_start(2); for ( i = 0; i < NK; i++) { x1 = 2.0 * x[2*i] - 1.0; x2 = 2.0 * x[2*i+1] - 1.0; t1 = pow2(x1) + pow2(x2); if (t1 <= 1.0) { t2 = sqrt(-2.0 * log(t1) / t1); t3 = (x1 * t2); /* Xi */ t4 = (x2 * t2); /* Yi */ l = max(fabs(t3), fabs(t4)); qq[l] += 1.0; /* counts */ sx = sx + t3; /* sum of Xi */ sy = sy + t4; /* sum of Yi */ } } if (TIMERS_ENABLED == TRUE) timer_stop(2); } #pragma omp critical { for (i = 0; i <= NQ - 1; i++) q[i] += qq[i]; } #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* end of parallel region */ for (i = 0; i <= NQ-1; i++) { gc = gc + q[i]; } timer_stop(1); tm = timer_read(1); nit = 0; if (M == 24) { if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 25) { if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 28) { if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 30) { if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 32) { if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) && (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) { verified = TRUE; } } Mops = pow(2.0, M+1)/tm/1000000.0; printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. 
Gaussian Pairs = %15.0f\n" "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy); for (i = 0; i <= NQ-1; i++) { printf("%3d %15.0f\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, nthreads, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); if (TIMERS_ENABLED == TRUE) { printf("Total time: %f", timer_read(1)); printf("Gaussian pairs: %f", timer_read(2)); printf("Random numbers: %f", timer_read(3)); } }
int main( int argc, char **argv ) { MPI_Init(&argc,&argv); INT_TYPE chunk; int ini, fim; int i, j, iteration, timer_on; double timecounter; FILE *fp; int myrank; MPI_Status st; MPI_Comm_rank(MPI_COMM_WORLD,&myrank); MPI_Comm_size(MPI_COMM_WORLD,&NUM_THREADS); if (myrank == 0) { /* Initialize timers */ timer_on = 0; if ((fp = fopen("timer.flag", "r")) != NULL) { fclose(fp); timer_on = 1; } timer_clear( 0 ); if (timer_on) { timer_clear( 1 ); timer_clear( 2 ); timer_clear( 3 ); } if (timer_on) timer_start( 3 ); /* Initialize the verification arrays if a valid class */ for( i=0; i<TEST_ARRAY_SIZE; i++ ) switch( CLASS ) { case 'S': test_index_array[i] = S_test_index_array[i]; test_rank_array[i] = S_test_rank_array[i]; break; case 'A': test_index_array[i] = A_test_index_array[i]; test_rank_array[i] = A_test_rank_array[i]; break; case 'W': test_index_array[i] = W_test_index_array[i]; test_rank_array[i] = W_test_rank_array[i]; break; case 'B': test_index_array[i] = B_test_index_array[i]; test_rank_array[i] = B_test_rank_array[i]; break; case 'C': test_index_array[i] = C_test_index_array[i]; test_rank_array[i] = C_test_rank_array[i]; break; case 'D': test_index_array[i] = D_test_index_array[i]; test_rank_array[i] = D_test_rank_array[i]; break; }; /* Printout initial NPB info */ printf ( "\n\n NAS Parallel Benchmarks (NPB3.3-SER) - IS Benchmark\n\n" ); printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS, CLASS ); printf( " Number of available threads: %d\n", NUM_THREADS ); printf( " Iterations: %d\n", MAX_ITERATIONS ); if (timer_on) timer_start( 1 ); } R23 = pow(2, -23); T23 = pow(2, 23); R46 = pow(2, -46); T46 = pow(2, 46); /* Generate random number sequence and subsequent keys on all procs */ create_seq(myrank); if (myrank == 0) { // sincronizar resultados for (i = 1; i < NUM_THREADS; i++) { chunk = (NUM_KEYS + NUM_THREADS - 1) / NUM_THREADS; ini = chunk * i; fim = ini + chunk; if ( fim > NUM_KEYS ) { fim = NUM_KEYS; } MPI_Recv( &aux_key_array[ini], (fim - ini), MPI_INT, 
i, 0, MPI_COMM_WORLD, &st ); for (j = ini; j < fim; j++) { key_array[j] = aux_key_array[j]; } } } else { chunk = (NUM_KEYS + NUM_THREADS - 1) / NUM_THREADS; ini = chunk * myrank; fim = ini + chunk; if ( fim > NUM_KEYS ) { fim = NUM_KEYS; } // enviar resultados MPI_Send( &key_array[ini], (fim - ini), MPI_INT, 0, 0, MPI_COMM_WORLD ); } if (myrank == 0) { if (timer_on) { timer_stop( 1 ); } /* Do one interation for free (i.e., untimed) to guarantee initialization of all data and code pages and respective tables */ rank( 1 ); /* Start verification counter */ passed_verification = 0; if( CLASS != 'S' ) printf( "\n iteration\n" ); /* Start timer */ timer_start( 0 ); /* This is the main iteration */ for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) { if( CLASS != 'S' ) printf( " %d\n", iteration ); rank( iteration ); } /* End of timing, obtain maximum time of all processors */ timer_stop( 0 ); timecounter = timer_read( 0 ); /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */ if (timer_on) timer_start( 2 ); full_verify(); if (timer_on) timer_stop( 2 ); if (timer_on) timer_stop( 3 ); /* The final printout */ if( passed_verification != 5*MAX_ITERATIONS + 1 ) passed_verification = 0; c_print_results( "IS", CLASS, (int)(TOTAL_KEYS/64), 64, 0, MAX_ITERATIONS, timecounter, ((double) (MAX_ITERATIONS*TOTAL_KEYS)) /timecounter/1000000., "keys ranked", passed_verification, NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC, CFLAGS, CLINKFLAGS ); /* Print additional timers */ if (timer_on) { double t_total, t_percent; t_total = timer_read( 3 ); printf("\nAdditional timers -\n"); printf(" Total execution: %8.3f\n", t_total); if (t_total == 0.0) t_total = 1.0; timecounter = timer_read(1); t_percent = timecounter/t_total * 100.; printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(0); t_percent = timecounter/t_total * 100.; printf(" Benchmarking : %8.3f (%5.2f%%)\n", 
timecounter, t_percent); timecounter = timer_read(2); t_percent = timecounter/t_total * 100.; printf(" Sorting : %8.3f (%5.2f%%)\n", timecounter, t_percent); } } MPI_Finalize(); return 0; /**************************/ } /* E N D P R O G R A M */
int main( int argc, char **argv ) { int i, iteration, itemp; double timecounter, maxtime; /* Initialize MPI */ MPI_Init( &argc, &argv ); MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); MPI_Comm_size( MPI_COMM_WORLD, &comm_size ); /* Initialize the verification arrays if a valid class */ for( i=0; i<TEST_ARRAY_SIZE; i++ ) switch( CLASS ) { case 'S': test_index_array[i] = S_test_index_array[i]; test_rank_array[i] = S_test_rank_array[i]; break; case 'A': test_index_array[i] = A_test_index_array[i]; test_rank_array[i] = A_test_rank_array[i]; break; case 'W': test_index_array[i] = W_test_index_array[i]; test_rank_array[i] = W_test_rank_array[i]; break; case 'B': test_index_array[i] = B_test_index_array[i]; test_rank_array[i] = B_test_rank_array[i]; break; case 'C': test_index_array[i] = C_test_index_array[i]; test_rank_array[i] = C_test_rank_array[i]; break; case 'D': test_index_array[i] = D_test_index_array[i]; test_rank_array[i] = D_test_rank_array[i]; break; }; /* Printout initial NPB info */ if( my_rank == 0 ) { FILE *fp; printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" ); printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS ); printf( " Iterations: %d\n", MAX_ITERATIONS ); printf( " Number of processes: %d\n", comm_size ); fp = fopen("timer.flag", "r"); timeron = 0; if (fp) { timeron = 1; fclose(fp); } } /* Check that actual and compiled number of processors agree */ if( comm_size != NUM_PROCS ) { if( my_rank == 0 ) printf( "\n ERROR: compiled for %d processes\n" " Number of active processes: %d\n" " Exiting program!\n\n", NUM_PROCS, comm_size ); MPI_Finalize(); exit( 1 ); } /* Check to see whether total number of processes is within bounds. 
This could in principle be checked in setparams.c, but it is more convenient to do it here */ if( comm_size < MIN_PROCS || comm_size > MAX_PROCS) { if( my_rank == 0 ) printf( "\n ERROR: number of processes %d not within range %d-%d" "\n Exiting program!\n\n", comm_size, MIN_PROCS, MAX_PROCS); MPI_Finalize(); exit( 1 ); } MPI_Bcast(&timeron, 1, MPI_INT, 0, MPI_COMM_WORLD); #ifdef TIMING_ENABLED for( i=1; i<=T_LAST; i++ ) timer_clear( i ); #endif /* Generate random number sequence and subsequent keys on all procs */ create_seq( find_my_seed( my_rank, comm_size, 4*(long)TOTAL_KEYS*MIN_PROCS, 314159265.00, /* Random number gen seed */ 1220703125.00 ), /* Random number gen mult */ 1220703125.00 ); /* Random number gen mult */ /* Do one interation for free (i.e., untimed) to guarantee initialization of all data and code pages and respective tables */ rank( 1 ); /* Start verification counter */ passed_verification = 0; if( my_rank == 0 && CLASS != 'S' ) printf( "\n iteration\n" ); /* Initialize timer */ timer_clear( 0 ); /* Initialize separate communication, computation timing */ #ifdef TIMING_ENABLED for( i=1; i<=T_LAST; i++ ) timer_clear( i ); #endif /* Start timer */ timer_start( 0 ); /* This is the main iteration */ for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) { if( my_rank == 0 && CLASS != 'S' ) printf( " %d\n", iteration ); rank( iteration ); } /* Stop timer, obtain time for processors */ timer_stop( 0 ); timecounter = timer_read( 0 ); /* End of timing, obtain maximum time of all processors */ MPI_Reduce( &timecounter, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD ); /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */ full_verify(); /* Obtain verification counter sum */ itemp = passed_verification; MPI_Reduce( &itemp, &passed_verification, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD ); /* The final printout */ if( my_rank == 0 ) { if( passed_verification != 5*MAX_ITERATIONS + comm_size ) 
passed_verification = 0; c_print_results( "IS", CLASS, (int)(TOTAL_KEYS), MIN_PROCS, 0, MAX_ITERATIONS, NUM_PROCS, comm_size, maxtime, ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS) /maxtime/1000000., "keys ranked", passed_verification, NPBVERSION, COMPILETIME, MPICC, CLINK, CMPI_LIB, CMPI_INC, CFLAGS, CLINKFLAGS ); } #ifdef TIMING_ENABLED if (timeron) { double t1[T_LAST+1], tmin[T_LAST+1], tsum[T_LAST+1], tmax[T_LAST+1]; char t_recs[T_LAST+1][9]; for( i=0; i<=T_LAST; i++ ) t1[i] = timer_read( i ); MPI_Reduce( t1, tmin, T_LAST+1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD ); MPI_Reduce( t1, tsum, T_LAST+1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD ); MPI_Reduce( t1, tmax, T_LAST+1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD ); if( my_rank == 0 ) { strcpy( t_recs[T_TOTAL], "total" ); strcpy( t_recs[T_RANK], "rcomp" ); strcpy( t_recs[T_RCOMM], "rcomm" ); strcpy( t_recs[T_VERIFY], "verify"); printf( " nprocs = %6d ", comm_size); printf( " minimum maximum average\n" ); for( i=0; i<=T_LAST; i++ ) { printf( " timer %2d (%-8s): %10.4f %10.4f %10.4f\n", i+1, t_recs[i], tmin[i], tmax[i], tsum[i]/((double) comm_size) ); } printf( "\n" ); } } #endif MPI_Finalize(); return 0; /**************************/ } /* E N D P R O G R A M */
int main(int argc, char** argv ) { int i, iteration, itemp; int nthreads = 1; double timecounter, maxtime; /* Initialize the verification arrays if a valid class */ for( i=0; i<TEST_ARRAY_SIZE; i++ ) switch( CLASS ) { case 'S': test_index_array[i] = S_test_index_array[i]; test_rank_array[i] = S_test_rank_array[i]; break; case 'A': test_index_array[i] = A_test_index_array[i]; test_rank_array[i] = A_test_rank_array[i]; break; case 'W': test_index_array[i] = W_test_index_array[i]; test_rank_array[i] = W_test_rank_array[i]; break; case 'B': test_index_array[i] = B_test_index_array[i]; test_rank_array[i] = B_test_rank_array[i]; break; case 'C': test_index_array[i] = C_test_index_array[i]; test_rank_array[i] = C_test_rank_array[i]; break; }; /* Printout initial NPB info */ printf( "\n\n NAS Parallel Benchmarks 2.3 OpenMP C version" " - IS Benchmark\n\n" ); printf( " Size: %d (class %c)\n", TOTAL_KEYS, CLASS ); printf( " Iterations: %d\n", MAX_ITERATIONS ); /* Initialize timer */ timer_clear( 0 ); /* Generate random number sequence and subsequent keys on all procs */ create_seq( 314159265.00, /* Random number gen seed */ 1220703125.00 ); /* Random number gen mult */ /* Do one interation for free (i.e., untimed) to guarantee initialization of all data and code pages and respective tables */ #pragma omp parallel rank( 1 ); /* Start verification counter */ passed_verification = 0; if( CLASS != 'S' ) printf( "\n iteration\n" ); /* Start timer */ timer_start( 0 ); /* This is the main iteration */ #pragma omp parallel private(iteration) for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) { #pragma omp master if( CLASS != 'S' ) printf( " %d\n", iteration ); rank( iteration ); #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* End of timing, obtain maximum time of all processors */ timer_stop( 0 ); timecounter = timer_read( 0 ); /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an 
untimed operation */ full_verify(); /* The final printout */ if( passed_verification != 5*MAX_ITERATIONS + 1 ) passed_verification = 0; c_print_results( "IS", CLASS, TOTAL_KEYS, 0, 0, MAX_ITERATIONS, nthreads, timecounter, ((double) (MAX_ITERATIONS*TOTAL_KEYS)) /timecounter/1000000., "keys ranked", passed_verification, NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC, CFLAGS, CLINKFLAGS, "randlc2"); return 0; /**************************/ } /* E N D P R O G R A M */
int main(int argc,char **argv ){ int my_rank,comm_size; int i; DGraph *dg=NULL; int verified=0, featnum=0; double bytes_sent=2.0,tot_time=0.0; MPI_Init( &argc, &argv ); MPI_Comm_rank( MPI_COMM_WORLD, &my_rank ); MPI_Comm_size( MPI_COMM_WORLD, &comm_size ); TRACE_smpi_set_category ("begin"); if(argc!=2|| ( strncmp(argv[1],"BH",2)!=0 &&strncmp(argv[1],"WH",2)!=0 &&strncmp(argv[1],"SH",2)!=0 ) ){ if(my_rank==0){ fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n"); fprintf(stderr,"** Where \n - N is integer number of MPI processes\n"); fprintf(stderr," - S is the class S, W, or A \n"); fprintf(stderr," - GraphName is the communication graph name BH, WH, or SH.\n"); fprintf(stderr," - the number of MPI processes N should not be be less than \n"); fprintf(stderr," the number of nodes in the graph\n"); } MPI_Finalize(); exit(0); } if(strncmp(argv[1],"BH",2)==0){ dg=buildBH(CLASS); }else if(strncmp(argv[1],"WH",2)==0){ dg=buildWH(CLASS); }else if(strncmp(argv[1],"SH",2)==0){ dg=buildSH(CLASS); } if(timer_on&&dg->numNodes+1>timers_tot){ timer_on=0; if(my_rank==0) fprintf(stderr,"Not enough timers. Node timeing is off. 
\n"); } if(dg->numNodes>comm_size){ if(my_rank==0){ fprintf(stderr,"** The number of MPI processes should not be less than \n"); fprintf(stderr,"** the number of nodes in the graph\n"); fprintf(stderr,"** Number of MPI processes = %d\n",comm_size); fprintf(stderr,"** Number nodes in the graph = %d\n",dg->numNodes); } MPI_Finalize(); exit(0); } for(i=0;i<dg->numNodes;i++){ dg->node[i]->address=i; } if( my_rank == 0 ){ printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" ); graphShow(dg,0); timer_clear(0); timer_start(0); } verified=ProcessNodes(dg,my_rank); TRACE_smpi_set_category ("end"); featnum=NUM_SAMPLES*fielddim; bytes_sent=featnum*dg->numArcs; bytes_sent/=1048576; if(my_rank==0){ timer_stop(0); tot_time=timer_read(0); c_print_results( dg->name, CLASS, featnum, 0, 0, dg->numNodes, 0, comm_size, tot_time, bytes_sent/tot_time, "bytes transmitted", verified, NPBVERSION, COMPILETIME, MPICC, CLINK, CMPI_LIB, CMPI_INC, CFLAGS, CLINKFLAGS ); } MPI_Finalize(); return 1; }
int main(int argc, char *argv[]) { char Class; logical verified; double mflops; double t, tmax, trecs[t_last+1]; int i; char *t_names[t_last+1]; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } //--------------------------------------------------------------------- // Setup info for timers //--------------------------------------------------------------------- FILE *fp; if ((fp = fopen("timer.flag", "r")) != NULL) { timeron = true; t_names[t_total] = "total"; t_names[t_rhsx] = "rhsx"; t_names[t_rhsy] = "rhsy"; t_names[t_rhsz] = "rhsz"; t_names[t_rhs] = "rhs"; t_names[t_jacld] = "jacld"; t_names[t_blts] = "blts"; t_names[t_jacu] = "jacu"; t_names[t_buts] = "buts"; t_names[t_add] = "add"; t_names[t_l2norm] = "l2norm"; t_names[t_setbv] = "setbv"; t_names[t_setiv] = "setiv"; t_names[t_erhs] = "erhs"; t_names[t_error] = "error"; t_names[t_pintgr] = "pintgr"; t_names[t_blts1] = "blts1"; t_names[t_buts1] = "buts1"; fclose(fp); } else { timeron = false; } //--------------------------------------------------------------------- // read input data //--------------------------------------------------------------------- read_input(); //--------------------------------------------------------------------- // set up domain sizes //--------------------------------------------------------------------- domain(); //--------------------------------------------------------------------- // set up OpenCL environment //--------------------------------------------------------------------- setup_opencl(argc, argv); //--------------------------------------------------------------------- // set up coefficients //--------------------------------------------------------------------- setcoeff(); //--------------------------------------------------------------------- // set the boundary values for dependent variables //--------------------------------------------------------------------- setbv(); 
//--------------------------------------------------------------------- // set the initial values for dependent variables //--------------------------------------------------------------------- setiv(); //--------------------------------------------------------------------- // compute the forcing term based on prescribed exact solution //--------------------------------------------------------------------- erhs(); //--------------------------------------------------------------------- // perform one SSOR iteration to touch all data pages //--------------------------------------------------------------------- ssor(1); //--------------------------------------------------------------------- // reset the boundary and initial values //--------------------------------------------------------------------- setbv(); setiv(); //--------------------------------------------------------------------- // perform the SSOR iterations //--------------------------------------------------------------------- ssor(itmax); //--------------------------------------------------------------------- // compute the solution error //--------------------------------------------------------------------- error(); //--------------------------------------------------------------------- // compute the surface integral //--------------------------------------------------------------------- pintgr(); //--------------------------------------------------------------------- // verification test //--------------------------------------------------------------------- verify ( rsdnm, errnm, frc, &Class, &verified ); mflops = (double)itmax * (1984.77 * (double)nx0 * (double)ny0 * (double)nz0 - 10923.3 * pow(((double)(nx0+ny0+nz0)/3.0), 2.0) + 27770.9 * (double)(nx0+ny0+nz0)/3.0 - 144010.0) / (maxtime*1000000.0); c_print_results("LU", Class, nx0, ny0, nz0, itmax, maxtime, mflops, " floating point", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, "(none)", clu_GetDeviceTypeName(device_type), 
device_name); //--------------------------------------------------------------------- // More timers //--------------------------------------------------------------------- if (timeron) { for (i = 1; i <= t_last; i++) { trecs[i] = timer_read(i); } tmax = maxtime; if (tmax == 0.0) tmax = 1.0; printf(" SECTION Time (secs)\n"); for (i = 1; i <= t_last; i++) { printf(" %-8s:%9.4f (%6.2f%%)\n", t_names[i], trecs[i], trecs[i]*100./tmax); if (i == t_rhs) { t = trecs[t_rhsx] + trecs[t_rhsy] + trecs[t_rhsz]; printf(" --> %8s:%9.3f (%6.2f%%)\n", "sub-rhs", t, t*100./tmax); t = trecs[i] - t; printf(" --> %8s:%9.3f (%6.2f%%)\n", "rest-rhs", t, t*100./tmax); } } } release_opencl(); fflush(stdout); return 0; }
/* c This is the serial version of the APP Benchmark 1, c the "embarassingly parallel" benchmark. c c M is the Log_2 of the number of complex pairs of uniform (0, 1) random c numbers. MK is the Log_2 of the size of each batch of uniform random c numbers. MK can be set for convenience on a given system, since it does c not affect the results. */ int main(int argc, char **argv) { double *x, **xx, *q, **qq; double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc; double dum[3] = { 1.0, 1.0, 1.0 }; const int TRANSFER_X = 1; int np, nn, ierr, node, no_nodes, i, l, k, nit, ierrcode, no_large_nodes, np_add, k_offset, j; double loc_x,loc_t1,loc_t2,loc_t3,loc_t4; double loc_a1,loc_a2,loc_x1,loc_x2,loc_z; boolean verified; char size[13+1]; /* character*13 */ /* Allocate working memory */ x = (double*) malloc(sizeof(double) * 2*NK); xx = (double**) malloc(sizeof(double*) * NN); xx[0] = (double*) malloc(sizeof(double) * NN * 2*NK); for (i = 1; i < NN; i++) xx[i] = xx[i-1] + (2*NK); q = (double*) malloc(sizeof(double) * NQ); qq = (double**) malloc(sizeof(double*) * NN); qq[0] = (double*) malloc(sizeof(double) * NN * NQ); for (i = 1; i < NN; i++) qq[i] = qq[i-1] + NQ; /* c Because the size of the problem is too large to store in a 32-bit c integer for some classes, we put it into a string (for printing). c Have to strip off the decimal point put in there by the floating c point print statement (internal file) */ printf("\n\n NAS Parallel Benchmarks 2.3 OpenACC C version" " - EP Benchmark\n"); sprintf(size, "%12.0f", pow(2.0, M+1)); for (j = 13; j >= 1; j--) { if (size[j] == '.') size[j] = ' '; } printf(" Number of random numbers generated: %13s\n", size); verified = FALSE; /* c Compute the number of "batches" of random number pairs generated c per processor. Adjust if the number of processors does not evenly c divide the total number */ np = NN; /* c Call the random number generator functions and initialize c the x-array to reduce the effects of paging on the timings. 
c Also, call all mathematical functions that are used. Make c sure these initializations cannot be eliminated as dead code. */ #pragma acc data create(qq[0:NN][0:NQ],x[0:2*NK],xx[0:NN][0:2*NK]) \ copyout(q[0:NQ]) { vranlc(0, &(dum[0]), dum[1], &(dum[2])); dum[0] = randlc(&(dum[1]), dum[2]); for (i = 0; i < 2*NK; i++) x[i] = -1.0e99; Mops = log(sqrt(fabs(max(1.0, 1.0)))); timer_clear(1); timer_clear(2); timer_clear(3); timer_start(1); vranlc(0, &t1, A, x); #pragma acc update device(x[0:2*NK]) /* Compute AN = A ^ (2 * NK) (mod 2^46). */ t1 = A; for ( i = 1; i <= MK+1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; gc = 0.0; sx = 0.0; sy = 0.0; #pragma acc parallel loop for (k = 0; k < np; k++) { /* Initialize private q (qq) */ #pragma acc loop for (i = 0; i < NQ; i++) qq[k][i] = 0.0; /* Initialize private x (xx) */ #pragma acc loop for (i = 0; i < 2*NK; i++) xx[k][i] = x[i]; } /* c Each instance of this loop may be performed independently. We compute c the k offsets separately to take into account the fact that some nodes c have more numbers to generate than others */ k_offset = -1; double t1, t2, t3, t4, x1, x2; int kk, i, ik, l; double psx, psy; #pragma acc parallel loop reduction(+:sx,sy) for (k = 1; k <= np; k++) { kk = k_offset + k; t1 = S; t2 = an; /* Find starting seed t1 for this kk. */ #pragma acc loop seq for (i = 1; i <= 100; i++) { ik = kk / 2; if (2 * ik != kk) t3 = RANDLC(&t1, t2); if (ik == 0) break; t3 = RANDLC(&t2, t2); kk = ik; } /* Compute uniform pseudorandom numbers. 
*/ loc_t1 = r23 * A; loc_a1 = (int)loc_t1; loc_a2 = A - t23 * loc_a1; loc_x = t1; #pragma acc loop seq for (i = 1; i <= 2*NK; i++) { loc_t1 = r23 * loc_x; loc_x1 = (int)loc_t1; loc_x2 = loc_x - t23 * loc_x1; loc_t1 = loc_a1 * loc_x2 + loc_a2 * loc_x1; loc_t2 = (int)(r23 * loc_t1); loc_z = loc_t1 - t23 * loc_t2; loc_t3 = t23 * loc_z + loc_a2 * loc_x2; loc_t4 = (int)(r46 * loc_t3); loc_x = loc_t3 - t46 * loc_t4; xx[k-1][i-1] = r46 * loc_x; } t1 = loc_x; /* c Compute Gaussian deviates by acceptance-rejection method and c tally counts in concentric square annuli. This loop is not c vectorizable. */ psx = psy = 0.0; #pragma acc loop reduction(+:psx,psy) for ( i = 0; i < NK; i++) { x1 = 2.0 * xx[k-1][2*i] - 1.0; x2 = 2.0 * xx[k-1][2*i+1] - 1.0; t1 = pow2(x1) + pow2(x2); if (t1 <= 1.0) { t2 = sqrt(-2.0 * log(t1) / t1); t3 = (x1 * t2); /* Xi */ t4 = (x2 * t2); /* Yi */ l = max(fabs(t3), fabs(t4)); qq[k-1][l] += 1.0; /* counts */ psx = psx + t3; /* sum of Xi */ psy = psy + t4; /* sum of Yi */ } } sx += psx; sy += psy; } /* Reduce private qq to q */ #pragma acc parallel loop reduction(+:gc) for ( i = 0; i < NQ; i++ ) { double sumq = 0.0; #pragma acc loop reduction(+:sumq) for (k = 0; k < np; k++) sumq = sumq + qq[k][i]; q[i] = sumq; gc += sumq; } } /* end acc data */ timer_stop(1); tm = timer_read(1); nit = 0; if (M == 24) { if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) && (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 25) { if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) && (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 28) { if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) && (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 30) { if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) && (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) { verified = TRUE; } } else if (M == 32) { if ((fabs((sx- (4.764367927995374e4))/sx) <= 
EPSILON) && (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) { verified = TRUE; } } Mops = pow(2.0, M+1)/tm/1000000.0; printf("EP Benchmark Results: \n" "CPU Time = %10.4f\n" "N = 2^%5d\n" "No. Gaussian Pairs = %15.0f\n" "Sums = %25.15e %25.15e\n" "Counts:\n", tm, M, gc, sx, sy); for (i = 0; i <= NQ-1; i++) { printf("%3d %15.0f\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); return 0; }
int main( int argc, char **argv ) { int i, iteration; double timecounter; FILE *fp; cl_int ecode; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } /* Initialize timers */ timer_on = 0; if ((fp = fopen("timer.flag", "r")) != NULL) { fclose(fp); timer_on = 1; } timer_clear( 0 ); if (timer_on) { timer_clear( 1 ); timer_clear( 2 ); timer_clear( 3 ); } if (timer_on) timer_start( 3 ); /* Initialize the verification arrays if a valid class */ for( i=0; i<TEST_ARRAY_SIZE; i++ ) switch( CLASS ) { case 'S': test_index_array[i] = S_test_index_array[i]; test_rank_array[i] = S_test_rank_array[i]; break; case 'A': test_index_array[i] = A_test_index_array[i]; test_rank_array[i] = A_test_rank_array[i]; break; case 'W': test_index_array[i] = W_test_index_array[i]; test_rank_array[i] = W_test_rank_array[i]; break; case 'B': test_index_array[i] = B_test_index_array[i]; test_rank_array[i] = B_test_rank_array[i]; break; case 'C': test_index_array[i] = C_test_index_array[i]; test_rank_array[i] = C_test_rank_array[i]; break; case 'D': test_index_array[i] = D_test_index_array[i]; test_rank_array[i] = D_test_rank_array[i]; break; }; /* set up the OpenCL environment. 
*/ setup_opencl(argc, argv); /* Printout initial NPB info */ printf( "\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - IS Benchmark\n\n" ); printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS, CLASS ); printf( " Iterations: %d\n", MAX_ITERATIONS ); if (timer_on) timer_start( 1 ); /* Generate random number sequence and subsequent keys on all procs */ create_seq( 314159265.00, /* Random number gen seed */ 1220703125.00 ); /* Random number gen mult */ if (timer_on) timer_stop( 1 ); /* Do one interation for free (i.e., untimed) to guarantee initialization of all data and code pages and respective tables */ rank( 1 ); /* Start verification counter */ passed_verification = 0; DTIMER_START(T_BUFFER_WRITE); ecode = clEnqueueWriteBuffer(cmd_queue, m_passed_verification, CL_TRUE, 0, sizeof(cl_int), &passed_verification, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_passed_verification"); DTIMER_STOP(T_BUFFER_WRITE); if( CLASS != 'S' ) printf( "\n iteration\n" ); /* Start timer */ timer_start( 0 ); /* This is the main iteration */ for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) { if( CLASS != 'S' ) printf( " %d\n", iteration ); rank( iteration ); } DTIMER_START(T_BUFFER_READ); ecode = clEnqueueReadBuffer(cmd_queue, m_passed_verification, CL_TRUE, 0, sizeof(cl_int), &passed_verification, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueReadBuffer() for m_passed_verification"); DTIMER_STOP(T_BUFFER_READ); /* End of timing, obtain maximum time of all processors */ timer_stop( 0 ); timecounter = timer_read( 0 ); /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */ if (timer_on) timer_start( 2 ); full_verify(); if (timer_on) timer_stop( 2 ); if (timer_on) timer_stop( 3 ); /* The final printout */ if( passed_verification != 5*MAX_ITERATIONS + 1 ) passed_verification = 0; c_print_results( "IS", CLASS, (int)(TOTAL_KEYS/64), 64, 0, MAX_ITERATIONS, timecounter, ((double) 
(MAX_ITERATIONS*TOTAL_KEYS)) /timecounter/1000000., "keys ranked", passed_verification, NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC, CFLAGS, CLINKFLAGS, "", clu_GetDeviceTypeName(device_type), device_name); /* Print additional timers */ if (timer_on) { double t_total, t_percent; t_total = timer_read( 3 ); printf("\nAdditional timers -\n"); printf(" Total execution: %8.3f\n", t_total); if (t_total == 0.0) t_total = 1.0; timecounter = timer_read(1); t_percent = timecounter/t_total * 100.; printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(0); t_percent = timecounter/t_total * 100.; printf(" Benchmarking : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(2); t_percent = timecounter/t_total * 100.; printf(" Sorting : %8.3f (%5.2f%%)\n", timecounter, t_percent); } release_opencl(); fflush(stdout); return 0; /**************************/ } /* E N D P R O G R A M */
/*
 * EP benchmark driver as emitted by an OpenMP source-to-source translator:
 * the `# NN "ep.c"` line markers map each statement back to ep.c, and the
 * parallel region is handed to _ompc_do_parallel() as __ompc_func_3.
 *
 * The problem size is folded in as literals: the exponent 28 appears where
 * the verification chain compares it against 24/25/28/30/32, so only the
 * 28 branch can be taken.  x and q are not declared in this function.
 *
 * There is no return statement and the local `status` is never used.
 */
int main (int argc, char **argv)
{
  //auto double *_ppthd_x;
  auto double Mops;
  auto double t1;
  auto double t2;
  auto double t3;
  auto double t4;
  auto double x1;
  auto double x2;
  auto double sx;
  auto double sy;
  auto double tm;
  auto double an;
  auto double tt;
  auto double gc;
  auto double dum[3];
  auto int np;
  auto int ierr;
  auto int node;
  auto int no_nodes;
  auto int i;
  auto int ik;
  auto int kk;
  auto int l;
  auto int k;
  auto int nit;
  auto int ierrcode;
  auto int no_large_nodes;
  auto int np_add;
  auto int k_offset;
  auto int j;
  auto int nthreads;
  auto int verified;
  auto char size[14];
  int status = 0;

  /* Runtime start-up call; receives the program arguments. */
  _ompc_init(argc,argv);
  //(_ppthd_x) = (((double *) (_ompc_get_thdprv (&_thdprv_x, 1048576, x))));

  (*(dum)) = (1.0);
  (*((dum) + (1))) = (1.0);
  (*((dum) + (2))) = (1.0);
  (nthreads) = (1);

# 84 "ep.c"
  printf ("\012\012 NAS Parallel Benchmarks 2.3 OpenMP C version - EP Benchmark\012");

  /* Format 2^(28+1) as text, then replace any '.' (46) by a blank (32). */
# 86 "ep.c"
  sprintf (size, "%12.0f", pow (2.0, (28) + (1)));
  /* NOTE(review): the scan starts at size[13], which sprintf does not write
     when the formatted text is exactly 12 characters long. */
# 87 "ep.c"
  for ((j) = (13); (j) >= (1); (j)--)
    {
# 88 "ep.c"
      if ((((int) (*((size) + (j))))) == (46))
        {
          (*((size) + (j))) = (((char) (32)));
        }
    }
# 90 "ep.c"
  printf (" Number of random numbers generated: %13s\012", size);

# 92 "ep.c"
  (verified) = (0);

  /* Batch count: 1 << (28 - 16). */
# 99 "ep.c"
  (np) = ((1) << ((28) - (16)));

  /* Warm-up calls of the random number routines and the math functions. */
# 107 "ep.c"
  vranlc (0, (dum) + (0), *((dum) + (1)), (dum) + (2));
# 108 "ep.c"
  (*((dum) + (0))) = (randlc ((dum) + (1), *((dum) + (2))));
# 109 "ep.c"
  for ((i) = (0); (i) < ((2) * ((1) << (16))); (i)++)
    {
      x[i] = (-(1.0E99));
      //(*((_ppthd_x) + (i))) = (-(1.0E99));
    }
# 110 "ep.c"
  (Mops) = (log (sqrt (fabs (((1.0) > (1.0)) ? (1.0) : (1.0)))));

  /* Timer 1 covers everything up to the timer_stop (1) below. */
# 112 "ep.c"
  timer_clear (1);
# 113 "ep.c"
  timer_clear (2);
# 114 "ep.c"
  timer_clear (3);
# 115 "ep.c"
  timer_start (1);

# 117 "ep.c"
  vranlc (0, &(t1), 1.220703125E9, x);
  //vranlc (0, &(t1), 1.220703125E9, _ppthd_x);

  /* Apply randlc(&t1, t1) 16+1 times starting from 1.220703125E9; the
     resulting t1 is kept in an. */
# 121 "ep.c"
  (t1) = (1.220703125E9);

# 123 "ep.c"
  for ((i) = (1); (i) <= ((16) + (1)); (i)++)
    {
# 124 "ep.c"
      (t2) = (randlc (&(t1), t1));
    }

# 127 "ep.c"
  (an) = (t1);
# 128 "ep.c"
  (tt) = (2.71828183E8);
# 129 "ep.c"
  (gc) = (0.0);
# 130 "ep.c"
  (sx) = (0.0);
# 131 "ep.c"
  (sy) = (0.0);

  /* Clear the 10 counters in q. */
# 133 "ep.c"
  for ((i) = (0); (i) <= ((10) - (1)); (i)++)
    {
# 134 "ep.c"
      (*((q) + (i))) = (0.0);
    }

# 142 "ep.c"
  (k_offset) = (-(1));

  /* Pack the addresses of the variables used by the parallel region and
     hand them, together with the region function, to the runtime. */
  {
    auto void *__ompc_argv[6];
    (*(__ompc_argv)) = (((void *) (&sx)));
    (*((__ompc_argv) + (1))) = (((void *) (&sy)));
    (*((__ompc_argv) + (2))) = (((void *) (&np)));
    (*((__ompc_argv) + (3))) = (((void *) (&k_offset)));
    (*((__ompc_argv) + (4))) = (((void *) (&an)));
    (*((__ompc_argv) + (5))) = (((void *) (&nthreads)));
    _ompc_do_parallel (__ompc_func_3, __ompc_argv);
  }

  /* gc is the sum of the 10 counters. */
# 207 "ep.c"
  for ((i) = (0); (i) <= ((10) - (1)); (i)++)
    {
# 208 "ep.c"
      (gc) = ((gc) + (*((q) + (i))));
    }

# 211 "ep.c"
  timer_stop (1);
# 212 "ep.c"
  (tm) = (timer_read (1));

  /* Verification: relative error of sx and sy against the reference pair
     of the matching size, with tolerance 1.0E-8. */
# 214 "ep.c"
  (nit) = (0);
# 215 "ep.c"
  if ((28) == (24))
    {
# 216 "ep.c"
      if (((fabs (((sx) - (-(3247.83465203474))) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(6958.407078382297))) / (sy))) <= (1.0E-8)))
        {
# 218 "ep.c"
          (verified) = (1);
        }
    }
  else
# 220 "ep.c"
  if ((28) == (25))
    {
# 221 "ep.c"
      if (((fabs (((sx) - (-(2863.319731645753))) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(6320.053679109499))) / (sy))) <= (1.0E-8)))
        {
# 223 "ep.c"
          (verified) = (1);
        }
    }
  else
# 225 "ep.c"
  if ((28) == (28))
    {
# 226 "ep.c"
      if (((fabs (((sx) - (-(4295.875165629892))) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(15807.32573678431))) / (sy))) <= (1.0E-8)))
        {
# 228 "ep.c"
          (verified) = (1);
          /* Debug output, printed only when this verification succeeds. */
          printf("Debug:ompc_manual. 359, sx is:%f, sy is:%f\n",sx,sy);
        }
    }
  else
# 230 "ep.c"
  if ((28) == (30))
    {
# 231 "ep.c"
      if (((fabs (((sx) - (40338.15542441498)) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(26606.69192809235))) / (sy))) <= (1.0E-8)))
        {
# 233 "ep.c"
          (verified) = (1);
        }
    }
  else
# 235 "ep.c"
  if ((28) == (32))
    {
# 236 "ep.c"
      if (((fabs (((sx) - (47643.67927995374)) / (sx))) <= (1.0E-8)) && ((fabs (((sy) - (-(80840.72988043731))) / (sy))) <= (1.0E-8)))
        {
# 238 "ep.c"
          (verified) = (1);
        }
    }

  /* Rate: 2^(28+1) divided by the elapsed time, in millions. */
# 242 "ep.c"
  (Mops) = (((pow (2.0, (28) + (1))) / (tm)) / (1000000.0));

# 244 "ep.c"
  printf ("EP Benchmark Results: \012CPU Time = %10.4f\012N = 2^%5d\012No. Gaussian Pairs = %15.0f\012Sums = %25.15e %25.15e\012Counts:\012", tm, 28, gc, sx, sy);

# 251 "ep.c"
  for ((i) = (0); (i) <= ((10) - (1)); (i)++)
    {
# 252 "ep.c"
      printf ("%3d %15.0f\012", i, *((q) + (i)));
    }

# 255 "ep.c"
  c_print_results ("EP", 65, (28) + (1), 0, 0, nit, nthreads, tm, Mops, "Random numbers generated", verified, "2.3", "07 Aug 2006", "omcc", "$(CC)", "(none)", "-I../common", "-t", "-lm", "randdp");

  /* The condition is the constant (0) == (1), so this report never runs. */
# 261 "ep.c"
  if ((0) == (1))
    {
# 262 "ep.c"
      printf ("Total time: %f", timer_read (1));
# 263 "ep.c"
      printf ("Gaussian pairs: %f", timer_read (2));
# 264 "ep.c"
      printf ("Random numbers: %f", timer_read (3));
    }
}
int main(int argc, char *argv[]) { double Mops, t1, t2; double tsx, tsy, tm, an, tt, gc; double sx_verify_value, sy_verify_value, sx_err, sy_err; int i, nit; int k_offset, j; logical verified; char size[16]; FILE *fp; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } if ((fp = fopen("timer.flag", "r")) == NULL) { timers_enabled = false; } else { timers_enabled = true; fclose(fp); } //-------------------------------------------------------------------- // Because the size of the problem is too large to store in a 32-bit // integer for some classes, we put it into a string (for printing). // Have to strip off the decimal point put in there by the floating // point print statement (internal file) //-------------------------------------------------------------------- sprintf(size, "%15.0lf", pow(2.0, M+1)); j = 14; if (size[j] == '.') j--; size[j+1] = '\0'; printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n"); printf("\n Number of random numbers generated: %15s\n", size); verified = false; //-------------------------------------------------------------------- // Compute the number of "batches" of random number pairs generated // per processor. Adjust if the number of processors does not evenly // divide the total number //-------------------------------------------------------------------- np = NN; setup_opencl(argc, argv); timer_clear(0); timer_start(0); //-------------------------------------------------------------------- // Compute AN = A ^ (2 * NK) (mod 2^46). //-------------------------------------------------------------------- t1 = A; for (i = 0; i < MK + 1; i++) { t2 = randlc(&t1, t1); } an = t1; tt = S; //-------------------------------------------------------------------- // Each instance of this loop may be performed independently. 
We compute // the k offsets separately to take into account the fact that some nodes // have more numbers to generate than others //-------------------------------------------------------------------- k_offset = -1; DTIMER_START(T_KERNEL_EMBAR); // Launch the kernel int q_size = GROUP_SIZE * NQ * sizeof(cl_double); int sx_size = GROUP_SIZE * sizeof(cl_double); int sy_size = GROUP_SIZE * sizeof(cl_double); err_code = clSetKernelArg(kernel, 0, q_size, NULL); err_code |= clSetKernelArg(kernel, 1, sx_size, NULL); err_code |= clSetKernelArg(kernel, 2, sy_size, NULL); err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq); err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx); err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy); err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset); err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an); clu_CheckError(err_code, "clSetKernelArg()"); size_t localWorkSize[] = { GROUP_SIZE }; size_t globalWorkSize[] = { np }; err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueNDRangeKernel()"); CHECK_FINISH(); DTIMER_STOP(T_KERNEL_EMBAR); double (*gq)[NQ] = (double (*)[NQ])malloc(gq_size); double *gsx = (double*)malloc(gsx_size); double *gsy = (double*)malloc(gsy_size); gc = 0.0; tsx = 0.0; tsy = 0.0; for (i = 0; i < NQ; i++) { q[i] = 0.0; } // 9. 
Get the result DTIMER_START(T_BUFFER_READ); err_code = clEnqueueReadBuffer(cmd_queue, pgq, CL_FALSE, 0, gq_size, gq, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); err_code = clEnqueueReadBuffer(cmd_queue, pgsx, CL_FALSE, 0, gsx_size, gsx, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); err_code = clEnqueueReadBuffer(cmd_queue, pgsy, CL_TRUE, 0, gsy_size, gsy, 0, NULL, NULL); clu_CheckError(err_code, "clEnqueueReadbuffer()"); DTIMER_STOP(T_BUFFER_READ); for (i = 0; i < np/localWorkSize[0]; i++) { for (j = 0; j < NQ; j++ ){ q[j] = q[j] + gq[i][j]; } tsx = tsx + gsx[i]; tsy = tsy + gsy[i]; } for (i = 0; i < NQ; i++) { gc = gc + q[i]; } timer_stop(0); tm = timer_read(0); nit = 0; verified = true; if (M == 24) { sx_verify_value = -3.247834652034740e+3; sy_verify_value = -6.958407078382297e+3; } else if (M == 25) { sx_verify_value = -2.863319731645753e+3; sy_verify_value = -6.320053679109499e+3; } else if (M == 28) { sx_verify_value = -4.295875165629892e+3; sy_verify_value = -1.580732573678431e+4; } else if (M == 30) { sx_verify_value = 4.033815542441498e+4; sy_verify_value = -2.660669192809235e+4; } else if (M == 32) { sx_verify_value = 4.764367927995374e+4; sy_verify_value = -8.084072988043731e+4; } else if (M == 36) { sx_verify_value = 1.982481200946593e+5; sy_verify_value = -1.020596636361769e+5; } else if (M == 40) { sx_verify_value = -5.319717441530e+05; sy_verify_value = -3.688834557731e+05; } else { verified = false; } if (verified) { sx_err = fabs((tsx - sx_verify_value) / sx_verify_value); sy_err = fabs((tsy - sy_verify_value) / sy_verify_value); verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON)); } Mops = pow(2.0, M+1) / tm / 1000000.0; printf("\nEP Benchmark Results:\n\n"); printf("CPU Time =%10.4lf\n", tm); printf("N = 2^%5d\n", M); printf("No. 
Gaussian Pairs = %15.0lf\n", gc); printf("Sums = %25.15lE %25.15lE\n", tsx, tsy); printf("Counts: \n"); for (i = 0; i < NQ; i++) { printf("%3d%15.0lf\n", i, q[i]); } c_print_results("EP", CLASS, M+1, 0, 0, nit, tm, Mops, "Random numbers generated", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7, clu_GetDeviceTypeName(device_type), device_name); if (timers_enabled) { if (tm <= 0.0) tm = 1.0; tt = timer_read(0); printf("\nTotal time: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm); } free(gq); free(gsx); free(gsy); release_opencl(); fflush(stdout); return 0; }
int main(int argc, char **argv) { int i, j, k, it; int nthreads = 1; double zeta; double rnorm; double norm_temp11; double norm_temp12; double t, mflops; char cclass; boolean verified; double zeta_verify_value, epsilon; firstrow = 1; lastrow = NA; firstcol = 1; lastcol = NA; if (NA == 1400 && NONZER == 7 && NITER == 15 && SHIFT == 10.0) { cclass = 'S'; zeta_verify_value = 8.5971775078648; } else if (NA == 7000 && NONZER == 8 && NITER == 15 && SHIFT == 12.0) { cclass = 'W'; zeta_verify_value = 10.362595087124; } else if (NA == 14000 && NONZER == 11 && NITER == 15 && SHIFT == 20.0) { cclass = 'A'; zeta_verify_value = 17.130235054029; } else if (NA == 75000 && NONZER == 13 && NITER == 75 && SHIFT == 60.0) { cclass = 'B'; zeta_verify_value = 22.712745482631; } else if (NA == 150000 && NONZER == 15 && NITER == 75 && SHIFT == 110.0) { cclass = 'C'; zeta_verify_value = 28.973605592845; } else { cclass = 'U'; } printf("\n\n NAS Parallel Benchmarks 2.3 OpenMP C version" " - CG Benchmark\n"); printf(" Size: %10d\n", NA); printf(" Iterations: %5d\n", NITER); naa = NA; nzz = NZ; /*-------------------------------------------------------------------- c Initialize random number generator c-------------------------------------------------------------------*/ tran = 314159265.0; amult = 1220703125.0; zeta = randlc( &tran, amult ); /*-------------------------------------------------------------------- c c-------------------------------------------------------------------*/ makea(naa, nzz, a, colidx, rowstr, NONZER, firstrow, lastrow, firstcol, lastcol, RCOND, arow, acol, aelt, v, iv, SHIFT); /*--------------------------------------------------------------------- c Note: as a result of the above call to makea: c values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1 c values of colidx which are col indexes go from firstcol --> lastcol c So: c Shift the col index vals from actual (firstcol --> lastcol ) c to local, i.e., (1 --> lastcol-firstcol+1) 
c---------------------------------------------------------------------*/ #pragma omp parallel private(it,i,j,k) { #pragma omp for nowait for (j = 1; j <= lastrow - firstrow + 1; j++) { for (k = rowstr[j]; k < rowstr[j+1]; k++) { colidx[k] = colidx[k] - firstcol + 1; } } /*-------------------------------------------------------------------- c set starting vector to (1, 1, .... 1) c-------------------------------------------------------------------*/ #pragma omp for nowait for (i = 1; i <= NA+1; i++) { x[i] = 1.0; } #pragma omp single zeta = 0.0; /*------------------------------------------------------------------- c----> c Do one iteration untimed to init all code and data page tables c----> (then reinit, start timing, to niter its) c-------------------------------------------------------------------*/ for (it = 1; it <= 1; it++) { /*-------------------------------------------------------------------- c The call to the conjugate gradient routine: c-------------------------------------------------------------------*/ conj_grad (colidx, rowstr, x, z, a, p, q, r, w, &rnorm); /*-------------------------------------------------------------------- c zeta = shift + 1/(x.z) c So, first: (x.z) c Also, find norm of z c So, first: (z.z) c-------------------------------------------------------------------*/ #pragma omp single { norm_temp11 = 0.0; norm_temp12 = 0.0; } /* end single */ #pragma omp for reduction(+:norm_temp11,norm_temp12) for (j = 1; j <= lastcol-firstcol+1; j++) { norm_temp11 = norm_temp11 + x[j]*z[j]; norm_temp12 = norm_temp12 + z[j]*z[j]; } #pragma omp single norm_temp12 = 1.0 / sqrt( norm_temp12 ); /*-------------------------------------------------------------------- c Normalize z to obtain x c-------------------------------------------------------------------*/ #pragma omp for for (j = 1; j <= lastcol-firstcol+1; j++) { x[j] = norm_temp12*z[j]; } } /* end of do one iteration untimed */ /*-------------------------------------------------------------------- c 
set starting vector to (1, 1, .... 1) c-------------------------------------------------------------------*/ #pragma omp for nowait for (i = 1; i <= NA+1; i++) { x[i] = 1.0; } #pragma omp single zeta = 0.0; } /* end parallel */ timer_clear( 1 ); timer_start( 1 ); /*-------------------------------------------------------------------- c----> c Main Iteration for inverse power method c----> c-------------------------------------------------------------------*/ #pragma omp parallel private(it,i,j,k) { for (it = 1; it <= NITER; it++) { /*-------------------------------------------------------------------- c The call to the conjugate gradient routine: c-------------------------------------------------------------------*/ conj_grad(colidx, rowstr, x, z, a, p, q, r, w, &rnorm); /*-------------------------------------------------------------------- c zeta = shift + 1/(x.z) c So, first: (x.z) c Also, find norm of z c So, first: (z.z) c-------------------------------------------------------------------*/ #pragma omp single { norm_temp11 = 0.0; norm_temp12 = 0.0; } /* end single */ #pragma omp for reduction(+:norm_temp11,norm_temp12) for (j = 1; j <= lastcol-firstcol+1; j++) { norm_temp11 = norm_temp11 + x[j]*z[j]; norm_temp12 = norm_temp12 + z[j]*z[j]; } #pragma omp single { norm_temp12 = 1.0 / sqrt( norm_temp12 ); zeta = SHIFT + 1.0 / norm_temp11; } /* end single */ #pragma omp master { if( it == 1 ) { printf(" iteration ||r|| zeta\n"); } printf(" %5d %20.14e%20.13e\n", it, rnorm, zeta); } /* end master */ /*-------------------------------------------------------------------- c Normalize z to obtain x c-------------------------------------------------------------------*/ #pragma omp for for (j = 1; j <= lastcol-firstcol+1; j++) { x[j] = norm_temp12*z[j]; } } /* end of main iter inv pow meth */ #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* end parallel */ timer_stop( 1 ); 
/*-------------------------------------------------------------------- c End of timed section c-------------------------------------------------------------------*/ t = timer_read( 1 ); printf(" Benchmark completed\n"); epsilon = 1.0e-10; if (cclass != 'U') { if (fabs(zeta - zeta_verify_value) <= epsilon) { verified = TRUE; printf(" VERIFICATION SUCCESSFUL\n"); printf(" Zeta is %20.12e\n", zeta); printf(" Error is %20.12e\n", zeta - zeta_verify_value); } else { verified = FALSE; printf(" VERIFICATION FAILED\n"); printf(" Zeta %20.12e\n", zeta); printf(" The correct zeta is %20.12e\n", zeta_verify_value); } } else { verified = FALSE; printf(" Problem size unknown\n"); printf(" NO VERIFICATION PERFORMED\n"); } if ( t != 0.0 ) { mflops = (2.0*NITER*NA) * (3.0+(NONZER*(NONZER+1)) + 25.0*(5.0+(NONZER*(NONZER+1))) + 3.0 ) / t / 1000000.0; } else { mflops = 0.0; } c_print_results("CG", cclass, NA, 0, 0, NITER, nthreads, t, mflops, " floating point", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); }
int main(int argc, char **argv) { /*c------------------------------------------------------------------- c-------------------------------------------------------------------*/ int i, ierr; /*------------------------------------------------------------------ c u0, u1, u2 are the main arrays in the problem. c Depending on the decomposition, these arrays will have different c dimensions. To accomodate all possibilities, we allocate them as c one-dimensional arrays and pass them to subroutines for different c views c - u0 contains the initial (transformed) initial condition c - u1 and u2 are working arrays c - indexmap maps i,j,k of u0 to the correct i^2+j^2+k^2 for the c time evolution operator. c-----------------------------------------------------------------*/ /*-------------------------------------------------------------------- c Large arrays are in common so that they are allocated on the c heap rather than the stack. This common block is not c referenced directly anywhere else. Padding is to avoid accidental c cache problems, since all array sizes are powers of two. c-------------------------------------------------------------------*/ static dcomplex u0[NZ][NY][NX]; static dcomplex pad1[3]; static dcomplex u1[NZ][NY][NX]; static dcomplex pad2[3]; static dcomplex u2[NZ][NY][NX]; static dcomplex pad3[3]; static int indexmap[NZ][NY][NX]; int iter; int nthreads = 1; double total_time, mflops; boolean verified; char cclass; /*-------------------------------------------------------------------- c Run the entire problem once to make sure all data is touched. c This reduces variable startup costs, which is important for such a c short benchmark. The other NPB 2 implementations are similar. 
c-------------------------------------------------------------------*/ for (i = 0; i < T_MAX; i++) { timer_clear(i); } setup(); #pragma omp parallel { compute_indexmap(indexmap, dims[2]); #pragma omp single { compute_initial_conditions(u1, dims[0]); fft_init (dims[0][0]); } fft(1, u1, u0); } /* end parallel */ /*-------------------------------------------------------------------- c Start over from the beginning. Note that all operations must c be timed, in contrast to other benchmarks. c-------------------------------------------------------------------*/ for (i = 0; i < T_MAX; i++) { timer_clear(i); } timer_start(T_TOTAL); if (TIMERS_ENABLED == TRUE) timer_start(T_SETUP); #pragma omp parallel private(iter) firstprivate(niter) { compute_indexmap(indexmap, dims[2]); #pragma omp single { compute_initial_conditions(u1, dims[0]); fft_init (dims[0][0]); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_SETUP); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_FFT); } fft(1, u1, u0); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_FFT); } for (iter = 1; iter <= niter; iter++) { if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_EVOLVE); } evolve(u0, u1, iter, indexmap, dims[0]); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_EVOLVE); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_FFT); } fft(-1, u1, u2); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_FFT); } if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_start(T_CHECKSUM); } checksum(iter, u2, dims[0]); if (TIMERS_ENABLED == TRUE) { #pragma omp master timer_stop(T_CHECKSUM); } } #pragma omp single verify(NX, NY, NZ, niter, &verified, &cclass); #if defined(_OPENMP) #pragma omp master nthreads = omp_get_num_threads(); #endif /* _OPENMP */ } /* end parallel */ timer_stop(T_TOTAL); total_time = timer_read(T_TOTAL); if( total_time != 0.0) { mflops = 1.0e-6*(double)(NTOTAL) * (14.8157+7.19641*log((double)(NTOTAL)) + 
(5.23518+7.21113*log((double)(NTOTAL)))*niter) /total_time; } else { mflops = 0.0; } c_print_results("FT", cclass, NX, NY, NZ, niter, nthreads, total_time, mflops, " floating point", verified, NPBVERSION, COMPILETIME, CS1, CS2, CS3, CS4, CS5, CS6, CS7); if (TIMERS_ENABLED == TRUE) print_timers(); }