void cStream::outputSummary() { register int j; cout << "Function Rate (MB/s) Avg time Min time Max time" << endl; for ( j = eCopy; j <= eTriad; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); cout << label[j] << fixed << setw( 14 ) << setprecision( 3 ) << ( 1.0E-06 * bytes[j] ) / ( mintime[j] / freq ) << setw( 11 ) << avgtime[j] / freq << setw( 13 ) << mintime[j] / freq << setw( 13 ) << maxtime[j] / freq << endl; } cout << HLINE; /* --- Check Results --- */ checkSTREAMresults(); cout << HLINE; }; // void cStream::outputSummary()
void cStream::runBenchmarkTuned() { initializeVariables(); runChecks(); runTunedTests(); checkSTREAMresults(); calculateBandwidthResults(); outputSummary(); }; // void cStream::runBenchmarkTuned()
#include <omp.h>
int main(int argc, char **argv) { int num_nodes = xmp_num_nodes(); /* Set parameters */ if(argc != 2){ #pragma xmp task on p(1) fprintf(stderr, "Usage: ./STREAM <vector length>\ne.g. ./STREAM 1000\n"); return 1; } array_elements = atoi(argv[1]); /* Malloc arrays */ a = malloc(sizeof(double) * array_elements); b = malloc(sizeof(double) * array_elements); c = malloc(sizeof(double) * array_elements); /* Initialize arrays */ #pragma omp parallel for for(int j=0;j<array_elements;j++){ a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } /* Execute STREAM */ double triadGBs = HPCC_Stream(); #pragma xmp task on p(1) printf("[Vector size is %d] Total Triad %.4f GB/s on %d nodes\n", array_elements, triadGBs, num_nodes); #pragma xmp task on p(1) { #pragma omp parallel { #pragma omp single printf("Number of Threads requested = %d\n", omp_get_num_threads()); } } /* Verification */ checkSTREAMresults(num_nodes); return 0; }
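/* The no-argument HPCC_Stream() called above (returning the Triad rate in GB/s) is
   defined elsewhere in this variant and is not part of the listing. The sketch below
   shows how such a per-node routine could be structured; the function name, the use of
   NTIMES, mysecond(), and the skip-first-iteration convention are assumptions carried
   over from the reference STREAM code, not the original implementation. */
static double HPCC_Stream_sketch(void)
{
    double scalar = 3.0, t, mintime = 1.0e30;
    int k;

    for (k = 0; k < NTIMES; k++) {
        t = mysecond();
#pragma omp parallel for
        for (int j = 0; j < array_elements; j++)
            a[j] = b[j] + scalar * c[j];          /* Triad kernel */
        t = mysecond() - t;
        if (k > 0 && t < mintime)                  /* skip the warm-up iteration; assumes NTIMES > 1 */
            mintime = t;
    }
    /* Triad touches 3 arrays of array_elements doubles per iteration (per node). */
    return 1.0e-9 * (3.0 * sizeof(double) * array_elements) / mintime;
}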
int HPCC_Stream(HPCC_Params *params, int doIO, double *copyGBs, double *scaleGBs, double *addGBs, double *triadGBs, int *failure) { int quantum; int BytesPerWord; register int j, k; double scalar, t, times[4][NTIMES]; FILE *outFile; double GiBs = 1073741824.0, curGBs; if (doIO) { // outFile = fopen( params->outFname, "w+" ); outFile = stdout; if (! outFile) { outFile = stderr; fprintf( outFile, "Cannot open output file.\n" ); return 1; } } // VectorSize = HPCC_LocalVectorSize( params, 3, sizeof(double), 0 ); /* Need 3 vectors */ // HARDCODED VectorSize // params->StreamVectorSize = VectorSize; a = HPCC_XMALLOC( double, VectorSize ); b = HPCC_XMALLOC( double, VectorSize ); c = HPCC_XMALLOC( double, VectorSize ); if (!a || !b || !c) { if (c) HPCC_free(c); if (b) HPCC_free(b); if (a) HPCC_free(a); if (doIO) { fprintf( outFile, "Failed to allocate memory (%lu).\n", VectorSize ); fflush( outFile ); fclose( outFile ); } return 1; } /* --- SETUP --- determine precision and check timing --- */ if (doIO) { fprintf (outFile, "Generated on %s\n", params->nowASCII); fprintf( outFile, HLINE); BytesPerWord = sizeof(double); fprintf( outFile, "This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); fprintf( outFile, HLINE); fprintf( outFile, "Array size = %lu, Offset = %d\n" , VectorSize, OFFSET); fprintf( outFile, "Total memory required = %.4f GiB.\n", (3.0 * BytesPerWord) * ( (double) VectorSize / GiBs)); fprintf( outFile, "Each test is run %d times, but only\n", NTIMES); fprintf( outFile, "the *best* time for each is used.\n"); fflush ( outFile); } #ifdef _OPENMP if (doIO) fprintf( outFile, HLINE); #pragma omp parallel private(k) { #pragma omp single nowait { k = omp_get_num_threads(); if (doIO) fprintf( outFile, "Number of Threads requested = %i\n",k); params->StreamThreads = k; } } #endif /* Get initial value for system clock. 
*/ #ifdef _OPENMP #pragma omp parallel for #endif for (j=0; j<VectorSize; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } if (doIO) fprintf( outFile, HLINE); if ( (quantum = checktick()) >= 1) { if (doIO) fprintf( outFile, "Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); } else { if (doIO) fprintf( outFile, "Your clock granularity appears to be " "less than one microsecond.\n"); } t = mysecond(); #ifdef _OPENMP #pragma omp parallel for #endif for (j = 0; j < VectorSize; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); if (doIO) { fprintf( outFile, "Each test below will take on the order" " of %d microseconds.\n", (int) t ); fprintf( outFile, " (= %d clock ticks)\n", (int) (t/quantum) ); fprintf( outFile, "Increase the size of the arrays if this shows that\n"); fprintf( outFile, "you are not getting at least 20 clock ticks per test.\n"); fprintf( outFile, HLINE); fprintf( outFile, "WARNING -- The above is only a rough guideline.\n"); fprintf( outFile, "For best results, please be sure you know the\n"); fprintf( outFile, "precision of your system timer.\n"); fprintf( outFile, HLINE); } /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #ifdef TUNED tuned_STREAM_Copy(); #else #ifdef _OPENMP #pragma omp parallel for #endif for (j=0; j<VectorSize; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #ifdef TUNED tuned_STREAM_Scale(scalar); #else #ifdef _OPENMP #pragma omp parallel for #endif for (j=0; j<VectorSize; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #ifdef TUNED tuned_STREAM_Add(); #else #ifdef _OPENMP #pragma omp parallel for #endif for (j=0; j<VectorSize; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #ifdef TUNED tuned_STREAM_Triad(scalar); #else #ifdef _OPENMP #pragma omp parallel for #endif for (j=0; j<VectorSize; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = Mmin(mintime[j], times[j][k]); maxtime[j] = Mmax(maxtime[j], times[j][k]); } } if (doIO) fprintf( outFile, "Function Rate (GB/s) Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] /= (double)(NTIMES - 1); /* note -- skip first iteration */ /* make sure no division by zero */ curGBs = (mintime[j] > 0.0 ? 1.0 / mintime[j] : -1.0); curGBs *= 1e-9 * bytes[j] * VectorSize; if (doIO) fprintf( outFile, "%s%11.4f %11.4f %11.4f %11.4f\n", label[j], curGBs, avgtime[j], mintime[j], maxtime[j]); switch (j) { case 0: *copyGBs = curGBs; break; case 1: *scaleGBs = curGBs; break; case 2: *addGBs = curGBs; break; case 3: *triadGBs = curGBs; break; } } if (doIO) fprintf( outFile, HLINE); /* --- Check Results --- */ checkSTREAMresults( outFile, doIO, failure ); if (doIO) fprintf( outFile, HLINE); HPCC_free(c); HPCC_free(b); HPCC_free(a); if (doIO) { fflush( outFile ); fclose( outFile ); } return 0; }
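/* The wall-clock timer mysecond() used throughout these listings is not shown. In the
   reference STREAM sources it is a thin wrapper around gettimeofday(); a typical
   implementation is sketched below, though individual variants may use their own timer. */
#include <sys/time.h>

double mysecond(void)
{
    struct timeval tp;
    gettimeofday(&tp, (struct timezone *) NULL);
    return ((double) tp.tv_sec + (double) tp.tv_usec * 1.0e-6);
}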
int main() { int checktick(void); int quantum; int BytesPerWord; int k; ssize_t j; STREAM_TYPE scalar; double t, times[4][NTIMES]; /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); printf("STREAM version $Revision: 5.10 $\n"); printf(HLINE); BytesPerWord = sizeof(STREAM_TYPE); printf("This system uses %d bytes per array element.\n", BytesPerWord); printf(HLINE); #ifdef N printf("***** WARNING: ******\n"); printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); printf("***** WARNING: ******\n"); #endif printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); printf("Memory per array = %.1f MiB (= %.1f GiB).\n", BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); printf("Total memory required = %.1f MiB (= %.1f GiB).\n", (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); printf("Each kernel will be executed %d times.\n", NTIMES); printf(" The *best* time for each kernel (excluding the first iteration)\n"); printf(" will be used to compute the reported bandwidth.\n"); #ifdef _OPENMP printf(HLINE); #pragma omp parallel { #pragma omp master { k = omp_get_num_threads(); printf ("Number of Threads requested = %i\n",k); } } #endif #ifdef _OPENMP k = 0; #pragma omp parallel #pragma omp atomic k++; printf ("Number of Threads counted = %i\n",k); #endif /* Get initial value for system clock. 
*/ #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } printf(HLINE); if ( (quantum = checktick()) >= 1) printf("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else { printf("Your clock granularity appears to be " "less than one microsecond.\n"); quantum = 1; } t = mysecond(); #pragma omp parallel for for (j = 0; j < STREAM_ARRAY_SIZE; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); printf("Each test below will take on the order" " of %d microseconds.\n", (int) t ); printf(" (= %d clock ticks)\n", (int) (t/quantum) ); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); printf("WARNING -- The above is only a rough guideline.\n"); printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #ifdef TUNED tuned_STREAM_Copy(); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #ifdef TUNED tuned_STREAM_Scale(scalar); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #ifdef TUNED tuned_STREAM_Add(); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #ifdef TUNED tuned_STREAM_Triad(scalar); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } printf("Function Best Rate MB/s Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%12.1f %11.6f %11.6f %11.6f\n", label[j], 1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]); } printf(HLINE); /* --- Check Results --- */ checkSTREAMresults(); printf(HLINE); return 0; }
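/* checkSTREAMresults() is not shown above. In the reference stream.c it replays the
   kernel sequence on scalar stand-ins and compares the average per-array error against
   a precision-dependent tolerance. The sketch below follows that structure, assuming the
   globals a, b, c, STREAM_ARRAY_SIZE, NTIMES, and STREAM_TYPE from the listing; it is
   not necessarily the exact code used by every variant in this collection. */
#include <math.h>

void checkSTREAMresults(void)
{
    STREAM_TYPE aj, bj, cj, scalar;
    STREAM_TYPE aAvgErr = 0.0, bAvgErr = 0.0, cAvgErr = 0.0;
    double epsilon;
    ssize_t j;
    int k;

    /* Reproduce the initialization and the doubling of a[] done during timer setup. */
    aj = 1.0; bj = 2.0; cj = 0.0;
    aj = 2.0E0 * aj;

    /* Replay the four kernels NTIMES times on the scalar stand-ins. */
    scalar = 3.0;
    for (k = 0; k < NTIMES; k++) {
        cj = aj;
        bj = scalar * cj;
        cj = aj + bj;
        aj = bj + scalar * cj;
    }

    /* Average absolute deviation of each array from its expected value. */
    for (j = 0; j < STREAM_ARRAY_SIZE; j++) {
        aAvgErr += fabs(a[j] - aj);
        bAvgErr += fabs(b[j] - bj);
        cAvgErr += fabs(c[j] - cj);
    }
    aAvgErr /= (STREAM_TYPE) STREAM_ARRAY_SIZE;
    bAvgErr /= (STREAM_TYPE) STREAM_ARRAY_SIZE;
    cAvgErr /= (STREAM_TYPE) STREAM_ARRAY_SIZE;

    epsilon = (sizeof(STREAM_TYPE) == 4) ? 1.e-6 : 1.e-13;

    if (fabs(aAvgErr / aj) > epsilon || fabs(bAvgErr / bj) > epsilon ||
        fabs(cAvgErr / cj) > epsilon)
        printf("Failed Validation on at least one array, epsilon = %e\n", epsilon);
    else
        printf("Solution Validates: avg error less than %e on all three arrays\n", epsilon);
}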
int main(int argc,char *argv[]) { PetscErrorCode ierr; int quantum, checktick(); int BytesPerWord; int j, k; double scalar=3.0, t, times[4][NTIMES]; PetscInitialize(&argc,&argv,0,help); /* --- SETUP --- determine precision and check timing --- */ /*printf(HLINE); printf("STREAM version $Revision: 5.9 $\n"); printf(HLINE); */ BytesPerWord = sizeof(double); printf("This system uses %d bytes per DOUBLE PRECISION word.\n",BytesPerWord); printf(HLINE); #if defined(NO_LONG_LONG) printf("Array size = %d, Offset = %d\n", N, OFFSET); #else printf("Array size = %llu, Offset = %d\n", (unsigned long long) N, OFFSET); #endif printf("Total memory required = %.1f MB.\n",(3.0 * BytesPerWord) * ((double)N / 1048576.0)); printf("Each test is run %d times, but only\n", NTIMES); printf("the *best* time for each is used.\n"); printf(HLINE); #if !STATIC_ALLOC a = malloc((N+OFFSET)*sizeof(double)); b = malloc((N+OFFSET)*sizeof(double)); c = malloc((N+OFFSET)*sizeof(double)); #endif #if WITH_PTHREADS ierr = PetscThreadCommGetNThreads(PETSC_COMM_WORLD,&nworkThreads);CHKERRQ(ierr); ierr = PetscMalloc((nworkThreads+1)*sizeof(PetscInt),&trstarts);CHKERRQ(ierr); PetscInt Q,R,nloc; PetscBool S; Q = (N+OFFSET)/nworkThreads; R = (N+OFFSET) - Q*nworkThreads; trstarts[0] = 0; for (j=0; j < nworkThreads; j++) { S = (PetscBool)(j < R); nloc = S ? Q+1 : Q; trstarts[j+1] = trstarts[j]+nloc; } ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Initialize_Kernel,1,&scalar);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); # else for (j=0; j<N; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } #endif /*printf(HLINE);*/ /* Get initial value for system clock. */ if ((quantum = checktick()) >= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */ else quantum = 1; /* printf("Your clock granularity appears to be less than one microsecond.\n"); */ t = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_2A_Kernel,0);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; #endif t = 1.0E6 * (mysecond() - t); /* printf("Each test below will take on the order of %d microseconds.\n", (int)t); printf(" (= %d clock ticks)\n", (int) (t/quantum)); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); */ /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Copy_Kernel,0);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j=0; j<N; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Scale_Kernel,1,&scalar);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j=0; j<N; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Add_Kernel,0);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j=0; j<N; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); 
#if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Triad_Kernel,1,&scalar);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } printf("Function Rate (MB/s) \n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%11.4f \n", label[j], 1.0E-06 * bytes[j]/mintime[j]); } /* printf(HLINE);*/ #if WITH_PTHREADS ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #endif /* --- Check Results --- */ checkSTREAMresults(); /* printf(HLINE);*/ PetscFinalize(); return 0; }
int main() { int quantum, checktick(); int BytesPerWord; register int j, k; double scalar, t, times[4][NTIMES]; /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); BytesPerWord = sizeof(double); printf("This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); printf(HLINE); printf("Array size = %d, Offset = %d\n" , N, OFFSET); printf("Total memory required = %.1f MB.\n", (3.0 * BytesPerWord) * ( (double) N / 1048576.0)); printf("Each test is run %d times, but only\n", NTIMES); printf("the *best* time for each is used.\n"); #ifdef _OPENMP printf(HLINE); #pragma omp parallel private(k) { // k = omp_get_num_threads(); // printf ("Number of Threads requested = %i\n",k); } #endif /* Get initial value for system clock. */ #pragma omp parallel for for (j=0; j<N; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } printf(HLINE); if ( (quantum = checktick()) >= 1) { // printf("Your clock granularity/precision appears to be " // "%d microseconds.\n", quantum); } else { // printf("Your clock granularity appears to be " // "less than one microsecond.\n"); } t = mysecond(); #pragma omp parallel for for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); // printf("Each test below will take on the order" // " of %d microseconds.\n", (int) t ); // printf(" (= %d clock ticks)\n", (int) (t/quantum) ); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); printf("WARNING -- The above is only a rough guideline.\n"); printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #ifdef TUNED tuned_STREAM_Copy(); #else #pragma omp parallel for for (j=0; j<N; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #ifdef TUNED tuned_STREAM_Scale(scalar); #else #pragma omp parallel for for (j=0; j<N; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #ifdef TUNED tuned_STREAM_Add(); #else #pragma omp parallel for for (j=0; j<N; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #ifdef TUNED tuned_STREAM_Triad(scalar); #else #pragma omp parallel for for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } printf("Function Rate (MB/s) Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); /* printf("%s%11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);*/ } printf(HLINE); /* --- Check Results --- */ checkSTREAMresults(); printf(HLINE); return 0; }
int main() { int quantum, checktick(); int BytesPerWord; register int j, k; double scalar, t, times[4][NTIMES]; #ifdef MAI mai_init(NULL); a = mai_alloc_1D(N, sizeof(double),DOUBLE); b = mai_alloc_1D(N, sizeof(double),DOUBLE); c = mai_alloc_1D(N, sizeof(double),DOUBLE); mai_bind_columns(a); mai_bind_columns(b); mai_bind_columns(c); #else a = malloc(N*sizeof(double)); b = malloc(N*sizeof(double)); c = malloc(N*sizeof(double)); #endif /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); printf("STREAM version $Revision: 5.9 $\n"); printf(HLINE); BytesPerWord = sizeof(double); printf("This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); printf(HLINE); printf("Total memory required = %.1f MB.\n", (3.0 * BytesPerWord) * ( (double) N / 1048576.0)); printf("Each test is run %d times, but only\n", NTIMES); printf("the *best* time for each is used.\n"); #ifdef _OPENMP printf(HLINE); #pragma omp parallel { #pragma omp master { k = omp_get_num_threads(); printf ("Number of Threads requested = %i\n",k); } } #endif /* Get initial value for system clock. */ #pragma omp parallel for for (j=0; j<N; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } printf(HLINE); #ifdef MAI mai_cyclic(a); mai_cyclic(b); mai_cyclic(c); #endif int chunk = 128; t = mysecond(); #pragma omp parallel for schedule(dynamic,chunk) for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #ifdef TUNED tuned_STREAM_Copy(); #else #pragma omp parallel for schedule(dynamic,chunk) for (j=0; j<N; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #ifdef TUNED tuned_STREAM_Scale(scalar); #else #pragma omp parallel for schedule(dynamic,chunk) for (j=0; j<N; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #ifdef TUNED tuned_STREAM_Add(); #else #pragma omp parallel for schedule(dynamic,chunk) for (j=0; j<N; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #ifdef TUNED tuned_STREAM_Triad(scalar); #else #pragma omp parallel for schedule(dynamic,chunk) for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } printf("Function Rate (MB/s) Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]); } printf(HLINE); /* --- Check Results --- */ checkSTREAMresults(); printf(HLINE); return 0; }
int main() { int quantum, checktick(); int BytesPerWord; int k; ssize_t j; STREAM_TYPE scalar; double t, times[4][NTIMES]; /* --- SETUP --- determine precision and check timing --- */ BytesPerWord = sizeof(STREAM_TYPE); #ifdef _OPENMP #pragma omp parallel { #pragma omp master { k = omp_get_num_threads(); } } #endif #ifdef _OPENMP k = 0; #pragma omp parallel #pragma omp atomic k++; #endif /* Get initial value for system clock. */ #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } if ( (quantum = checktick()) >= 1) printf(""); else { quantum = 1; } t = mysecond(); #pragma omp parallel for for (j = 0; j < STREAM_ARRAY_SIZE; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #ifdef TUNED tuned_STREAM_Copy(); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #ifdef TUNED tuned_STREAM_Scale(scalar); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #ifdef TUNED tuned_STREAM_Add(); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #ifdef TUNED tuned_STREAM_Triad(scalar); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } for (j=0; j<1; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%11.6f\n", 1.0E-06 * bytes[j]/mintime[j]); } /* --- Check Results --- */ checkSTREAMresults(); return 0; }
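/* Several of the listings above call tuned_STREAM_Copy/Scale/Add/Triad when compiled
   with -DTUNED. In the reference stream.c these are provided only as starting points
   that repeat the straightforward kernels; a sketch under that assumption (globals
   a, b, c, STREAM_ARRAY_SIZE, STREAM_TYPE) follows. */
void tuned_STREAM_Copy(void)
{
    ssize_t j;
#pragma omp parallel for
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
        c[j] = a[j];
}

void tuned_STREAM_Scale(STREAM_TYPE scalar)
{
    ssize_t j;
#pragma omp parallel for
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
        b[j] = scalar * c[j];
}

void tuned_STREAM_Add(void)
{
    ssize_t j;
#pragma omp parallel for
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
        c[j] = a[j] + b[j];
}

void tuned_STREAM_Triad(STREAM_TYPE scalar)
{
    ssize_t j;
#pragma omp parallel for
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
        a[j] = b[j] + scalar * c[j];
}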
int main () { int quantum = -1, checktick (); int BytesPerWord; int k; ssize_t j, i; STREAM_TYPE scalar; // process local counters int count_p = 0, next_p = 0; gcounter = 0; /* --- SETUP --- determine precision and check timing --- */ printf (HLINE); printf ("STREAM version $Revision: 5.10 $\n"); printf (HLINE); BytesPerWord = sizeof (STREAM_TYPE); printf ("This system uses %d bytes per array element.\n", BytesPerWord); /* SHMEM initialize */ start_pes (0); _world_size = _num_pes (); _world_rank = _my_pe (); /* wait for user to input runtime params */ for (int j = 0; j < _SHMEM_BARRIER_SYNC_SIZE; j++) { pSync0[j] = pSync1[j] = pSync2[j] = _SHMEM_SYNC_VALUE; } if (_world_rank == 0) { printf (HLINE); #ifdef N printf ("***** WARNING: ******\n"); printf (" It appears that you set the preprocessor variable N when compiling this code.\n"); printf (" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); printf (" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n", (unsigned long long) STREAM_ARRAY_SIZE); printf ("***** WARNING: ******\n"); #endif printf ("Array size = %llu (elements), Offset = %d (elements)\n", (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); printf ("Memory per array = %.1f MiB (= %.1f GiB).\n", BytesPerWord * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.0), BytesPerWord * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.0 / 1024.0)); printf ("Total memory required = %.1f MiB (= %.1f GiB).\n", (3.0 * BytesPerWord) * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.), (3.0 * BytesPerWord) * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024. / 1024.)); printf ("Each kernel will be executed %d times.\n", NTIMES); printf (" The *best* time for each kernel (excluding the first iteration)\n"); printf (" will be used to compute the reported bandwidth.\n"); printf ("Number of SHMEM PEs requested = %i\n", _world_size); } int blocksize = 10000; assert (STREAM_ARRAY_SIZE % blocksize == 0); // do something really minor /* Get initial value for system clock. 
*/ for (j = 0; j < STREAM_ARRAY_SIZE; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } printf (HLINE); if (_world_rank == 0) { if ((quantum = checktick ()) >= 1) printf ("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else { printf ("Your clock granularity appears to be " "less than one microsecond.\n"); quantum = 1; } } // assign fixed iterations per PE // since we know default STREAM array size // we are hardcoding this, but if the value // changes, then this blocking factor must // also change // basically, each PE works on this block // size at a time time_start = mysecond (); /* Initialize */ next_p = shmem_int_fadd (&gcounter, 1, ROOT); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { a[i] = 2.0E0 * a[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; } time_end = mysecond (); clock_time_PE = time_end - time_start; shmem_double_sum_to_all (&total_clock_time, &clock_time_PE, 1, 0, 0, _world_size, pWrk0, pSync0); if (_world_rank == 0) { printf ("Each test below will take on the order" " of %d microseconds.\n", (int) (total_clock_time * 1.0E6)); printf (" (= %d clock ticks)\n", (int) ((1.0E6 * total_clock_time) / quantum)); printf ("Increase the size of the arrays if this shows that\n"); printf ("you are not getting at least 20 clock ticks per test.\n"); printf (HLINE); printf ("WARNING -- The above is only a rough guideline.\n"); printf ("For best results, please be sure you know the\n"); printf ("precision of your system timer.\n"); printf (HLINE); } /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ // reduction required, as each PE only fills a,b,c partially scalar = 3.0; for (k = 0; k < NTIMES; k++) { // this is required for correctness // for NTIMES > 1 which is typically // the case for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { a[i] = 1.0; b[i] = 2.0; c[i] = 0.0; a[i] = 2.0E0 * a[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; shmem_double_max_to_all (a + j, a + j, blocksize, 0, 0, _world_size, pWrk1, pSync1); } shmem_barrier_all (); time_start = mysecond (); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { c[i] = a[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; shmem_double_max_to_all (c + j, c + j, blocksize, 0, 0, _world_size, pWrk1, pSync1); } shmem_barrier_all (); time_end = mysecond () - time_start; shmem_double_max_to_all (&times[0][k], &time_end, 1, 0, 0, _world_size, pWrk0, pSync0); time_start = mysecond (); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { b[i] = scalar * c[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; shmem_double_max_to_all (b + j, b + j, blocksize, 0, 0, _world_size, pWrk1, pSync1); } shmem_barrier_all (); time_end = mysecond () - time_start; shmem_double_sum_to_all (&times[1][k], &time_end, 1, 0, 0, _world_size, pWrk0, pSync0); time_start = mysecond (); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { c[i] = a[i] + b[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; shmem_double_max_to_all (c + j, c + j, blocksize, 0, 0, _world_size, pWrk1, pSync1); } shmem_barrier_all (); time_end = mysecond () - time_start; shmem_double_sum_to_all (&times[2][k], &time_end, 1, 0, 0, _world_size, pWrk0, pSync0); 
time_start = mysecond (); for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize) { if (next_p == count_p) { for (i = j; i < (j + blocksize); i++) { a[i] = b[i] + scalar * c[i]; } next_p = shmem_int_fadd (&gcounter, 1, ROOT); } count_p++; shmem_double_max_to_all (a + j, a + j, blocksize, 0, 0, _world_size, pWrk1, pSync1); } shmem_barrier_all (); time_end = mysecond () - time_start; shmem_double_sum_to_all (&times[3][k], &time_end, 1, 0, 0, _world_size, pWrk0, pSync0); } shmem_barrier_all (); /* --- SUMMARY --- */ for (k = 1; k < NTIMES; k++) /* note -- skip first iteration */ { for (j = 0; j < 4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN (mintime[j], times[j][k]); maxtime[j] = MAX (maxtime[j], times[j][k]); } } if (_world_rank == 0) { printf ("Function Best Rate MB/s Avg time Min time Max time\n"); for (j = 0; j < 4; j++) { avgtime[j] = avgtime[j] / (double) (NTIMES - 1); printf ("%s%12.1f %11.6f %11.6f %11.6f\n", label[j], 1.0E-06 * bytes[j] / mintime[j], avgtime[j], mintime[j], maxtime[j]); } printf (HLINE); } /* --- Check Results --- */ if (_world_rank == 0) { checkSTREAMresults (); printf (HLINE); } return 0; }
int main() { int quantum, checktick(); int BytesPerWord; int i,k; ssize_t j; STREAM_TYPE scalar; double t, times[4][NTIMES]; double *TimesByRank; double t0,t1,tmin; int rc, numranks, myrank; STREAM_TYPE AvgError[3] = {0.0,0.0,0.0}; STREAM_TYPE *AvgErrByRank; /* --- SETUP --- call MPI_Init() before anything else! --- */ rc = MPI_Init(NULL, NULL); t0 = MPI_Wtime(); if (rc != MPI_SUCCESS) { printf("ERROR: MPI Initialization failed with return code %d\n",rc); exit(1); } // if either of these fail there is something really screwed up! MPI_Comm_size(MPI_COMM_WORLD, &numranks); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); /* --- NEW FEATURE --- distribute requested storage across MPI ranks --- */ array_elements = STREAM_ARRAY_SIZE / numranks; // don't worry about rounding vs truncation array_alignment = 64; // Can be modified -- provides partial support for adjusting relative alignment // Dynamically allocate the three arrays using "posix_memalign()" // NOTE that the OFFSET parameter is not used in this version of the code! array_bytes = array_elements * sizeof(STREAM_TYPE); k = posix_memalign((void **)&a, array_alignment, array_bytes); if (k != 0) { printf("Rank %d: Allocation of array a failed, return code is %d\n",myrank,k); MPI_Abort(MPI_COMM_WORLD, 2); exit(1); } k = posix_memalign((void **)&b, array_alignment, array_bytes); if (k != 0) { printf("Rank %d: Allocation of array b failed, return code is %d\n",myrank,k); MPI_Abort(MPI_COMM_WORLD, 2); exit(1); } k = posix_memalign((void **)&c, array_alignment, array_bytes); if (k != 0) { printf("Rank %d: Allocation of array c failed, return code is %d\n",myrank,k); MPI_Abort(MPI_COMM_WORLD, 2); exit(1); } // Initial informational printouts -- rank 0 handles all the output if (myrank == 0) { printf(HLINE); printf("STREAM version $Revision: 1.7 $\n"); printf(HLINE); BytesPerWord = sizeof(STREAM_TYPE); printf("This system uses %d bytes per array element.\n", BytesPerWord); printf(HLINE); #ifdef N printf("***** WARNING: ******\n"); printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); printf("***** WARNING: ******\n"); #endif if (OFFSET != 0) { printf("***** WARNING: ******\n"); printf(" This version ignores the OFFSET parameter.\n"); printf("***** WARNING: ******\n"); } printf("Total Aggregate Array size = %llu (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE); printf("Total Aggregate Memory per array = %.1f MiB (= %.1f GiB).\n", BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); printf("Total Aggregate memory required = %.1f MiB (= %.1f GiB).\n", (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); printf("Data is distributed across %d MPI ranks\n",numranks); printf(" Array size per MPI rank = %llu (elements)\n" , (unsigned long long) array_elements); printf(" Memory per array per MPI rank = %.1f MiB (= %.1f GiB).\n", BytesPerWord * ( (double) array_elements / 1024.0/1024.0), BytesPerWord * ( (double) array_elements / 1024.0/1024.0/1024.0)); printf(" Total memory per MPI rank = %.1f MiB (= %.1f GiB).\n", (3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024.), (3.0 * BytesPerWord) * ( (double) array_elements / 
1024.0/1024./1024.)); printf(HLINE); printf("Each kernel will be executed %d times.\n", NTIMES); printf(" The *best* time for each kernel (excluding the first iteration)\n"); printf(" will be used to compute the reported bandwidth.\n"); printf("The SCALAR value used for this run is %f\n",SCALAR); #ifdef _OPENMP printf(HLINE); #pragma omp parallel { #pragma omp master { k = omp_get_num_threads(); printf ("Number of Threads requested for each MPI rank = %i\n",k); } } #endif #ifdef _OPENMP k = 0; #pragma omp parallel #pragma omp atomic k++; printf ("Number of Threads counted for rank 0 = %i\n",k); #endif } /* --- SETUP --- initialize arrays and estimate precision of timer --- */ #pragma omp parallel for for (j=0; j<array_elements; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } // Rank 0 needs to allocate arrays to hold error data and timing data from // all ranks for analysis and output. // Allocate and instantiate the arrays here -- after the primary arrays // have been instantiated -- so there is no possibility of having these // auxiliary arrays mess up the NUMA placement of the primary arrays. if (myrank == 0) { // There are 3 average error values for each rank (using STREAM_TYPE). AvgErrByRank = (double *) malloc(3 * sizeof(STREAM_TYPE) * numranks); if (AvgErrByRank == NULL) { printf("Ooops -- allocation of arrays to collect errors on MPI rank 0 failed\n"); MPI_Abort(MPI_COMM_WORLD, 2); } memset(AvgErrByRank,0,3*sizeof(STREAM_TYPE)*numranks); // There are 4*NTIMES timing values for each rank (always doubles) TimesByRank = (double *) malloc(4 * NTIMES * sizeof(double) * numranks); if (TimesByRank == NULL) { printf("Ooops -- allocation of arrays to collect timing data on MPI rank 0 failed\n"); MPI_Abort(MPI_COMM_WORLD, 3); } memset(TimesByRank,0,4*NTIMES*sizeof(double)*numranks); } // Simple check for granularity of the timer being used if (myrank == 0) { printf(HLINE); if ( (quantum = checktick()) >= 1) printf("Your timer granularity/precision appears to be " "%d microseconds.\n", quantum); else { printf("Your timer granularity appears to be " "less than one microsecond.\n"); quantum = 1; } } /* Get initial timing estimate to compare to timer granularity. */ /* All ranks need to run this code since it changes the values in array a */ t = MPI_Wtime(); #pragma omp parallel for for (j = 0; j < array_elements; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (MPI_Wtime() - t); if (myrank == 0) { printf("Each test below will take on the order" " of %d microseconds.\n", (int) t ); printf(" (= %d timer ticks)\n", (int) (t/quantum) ); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 timer ticks per test.\n"); printf(HLINE); printf("WARNING -- The above is only a rough guideline.\n"); printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); #ifdef VERBOSE t1 = MPI_Wtime(); printf("VERBOSE: total setup time for rank 0 = %f seconds\n",t1-t0); printf(HLINE); #endif } /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ // This code has more barriers and timing calls than are actually needed, but // this should not cause a problem for arrays that are large enough to satisfy // the STREAM run rules. 
scalar = SCALAR; for (k=0; k<NTIMES; k++) { // kernel 1: Copy MPI_Barrier(MPI_COMM_WORLD); t0 = MPI_Wtime(); #ifdef TUNED tuned_STREAM_Copy(); #else #pragma omp parallel for for (j=0; j<array_elements; j++) c[j] = a[j]; #endif MPI_Barrier(MPI_COMM_WORLD); t1 = MPI_Wtime(); times[0][k] = t1 - t0; // kernel 2: Scale MPI_Barrier(MPI_COMM_WORLD); t0 = MPI_Wtime(); #ifdef TUNED tuned_STREAM_Scale(scalar); #else #pragma omp parallel for for (j=0; j<array_elements; j++) b[j] = scalar*c[j]; #endif MPI_Barrier(MPI_COMM_WORLD); t1 = MPI_Wtime(); times[1][k] = t1-t0; // kernel 3: Add MPI_Barrier(MPI_COMM_WORLD); t0 = MPI_Wtime(); #ifdef TUNED tuned_STREAM_Add(); #else #pragma omp parallel for for (j=0; j<array_elements; j++) c[j] = a[j]+b[j]; #endif MPI_Barrier(MPI_COMM_WORLD); t1 = MPI_Wtime(); times[2][k] = t1-t0; // kernel 4: Triad MPI_Barrier(MPI_COMM_WORLD); t0 = MPI_Wtime(); #ifdef TUNED tuned_STREAM_Triad(scalar); #else #pragma omp parallel for for (j=0; j<array_elements; j++) a[j] = b[j]+scalar*c[j]; #endif MPI_Barrier(MPI_COMM_WORLD); t1 = MPI_Wtime(); times[3][k] = t1-t0; } t0 = MPI_Wtime(); /* --- SUMMARY --- */ // Because of the MPI_Barrier() calls, the timings from any thread are equally valid. // The best estimate of the maximum performance is the minimum of the "outside the barrier" // timings across all the MPI ranks. // Gather all timing data to MPI rank 0 MPI_Gather(times, 4*NTIMES, MPI_DOUBLE, TimesByRank, 4*NTIMES, MPI_DOUBLE, 0, MPI_COMM_WORLD); // Rank 0 processes all timing data if (myrank == 0) { // for each iteration and each kernel, collect the minimum time across all MPI ranks // and overwrite the rank 0 "times" variable with the minimum so the original post- // processing code can still be used. for (k=0; k<NTIMES; k++) { for (j=0; j<4; j++) { tmin = 1.0e36; for (i=0; i<numranks; i++) { // printf("DEBUG: Timing: iter %d, kernel %lu, rank %d, tmin %f, TbyRank %f\n",k,j,i,tmin,TimesByRank[4*NTIMES*i+j*NTIMES+k]); tmin = MIN(tmin, TimesByRank[4*NTIMES*i+j*NTIMES+k]); } // printf("DEBUG: Final Timing: iter %d, kernel %lu, final tmin %f\n",k,j,tmin); times[j][k] = tmin; } } // Back to the original code, but now using the minimum global timing across all ranks for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } // note that "bytes[j]" is the aggregate array size, so no "numranks" is needed here printf("Function Best Rate MB/s Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%11.1f %11.6f %11.6f %11.6f\n", label[j], 1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]); } printf(HLINE); } /* --- Every Rank Checks its Results --- */ #ifdef INJECTERROR a[11] = 100.0 * a[11]; #endif computeSTREAMerrors(&AvgError[0], &AvgError[1], &AvgError[2]); /* --- Collect the Average Errors for Each Array on Rank 0 --- */ MPI_Gather(AvgError, 3, MPI_DOUBLE, AvgErrByRank, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD); /* -- Combined averaged errors and report on Rank 0 only --- */ if (myrank == 0) { #ifdef VERBOSE for (k=0; k<numranks; k++) { printf("VERBOSE: rank %d, AvgErrors %e %e %e\n",k,AvgErrByRank[3*k+0], AvgErrByRank[3*k+1],AvgErrByRank[3*k+2]); } #endif checkSTREAMresults(AvgErrByRank,numranks); printf(HLINE); } #ifdef VERBOSE if (myrank == 0) { t1 = MPI_Wtime(); printf("VERBOSE: total shutdown time for rank %d = %f seconds\n",myrank,t1-t0); } #endif free(a); free(b); 
free(c); if (myrank == 0) { free(TimesByRank); free(AvgErrByRank); } MPI_Finalize(); return(0); }
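/* computeSTREAMerrors() and the per-rank checkSTREAMresults() used by the MPI variant
   above are defined outside this listing. In the reference stream_mpi.c the per-rank
   error computation replays the kernels on scalar stand-ins and averages the absolute
   deviation over the local array_elements; the sketch below follows that structure
   (assuming the globals a, b, c, array_elements, SCALAR, NTIMES, and STREAM_TYPE). */
#include <math.h>

void computeSTREAMerrors(STREAM_TYPE *aAvgErr, STREAM_TYPE *bAvgErr, STREAM_TYPE *cAvgErr)
{
    STREAM_TYPE aj, bj, cj, scalar;
    STREAM_TYPE aSumErr = 0.0, bSumErr = 0.0, cSumErr = 0.0;
    ssize_t j;
    int k;

    /* Reproduce initialization, the timer-granularity doubling of a[],
       and the NTIMES kernel iterations on scalar stand-ins. */
    aj = 1.0; bj = 2.0; cj = 0.0;
    aj = 2.0E0 * aj;
    scalar = SCALAR;
    for (k = 0; k < NTIMES; k++) {
        cj = aj;
        bj = scalar * cj;
        cj = aj + bj;
        aj = bj + scalar * cj;
    }

    /* Accumulate the deviation of this rank's local slice of each array. */
    for (j = 0; j < array_elements; j++) {
        aSumErr += fabs(a[j] - aj);
        bSumErr += fabs(b[j] - bj);
        cSumErr += fabs(c[j] - cj);
    }
    *aAvgErr = aSumErr / (STREAM_TYPE) array_elements;
    *bAvgErr = bSumErr / (STREAM_TYPE) array_elements;
    *cAvgErr = cSumErr / (STREAM_TYPE) array_elements;
}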
int main() { int quantum, checktick(); int BytesPerWord; register int j, k; double scalar, t, times[4][NTIMES]; /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); printf("STREAM version $Revision: 5.9 $\n"); printf(HLINE); BytesPerWord = sizeof(double); printf("This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); printf(HLINE); printf("Array size = %llu, Offset = %d\n", (unsigned long long) N, OFFSET); printf("Total memory required = %.1f MB.\n", (3.0 * BytesPerWord) * ( (double) N / 1048576.0)); printf("Each test is run %d times, but only\n", NTIMES); printf("the *best* time for each is used.\n"); printf(HLINE); /* Get initial value for system clock. */ for (j=0; j<N; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } printf(HLINE); if ( (quantum = checktick()) >= 1) printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); else { printf("Your clock granularity appears to be less than one microsecond.\n"); quantum = 1; } t = mysecond(); for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); printf("Each test below will take on the order of %d microseconds.\n", (int) t ); printf(" (= %d clock ticks)\n", (int) (t/quantum) ); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); printf("WARNING -- The above is only a rough guideline.\n"); printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ printf("--- MAIN LOOP --- repeat test cases %d times ---\n", NTIMES); printf(HLINE); scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); for (j=0; j<N; j++) c[j] = a[j]; times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); for (j=0; j<N; j++) b[j] = scalar*c[j]; times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); for (j=0; j<N; j++) c[j] = a[j]+b[j]; times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; times[3][k] = mysecond() - times[3][k]; // printf("Iteration(%i): Copy(%.4f), Scale(%.4f), Add(%.4f), Triad(%.4f)\n", // k, times[0][k], times[1][k], times[2][k], times[3][k]); } /* --- SUMMARY --- */ printf("--- SUMMARY ---\n"); printf(HLINE); for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } printf("Function Rate (MB/s) Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]); } printf(HLINE); /* --- Check Results --- */ printf("--- Check Results ---\n"); printf(HLINE); checkSTREAMresults(); printf(HLINE); return 0; }
int main(int argc, char **argv) { int quantum, checktick(); int BytesPerWord; int k; ssize_t j; STREAM_TYPE scalar; double t, times[4][NTIMES]; #ifdef ENABLE_DYNAMIC_ALLOC int err = 0; memkind_t kind; char err_msg[ERR_MSG_SIZE]; if (argc > 1 && (strncmp("--help", argv[1], strlen("--help")) == 0 || strncmp("-h", argv[1], strlen("-h")) == 0)) { printf("Usage: %s [memkind_default | memkind_hbw | memkind_hbw_hugetlb | \n" " memkind_hbw_preferred | memkind_hbw_preferred_hugetlb | \n" " memkind_hbw_gbtlb | memkind_hbw_preferred_gbtlb | memkind_gbtlb | \n" " memkind_hbw_interleave | memkind_interleave]\n", argv[0]); return 0; } #endif /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); printf("STREAM version $Revision: 5.10 $\n"); #ifdef ENABLE_DYNAMIC_ALLOC printf("Variant that uses the memkind library for dynamic memory allocation.\n"); #endif printf(HLINE); BytesPerWord = sizeof(STREAM_TYPE); printf("This system uses %d bytes per array element.\n", BytesPerWord); printf(HLINE); #ifdef N printf("***** WARNING: ******\n"); printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); printf("***** WARNING: ******\n"); #endif printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); printf("Memory per array = %.1f MiB (= %.1f GiB).\n", BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); printf("Total memory required = %.1f MiB (= %.1f GiB).\n", (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); printf("Each kernel will be executed %d times.\n", NTIMES); printf(" The *best* time for each kernel (excluding the first iteration)\n"); printf(" will be used to compute the reported bandwidth.\n"); #ifdef _OPENMP printf(HLINE); #pragma omp parallel { #pragma omp master { k = omp_get_num_threads(); printf ("Number of Threads requested = %i\n",k); } } #endif #ifdef _OPENMP k = 0; #pragma omp parallel #pragma omp atomic k++; printf ("Number of Threads counted = %i\n",k); #endif #ifdef ENABLE_DYNAMIC_ALLOC if (argc > 1) { err = memkind_get_kind_by_name(argv[1], &kind); } else { err = memkind_get_kind_by_name("memkind_default", &kind); } if (err) { memkind_error_message(err, err_msg, ERR_MSG_SIZE); fprintf(stderr, "ERROR: %s\n", err_msg); return -1; } err = memkind_posix_memalign(kind, (void **)&a, 2097152, BytesPerWord * (STREAM_ARRAY_SIZE + OFFSET)); if (err) { fprintf(stderr, "ERROR: Unable to allocate stream array a\n"); return -err; } err = memkind_posix_memalign(kind, (void **)&b, 2097152, BytesPerWord * (STREAM_ARRAY_SIZE + OFFSET)); if (err) { fprintf(stderr, "ERROR: Unable to allocate stream array b\n"); return -err; } err = memkind_posix_memalign(kind, (void **)&c, 2097152, BytesPerWord * (STREAM_ARRAY_SIZE + OFFSET)); if (err) { fprintf(stderr, "ERROR: Unable to allocate stream array c\n"); return -err; } #endif /* Get initial value for system clock. 
*/ #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } printf(HLINE); if ( (quantum = checktick()) >= 1) printf("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else { printf("Your clock granularity appears to be " "less than one microsecond.\n"); quantum = 1; } t = mysecond(); #pragma omp parallel for for (j = 0; j < STREAM_ARRAY_SIZE; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); printf("Each test below will take on the order" " of %d microseconds.\n", (int) t ); printf(" (= %d clock ticks)\n", (int) (t/quantum) ); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); printf("WARNING -- The above is only a rough guideline.\n"); printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #ifdef TUNED tuned_STREAM_Copy(); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #ifdef TUNED tuned_STREAM_Scale(scalar); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #ifdef TUNED tuned_STREAM_Add(); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #ifdef TUNED tuned_STREAM_Triad(scalar); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } printf("Function Best Rate MB/s Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%12.1f %11.6f %11.6f %11.6f\n", label[j], 1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]); } printf(HLINE); /* --- Check Results --- */ checkSTREAMresults(); printf(HLINE); #ifdef ENABLE_DYNAMIC_ALLOC memkind_free(kind, c); memkind_free(kind, b); memkind_free(kind, a); #endif return 0; }
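/* checktick(), declared in several of the mains above, estimates the timer granularity
   in microseconds. The reference implementation samples the clock until it observes a
   fixed number of distinct values and reports the smallest positive difference; a
   self-contained sketch (assuming mysecond() as defined earlier) follows. */
#define CHECKTICK_SAMPLES 20
#ifndef MIN
#define MIN(x,y) ((x) < (y) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x,y) ((x) > (y) ? (x) : (y))
#endif

int checktick(void)
{
    int i, minDelta, Delta;
    double t1, t2, timesfound[CHECKTICK_SAMPLES];

    /* Collect a sequence of distinct time values from the system clock. */
    for (i = 0; i < CHECKTICK_SAMPLES; i++) {
        t1 = mysecond();
        while (((t2 = mysecond()) - t1) < 1.0e-6)
            ;
        timesfound[i] = t1 = t2;
    }

    /* The smallest positive gap between consecutive samples approximates
       the clock granularity, reported in microseconds. */
    minDelta = 1000000;
    for (i = 1; i < CHECKTICK_SAMPLES; i++) {
        Delta = (int) (1.0e6 * (timesfound[i] - timesfound[i - 1]));
        minDelta = MIN(minDelta, MAX(Delta, 0));
    }
    return minDelta;
}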