예제 #1
0
/**
 * Print the per-kernel results table, then run the result check.
 *
 * For each kernel index from eCopy through eTriad the accumulated avgtime[]
 * entry is turned into a mean by dividing by (NTIMES-1), and one row is
 * printed: label, rate, and the average / minimum / maximum times, each
 * divided by freq.
 *
 * NOTE: avgtime[] is overwritten in place, so calling this a second time
 * without re-accumulating the timings divides the averages again.
 */
void cStream::outputSummary()
{
  cout << "Function      Rate (MB/s)   Avg time     Min time     Max time"
       << endl;

  // The loop index is a plain local int: the `register` storage class was
  // removed in C++17 and is rejected (or warned about) by current compilers.
  for ( int j = eCopy; j <= eTriad; j++)
  {
    // Convert the running sum into a mean over the counted iterations.
    avgtime[j] = avgtime[j]/(double)(NTIMES-1);

    cout << label[j]
         << fixed
         << setw( 14 )
         << setprecision( 3 )
         << ( 1.0E-06 * bytes[j] ) / ( mintime[j] / freq )
         << setw( 11 )
         << avgtime[j] / freq
         << setw( 13 )
         << mintime[j] / freq
         << setw( 13 )
         << maxtime[j] / freq
         << endl;
  }
  cout << HLINE;

  /* --- Check Results --- */
  checkSTREAMresults();
  cout << HLINE;
} // void cStream::outputSummary()
예제 #2
0
/**
 * Run one complete pass of the tuned benchmark.
 *
 * The member functions below are invoked in this fixed order; each one is
 * called exactly once and takes no arguments.
 */
void cStream::runBenchmarkTuned()
{
  // Set-up steps.
  initializeVariables();
  runChecks();

  // Kernel execution followed by the result check.
  runTunedTests();
  checkSTREAMresults();

  // Post-processing and reporting.
  calculateBandwidthResults();
  outputSummary();
} // void cStream::runBenchmarkTuned()
예제 #3
0
/*
 * Entry point of the XcalableMP STREAM driver.
 *
 * Usage: ./STREAM <number of vector elements>
 *
 * Parses the element count, allocates and initializes the global arrays
 * a, b and c, runs HPCC_Stream(), prints the Triad figure and the OpenMP
 * thread count on node p(1), and finally runs checkSTREAMresults().
 *
 * Returns 0 on success and 1 on a usage, argument or allocation error.
 */
int main(int argc, char **argv)
{
  int num_nodes = xmp_num_nodes();

  /* Set parameters */
  if(argc != 2){
#pragma xmp task on p(1)
    fprintf(stderr, "./STREAM (number of vector).\ne.g../STREAM 1000\n");
    return 1;
  }
  array_elements = atoi(argv[1]);

  /* atoi() yields 0 for non-numeric input and may yield a negative value;
     either would produce a bogus allocation size below, so reject them. */
  if(array_elements <= 0){
#pragma xmp task on p(1)
    fprintf(stderr, "The number of vector elements must be a positive integer.\n");
    return 1;
  }

  /* Malloc arrays */
  a = malloc(sizeof(double) * array_elements);
  b = malloc(sizeof(double) * array_elements);
  c = malloc(sizeof(double) * array_elements);

  /* Allocation can fail on any node, so this message is not restricted
     to p(1). free(NULL) is a no-op, which keeps the cleanup simple. */
  if(a == NULL || b == NULL || c == NULL){
    fprintf(stderr, "Failed to allocate 3 arrays of %d doubles.\n", array_elements);
    free(a);
    free(b);
    free(c);
    return 1;
  }

  /* Initialize arrays */
#pragma omp parallel for
  for(int j=0;j<array_elements;j++){
    a[j] = 1.0;
    b[j] = 2.0;
    c[j] = 0.0;
  }

  /* Execute STREAM */
  double triadGBs = HPCC_Stream();

#pragma xmp task on p(1)
  printf("[Vector size is %d] Total Triad %.4f GB/s on %d nodes\n", 
	 array_elements, triadGBs, num_nodes);

  /* NOTE(review): including <omp.h> inside a function body is unusual; it is
     left in place because the file's include block is not visible here. */
#include <omp.h>
#pragma xmp task on p(1)
  {
#pragma omp parallel
    {
#pragma omp single
      printf("Number of Threads requested = %d\n", omp_get_num_threads());
    }
  }
  
  /* Verification */
  checkSTREAMresults(num_nodes);

  /* Release the arrays only after verification has read them. */
  free(a);
  free(b);
  free(c);

  return 0;
}
예제 #4
0
/*
 * Run the four STREAM kernels (Copy, Scale, Add, Triad) NTIMES times each on
 * the global arrays a, b and c of VectorSize doubles and report the rates.
 *
 * params    - receives the thread count in params->StreamThreads when built
 *             with OpenMP; params->nowASCII is printed in the report header.
 * doIO      - non-zero to print the report (currently to stdout).
 * copyGBs, scaleGBs, addGBs, triadGBs
 *           - out: rate of each kernel computed from its minimum time, or a
 *             negative value when that minimum time is not positive.
 * failure   - forwarded unchanged to checkSTREAMresults().
 *
 * Returns 0 on success and 1 when the output stream or the arrays cannot be
 * set up.
 *
 * The first iteration of every kernel is excluded from the avg/min/max
 * statistics.
 */
int
HPCC_Stream(HPCC_Params *params, int doIO, double *copyGBs, double *scaleGBs, double *addGBs,
  double *triadGBs, int *failure) {
    int   quantum;
    int   BytesPerWord;
    register int j, k;
    double  scalar, t, times[4][NTIMES];
    /* Initialized so that a defined value is passed to checkSTREAMresults()
       even when doIO is 0. */
    FILE *outFile = NULL;
    double GiBs = 1073741824.0, curGBs;

    if (doIO) {
      // outFile = fopen( params->outFname, "w+" );
	  outFile = stdout;
      /* Cannot trigger while outFile is stdout; kept for the fopen() variant. */
      if (! outFile) {
        outFile = stderr;
        fprintf( outFile, "Cannot open output file.\n" );
        return 1;
      }
    }

    // VectorSize = HPCC_LocalVectorSize( params, 3, sizeof(double), 0 ); /* Need 3 vectors */
	// HARDCODED VectorSize
    // params->StreamVectorSize = VectorSize;

    a = HPCC_XMALLOC( double, VectorSize );
    b = HPCC_XMALLOC( double, VectorSize );
    c = HPCC_XMALLOC( double, VectorSize );

    if (!a || !b || !c) {
      if (c) HPCC_free(c);
      if (b) HPCC_free(b);
      if (a) HPCC_free(a);
      if (doIO) {
        fprintf( outFile, "Failed to allocate memory (%lu).\n", VectorSize );
        fflush( outFile );
        /* Never close the process-wide standard streams: later output from
           the caller would be lost. Only a stream opened here is closed. */
        if (outFile != stdout && outFile != stderr)
          fclose( outFile );
      }
      return 1;
    }

    /* --- SETUP --- determine precision and check timing --- */

    if (doIO) {
    fprintf (outFile, "Generated on %s\n", params->nowASCII);
    fprintf( outFile, HLINE);
    BytesPerWord = sizeof(double);
    fprintf( outFile, "This system uses %d bytes per DOUBLE PRECISION word.\n",
             BytesPerWord);

    fprintf( outFile, HLINE);
    fprintf( outFile, "Array size = %lu, Offset = %d\n" , VectorSize, OFFSET);
    fprintf( outFile, "Total memory required = %.4f GiB.\n",
             (3.0 * BytesPerWord) * ( (double) VectorSize / GiBs));
    fprintf( outFile, "Each test is run %d times, but only\n", NTIMES);
    fprintf( outFile, "the *best* time for each is used.\n");
	fflush ( outFile);
    }

#ifdef _OPENMP
    if (doIO) fprintf( outFile, HLINE);
#pragma omp parallel private(k)
    {
#pragma omp single nowait
      {
        /* Exactly one thread records and reports the team size. */
        k = omp_get_num_threads();
        if (doIO) fprintf( outFile, "Number of Threads requested = %i\n",k);
        params->StreamThreads = k;
      }
    }
#endif

    /* Initialize the three arrays. */
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j=0; j<VectorSize; j++) {
      a[j] = 1.0;
      b[j] = 2.0;
      c[j] = 0.0;
    }

    if (doIO) fprintf( outFile, HLINE);

    if  ( (quantum = checktick()) >= 1) {
      if (doIO) fprintf( outFile, "Your clock granularity/precision appears to be "
                         "%d microseconds.\n", quantum);
    } else {
      if (doIO) fprintf( outFile, "Your clock granularity appears to be "
                         "less than one microsecond.\n");
      /* quantum is used as a divisor below; clamp it so a sub-microsecond
         clock cannot cause a division by zero. */
      quantum = 1;
    }

    /* Time one simple pass over a[] to estimate the cost of a single test. */
    t = mysecond();
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j = 0; j < VectorSize; j++)
      a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    if (doIO) {
    fprintf( outFile, "Each test below will take on the order"
             " of %d microseconds.\n", (int) t  );
    fprintf( outFile, "   (= %d clock ticks)\n", (int) (t/quantum) );
    fprintf( outFile, "Increase the size of the arrays if this shows that\n");
    fprintf( outFile, "you are not getting at least 20 clock ticks per test.\n");

    fprintf( outFile, HLINE);

    fprintf( outFile, "WARNING -- The above is only a rough guideline.\n");
    fprintf( outFile, "For best results, please be sure you know the\n");
    fprintf( outFile, "precision of your system timer.\n");
    fprintf( outFile, HLINE);
    }

    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

    /* times[kernel][iteration]: 0 = Copy, 1 = Scale, 2 = Add, 3 = Triad.
       Each entry first holds the start stamp, then the elapsed time. */
    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
    {
      times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          c[j] = a[j];
#endif
        times[0][k] = mysecond() - times[0][k];

        times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          b[j] = scalar*c[j];
#endif
        times[1][k] = mysecond() - times[1][k];

        times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          c[j] = a[j]+b[j];
#endif
        times[2][k] = mysecond() - times[2][k];

        times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          a[j] = b[j]+scalar*c[j];
#endif
        times[3][k] = mysecond() - times[3][k];
    }

    /* --- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
    {
      for (j=0; j<4; j++)
      {
        avgtime[j] = avgtime[j] + times[j][k];
        mintime[j] = Mmin(mintime[j], times[j][k]);
        maxtime[j] = Mmax(maxtime[j], times[j][k]);
      }
    }

    if (doIO)
    fprintf( outFile, "Function      Rate (GB/s)   Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
      avgtime[j] /= (double)(NTIMES - 1); /* note -- skip first iteration */

      /* make sure no division by zero */
      curGBs = (mintime[j] > 0.0 ? 1.0 / mintime[j] : -1.0);
      curGBs *= 1e-9 * bytes[j] * VectorSize;
        if (doIO)
          fprintf( outFile, "%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
                   curGBs,
                   avgtime[j],
                   mintime[j],
                   maxtime[j]);
        switch (j) {
          case 0: *copyGBs = curGBs; break;
          case 1: *scaleGBs = curGBs; break;
          case 2: *addGBs = curGBs; break;
          case 3: *triadGBs = curGBs; break;
        }
    }
    if (doIO) fprintf( outFile, HLINE);

    /* --- Check Results --- */
    checkSTREAMresults( outFile, doIO, failure );
    if (doIO) fprintf( outFile, HLINE);

    HPCC_free(c);
    HPCC_free(b);
    HPCC_free(a);

    if (doIO) {
      fflush( outFile );
      /* See the allocation-failure path: stdout/stderr must stay open. */
      if (outFile != stdout && outFile != stderr)
        fclose( outFile );
    }

    return 0;
}
예제 #5
0
/*
 * STREAM benchmark driver.
 *
 * Prints the configuration (element size, array size, memory footprint,
 * thread counts when built with OpenMP), initializes the arrays a, b and c,
 * estimates the timer granularity, then times the Copy, Scale, Add and Triad
 * kernels NTIMES times each. The first iteration of every kernel is left out
 * of the avg/min/max statistics. Finally prints one result row per kernel
 * and calls checkSTREAMresults().
 *
 * Always returns 0.
 */
int
main()
{
    int     checktick(void);
    int			quantum;
    int			BytesPerWord;
    int			k;
    ssize_t		j;
    STREAM_TYPE		scalar;
    /* times[kernel][iteration]: 0 = Copy, 1 = Scale, 2 = Add, 3 = Triad */
    double		t, times[4][NTIMES];

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    printf("STREAM version $Revision: 5.10 $\n");
    printf(HLINE);
    BytesPerWord = sizeof(STREAM_TYPE);
    printf("This system uses %d bytes per array element.\n",
	BytesPerWord);

    printf(HLINE);
#ifdef N
    /* N is not used for sizing here; warn that STREAM_ARRAY_SIZE applies. */
    printf("*****  WARNING: ******\n");
    printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
    printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
    printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
    printf("*****  WARNING: ******\n");
#endif

    printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
    printf("Memory per array = %.1f MiB (= %.1f GiB).\n", 
	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
    printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
    printf("Each kernel will be executed %d times.\n", NTIMES);
    printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
    printf(" will be used to compute the reported bandwidth.\n");

#ifdef _OPENMP
    printf(HLINE);
    /* Report the team size as seen by the master thread. */
#pragma omp parallel 
    {
#pragma omp master
	{
	    k = omp_get_num_threads();
	    printf ("Number of Threads requested = %i\n",k);
        }
    }
#endif

#ifdef _OPENMP
    /* Cross-check: every thread of the team increments k once, atomically. */
	k = 0;
#pragma omp parallel
#pragma omp atomic 
		k++;
    printf ("Number of Threads counted = %i\n",k);
#endif

    /* Initialize the three arrays. */
#pragma omp parallel for
    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
	    a[j] = 1.0;
	    b[j] = 2.0;
	    c[j] = 0.0;
	}

    printf(HLINE);

    if  ( (quantum = checktick()) >= 1) 
	printf("Your clock granularity/precision appears to be "
	    "%d microseconds.\n", quantum);
    else {
	printf("Your clock granularity appears to be "
	    "less than one microsecond.\n");
	/* quantum is a divisor below, so clamp it to 1. */
	quantum = 1;
    }

    /* Time one pass that doubles a[] to estimate the cost of a single test
       in microseconds. */
    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
		a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    printf("Each test below will take on the order"
	" of %d microseconds.\n", (int) t  );
    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING -- The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);
    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    /* Each times[][] slot first holds the start stamp and is then replaced
       by the elapsed time of that kernel run. */
    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	/* Copy: c = a */
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];
	
	/* Scale: b = scalar * c */
	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];
	
	/* Add: c = a + b */
	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];
	
	/* Triad: a = b + scalar * c */
	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    /* Accumulate sum/min/max per kernel into the avgtime, mintime and
       maxtime arrays, which are defined outside this function. */
    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
    
    printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
		/* Turn the sum into a mean over the NTIMES-1 counted iterations. */
		avgtime[j] = avgtime[j]/(double)(NTIMES-1);

		/* The rate is derived from the minimum (best) time. */
		printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       avgtime[j],
	       mintime[j],
	       maxtime[j]);
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

    return 0;
}
예제 #6
0
파일: ex2.c 프로젝트: feelpp/debian-petsc
/*
 * PETSc variant of the STREAM benchmark driver.
 *
 * Initializes PETSc, prints the configuration, sets up the arrays a, b and c
 * (heap-allocated unless STATIC_ALLOC is set), times the Copy, Scale, Add and
 * Triad kernels NTIMES times each (through PetscThreadComm kernels when
 * WITH_PTHREADS is set, plain loops otherwise), prints the rate of each
 * kernel computed from its minimum time, and calls checkSTREAMresults().
 * The first iteration of every kernel is left out of the statistics.
 *
 * Returns 0 on success, a PETSc error code if a PETSc call fails, or 1 if
 * the arrays cannot be allocated.
 */
int main(int argc,char *argv[])
{
  PetscErrorCode ierr;
  int            quantum, checktick();
  int            BytesPerWord;
  int            j, k;
  /* times[kernel][iteration]: 0 = Copy, 1 = Scale, 2 = Add, 3 = Triad */
  double         scalar=3.0, t, times[4][NTIMES];

  /* Nothing below may run if PETSc failed to start. */
  ierr = PetscInitialize(&argc,&argv,0,help);
  if (ierr) return ierr;
  /* --- SETUP --- determine precision and check timing --- */

  /*printf(HLINE);
    printf("STREAM version $Revision: 5.9 $\n");
    printf(HLINE); */
  BytesPerWord = sizeof(double);
  printf("This system uses %d bytes per DOUBLE PRECISION word.\n",BytesPerWord);

  printf(HLINE);
#if defined(NO_LONG_LONG)
  printf("Array size = %d, Offset = %d\n", N, OFFSET);
#else
  printf("Array size = %llu, Offset = %d\n", (unsigned long long) N, OFFSET);
#endif

  printf("Total memory required = %.1f MB.\n",(3.0 * BytesPerWord) * ((double)N / 1048576.0));
  printf("Each test is run %d times, but only\n", NTIMES);
  printf("the *best* time for each is used.\n");

  printf(HLINE);

#if !STATIC_ALLOC
  a = malloc((N+OFFSET)*sizeof(double));
  b = malloc((N+OFFSET)*sizeof(double));
  c = malloc((N+OFFSET)*sizeof(double));
  /* The kernels write to all three arrays unconditionally, so stop here
     rather than dereference a null pointer. free(NULL) is a no-op. */
  if (!a || !b || !c) {
    fprintf(stderr,"Failed to allocate the STREAM arrays.\n");
    free(a);
    free(b);
    free(c);
    PetscFinalize();
    return 1;
  }
#endif

#if WITH_PTHREADS
  ierr = PetscThreadCommGetNThreads(PETSC_COMM_WORLD,&nworkThreads);CHKERRQ(ierr);
  ierr = PetscMalloc((nworkThreads+1)*sizeof(PetscInt),&trstarts);CHKERRQ(ierr);
  PetscInt  Q,R,nloc;
  PetscBool S;
  /* Split N+OFFSET elements into nworkThreads contiguous ranges; the first
     R ranges get one extra element. trstarts[i] is the start of range i. */
  Q           = (N+OFFSET)/nworkThreads;
  R           = (N+OFFSET) - Q*nworkThreads;
  trstarts[0] = 0;
  for (j=0; j < nworkThreads; j++) {
    S             = (PetscBool)(j < R);
    nloc          = S ? Q+1 : Q;
    trstarts[j+1] = trstarts[j]+nloc;
  }
  ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Initialize_Kernel,1,&scalar);CHKERRQ(ierr);
  ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
# else
  for (j=0; j<N; j++) {
    a[j] = 1.0;
    b[j] = 2.0;
    c[j] = 0.0;
  }
#endif

  /*printf(HLINE);*/

  /* Estimate the clock granularity and clamp it to at least 1; the
     corresponding report lines are currently disabled. */
  quantum = checktick();
  if (quantum < 1) quantum = 1;
  /*      printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */
  /*   printf("Your clock granularity appears to be less than one microsecond.\n"); */

  /* Time one pass that doubles a[]; the report of t is disabled below. */
  t = mysecond();

#if WITH_PTHREADS
  ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_2A_Kernel,0);CHKERRQ(ierr);
  ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
  for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
#endif
  t = 1.0E6 * (mysecond() - t);

  /*    printf("Each test below will take on the order of %d microseconds.\n", (int)t);
  printf("   (= %d clock ticks)\n", (int) (t/quantum));
  printf("Increase the size of the arrays if this shows that\n");
  printf("you are not getting at least 20 clock ticks per test.\n");

  printf(HLINE);
  */
  /*  --- MAIN LOOP --- repeat test cases NTIMES times --- */

  /* Each times[][] slot first holds the start stamp and is then replaced by
     the elapsed time of that kernel run. */
  for (k=0; k<NTIMES; k++) {
    times[0][k] = mysecond();
#if WITH_PTHREADS
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Copy_Kernel,0);CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
    for (j=0; j<N; j++) c[j] = a[j];
#endif
    times[0][k] = mysecond() - times[0][k];

    times[1][k] = mysecond();
#if WITH_PTHREADS
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Scale_Kernel,1,&scalar);CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
    for (j=0; j<N; j++) b[j] = scalar*c[j];
#endif
    times[1][k] = mysecond() - times[1][k];

    times[2][k] = mysecond();
#if WITH_PTHREADS
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Add_Kernel,0);CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
    for (j=0; j<N; j++) c[j] = a[j]+b[j];
#endif
    times[2][k] = mysecond() - times[2][k];

    times[3][k] = mysecond();
#if WITH_PTHREADS
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Triad_Kernel,1,&scalar);CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
    for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j];
#endif
    times[3][k] = mysecond() - times[3][k];
  }

  /*  --- SUMMARY --- */

  for (k=1; k<NTIMES; k++)     /* note -- skip first iteration */
    for (j=0; j<4; j++) {
      avgtime[j] = avgtime[j] + times[j][k];
      mintime[j] = MIN(mintime[j], times[j][k]);
      maxtime[j] = MAX(maxtime[j], times[j][k]);
    }

  printf("Function      Rate (MB/s) \n");
  for (j=0; j<4; j++) {
    avgtime[j] = avgtime[j]/(double)(NTIMES-1);

    /* Only the rate derived from the minimum time is reported. */
    printf("%s%11.4f  \n", label[j], 1.0E-06 * bytes[j]/mintime[j]);
  }
  /* printf(HLINE);*/
#if WITH_PTHREADS
  ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#endif
  /* --- Check Results --- */
  checkSTREAMresults();
  /*    printf(HLINE);*/

  /* Release what this function allocated, after the last use above. */
#if WITH_PTHREADS
  ierr = PetscFree(trstarts);CHKERRQ(ierr);
#endif
#if !STATIC_ALLOC
  free(a);
  free(b);
  free(c);
#endif

  /* Propagate a failure reported during PETSc shutdown. */
  ierr = PetscFinalize();
  return ierr;
}
예제 #7
0
/*
 * STREAM benchmark driver with most of the informational output disabled.
 *
 * Prints the configuration, initializes the arrays a, b and c, then times
 * the Copy, Scale, Add and Triad kernels NTIMES times each and accumulates
 * avg/min/max statistics, leaving out the first iteration of every kernel.
 * The per-kernel result rows are commented out, so only the table header is
 * printed before checkSTREAMresults() is called.
 *
 * Always returns 0.
 */
int
main()
    {
    int			quantum, checktick();
    int			BytesPerWord;
    register int	j, k;
    /* times[kernel][iteration]: 0 = Copy, 1 = Scale, 2 = Add, 3 = Triad */
    double		scalar, t, times[4][NTIMES];

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    BytesPerWord = sizeof(double);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
	BytesPerWord);

    printf(HLINE);
    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
    printf("Total memory required = %.1f MB.\n",
	(3.0 * BytesPerWord) * ( (double) N / 1048576.0));
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

#ifdef _OPENMP
    printf(HLINE);
    /* The thread-count report is disabled; the parallel region is now empty. */
#pragma omp parallel private(k)
    {
//    k = omp_get_num_threads();
  //  printf ("Number of Threads requested = %i\n",k);
    }
#endif

    /* Initialize the three arrays. */
#pragma omp parallel for
    for (j=0; j<N; j++) {
	a[j] = 1.0;
	b[j] = 2.0;
	c[j] = 0.0;
	}

    printf(HLINE);

    /* checktick() is still called, but both report branches are disabled.
       NOTE(review): quantum is not clamped to 1 here; it is only referenced
       by the commented-out tick-count print further down. */
    if  ( (quantum = checktick()) >= 1) {
//	printf("Your clock granularity/precision appears to be "
//	    "%d microseconds.\n", quantum);
    } else {
//	printf("Your clock granularity appears to be "
//	    "less than one microsecond.\n");
    }

    /* Time one pass that doubles a[]; the report of t is disabled below. */
    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < N; j++)
	a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

//    printf("Each test below will take on the order"
//	" of %d microseconds.\n", (int) t  );
//    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
    /* NOTE(review): the next two lines refer to a tick count that is no
       longer printed. */
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING -- The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);
    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    /* Each times[][] slot first holds the start stamp and is then replaced
       by the elapsed time of that kernel run. */
    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	/* Copy: c = a */
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];
	
	/* Scale: b = scalar * c */
	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];
	
	/* Add: c = a + b */
	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];
	
	/* Triad: a = b + scalar * c */
	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
    
    printf("Function      Rate (MB/s)   Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
	/* The means are still computed even though the row print is disabled. */
	avgtime[j] = avgtime[j]/(double)(NTIMES-1);

/*	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       avgtime[j],
	       mintime[j],
	       maxtime[j]);*/
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

    return 0;
}
예제 #8
0
파일: stream.c 프로젝트: pousa/minas
/*
 * STREAM benchmark driver with heap-allocated arrays and dynamically
 * scheduled OpenMP loops.
 *
 * Allocates the arrays a, b and c (through the MAi interface when MAI is
 * defined, malloc otherwise), prints the configuration, initializes the
 * arrays, then times the Copy, Scale, Add and Triad kernels NTIMES times
 * each using schedule(dynamic,chunk). The first iteration of every kernel is
 * left out of the avg/min/max statistics. Prints one result row per kernel
 * and calls checkSTREAMresults().
 *
 * Returns 0 on success and 1 if an array could not be allocated.
 */
int
main()
    {
    int			BytesPerWord;
    register int	j, k;
    /* times[kernel][iteration]: 0 = Copy, 1 = Scale, 2 = Add, 3 = Triad */
    double		scalar, t, times[4][NTIMES];

#ifdef MAI
    mai_init(NULL);
    a = mai_alloc_1D(N, sizeof(double),DOUBLE);
    b = mai_alloc_1D(N, sizeof(double),DOUBLE);
    c = mai_alloc_1D(N, sizeof(double),DOUBLE);
#else
    a = malloc(N*sizeof(double));
    b = malloc(N*sizeof(double));
    c = malloc(N*sizeof(double));
#endif

    /* Every kernel below writes to these arrays unconditionally, so stop
       before anything dereferences a null pointer.
       NOTE(review): assumes mai_alloc_1D() signals failure with NULL - confirm. */
    if (!a || !b || !c) {
	fprintf(stderr, "Failed to allocate the STREAM arrays.\n");
	return 1;
    }

#ifdef MAI
    mai_bind_columns(a);
    mai_bind_columns(b);
    mai_bind_columns(c);
#endif

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    printf("STREAM version $Revision: 5.9 $\n");
    printf(HLINE);
    BytesPerWord = sizeof(double);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
	BytesPerWord);

    printf(HLINE);

    printf("Total memory required = %.1f MB.\n",
	(3.0 * BytesPerWord) * ( (double) N / 1048576.0));
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

#ifdef _OPENMP
    printf(HLINE);
    /* Report the team size as seen by the master thread. */
#pragma omp parallel 
    {
#pragma omp master
	{
	    k = omp_get_num_threads();
	    printf ("Number of Threads requested = %i\n",k);
        }
    }
#endif

    /* Initialize the three arrays. */
#pragma omp parallel for
    for (j=0; j<N; j++) {
	a[j] = 1.0;
	b[j] = 2.0;
	c[j] = 0.0;
	}

    printf(HLINE);

#ifdef MAI
    mai_cyclic(a);
    mai_cyclic(b);
    mai_cyclic(c);
#endif

    /* Chunk size handed to schedule(dynamic, ...) in every loop below. */
    int chunk = 128;

    /* One untimed-for-reporting pass that doubles a[]; t is measured but
       not printed in this variant. */
    t = mysecond();
#pragma omp parallel for schedule(dynamic,chunk)
    for (j = 0; j < N; j++)
	a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    /* Each times[][] slot first holds the start stamp and is then replaced
       by the elapsed time of that kernel run. */
    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for  schedule(dynamic,chunk)
	for (j=0; j<N; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];
	
	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for  schedule(dynamic,chunk)
	for (j=0; j<N; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];
	
	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for  schedule(dynamic,chunk)
	for (j=0; j<N; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];
	
	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for  schedule(dynamic,chunk)
	for (j=0; j<N; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
    
    printf("Function      Rate (MB/s)   Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
	/* Turn the sum into a mean over the NTIMES-1 counted iterations. */
	avgtime[j] = avgtime[j]/(double)(NTIMES-1);

	/* The rate is derived from the minimum (best) time. */
	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       avgtime[j],
	       mintime[j],
	       maxtime[j]);
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

#ifndef MAI
    /* Release the malloc'd arrays once verification has read them. The MAi
       path is left alone because its release routine is not visible here. */
    free(a);
    free(b);
    free(c);
#endif

    return 0;
}
예제 #9
0
/*
 * Quiet STREAM benchmark driver: prints a single number.
 *
 * Initializes the arrays a, b and c, times the Copy, Scale, Add and Triad
 * kernels NTIMES times each, and accumulates avg/min/max statistics with the
 * first iteration of every kernel left out. Only the rate of kernel 0 (Copy),
 * computed from its minimum time, is printed before checkSTREAMresults() is
 * called.
 *
 * Always returns 0.
 */
int
main()
    {
    int			quantum, checktick();
    int			BytesPerWord;
    int			k;
    ssize_t		j;
    STREAM_TYPE		scalar;
    /* times[kernel][iteration]: 0 = Copy, 1 = Scale, 2 = Add, 3 = Triad */
    double		t, times[4][NTIMES];

    /* --- SETUP --- determine precision and check timing --- */

    BytesPerWord = sizeof(STREAM_TYPE);

    /* The two parallel regions below are kept from the verbose variant of
       this driver; their results are not reported here. */
#ifdef _OPENMP
#pragma omp parallel 
    {
#pragma omp master
	{
	    k = omp_get_num_threads();
        }
    }
#endif

#ifdef _OPENMP
	k = 0;
#pragma omp parallel
#pragma omp atomic 
		k++;
#endif

    /* Initialize the three arrays. */
#pragma omp parallel for
    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
	    a[j] = 1.0;
	    b[j] = 2.0;
	    c[j] = 0.0;
	}


    /* Clamp the measured granularity to at least 1. This replaces a
       placeholder printf("") in the other branch, which emitted nothing and
       only existed to give the if a body. */
    quantum = checktick();
    if (quantum < 1)
	quantum = 1;

    /* One pass that doubles a[]; t is measured but not printed here. */
    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
		a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    /* Each times[][] slot first holds the start stamp and is then replaced
       by the elapsed time of that kernel run. */
    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];
	
	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];
	
	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];
	
	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
    
    /* The loop bound of 1 restricts the report to kernel 0 (Copy). */
    for (j=0; j<1; j++) {
		avgtime[j] = avgtime[j]/(double)(NTIMES-1);

		printf("%11.6f\n", 1.0E-06 * bytes[j]/mintime[j]);
    }

    /* --- Check Results --- */
    checkSTREAMresults();

    return 0;
}
예제 #10
0
파일: stream.c 프로젝트: jeffhammond/oshmpi
/*
 * STREAM benchmark driver, OpenSHMEM variant.
 *
 * The arrays are processed in blocks of `blocksize` elements.  A PE works on
 * a block when its locally counted block number (count_p) equals the ticket
 * it last drew from the shared counter `gcounter` (via shmem_int_fadd on PE
 * ROOT); after every block a max-reduction over that block is issued across
 * all PEs.  Each of the four kernels (Copy, Scale, Add, Triad) is timed
 * NTIMES times; rank 0 prints the summary table and runs
 * checkSTREAMresults().
 *
 * Returns 0.
 */
int
main ()
{
  int quantum = -1, checktick ();
  int BytesPerWord;
  int k;
  int s;			/* index into the pSync arrays */
  ssize_t j, i;
  STREAM_TYPE scalar;
  /* Process-local ticket state: count_p counts the blocks walked so far,
     next_p is the ticket most recently drawn from gcounter. */
  int count_p = 0, next_p = 0;
  gcounter = 0;

  /* --- SETUP --- determine precision and check timing --- */

  printf (HLINE);
  printf ("STREAM version $Revision: 5.10 $\n");
  printf (HLINE);
  BytesPerWord = sizeof (STREAM_TYPE);
  printf ("This system uses %d bytes per array element.\n", BytesPerWord);
  /* SHMEM initialize */
  start_pes (0);
  _world_size = _num_pes ();
  _world_rank = _my_pe ();
  for (s = 0; s < _SHMEM_BARRIER_SYNC_SIZE; s++)
    {
      pSync0[s] = pSync1[s] = pSync2[s] = _SHMEM_SYNC_VALUE;
    }
  /* Every PE must have finished initializing its pSync arrays before any PE
     enters a collective that uses them; otherwise a late initialization can
     overwrite synchronization state already written by a faster PE. */
  shmem_barrier_all ();

  if (_world_rank == 0)
    {
      printf (HLINE);
#ifdef N
      printf ("*****  WARNING: ******\n");
      printf
	("      It appears that you set the preprocessor variable N when compiling this code.\n");
      printf
	("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
      printf ("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",
	      (unsigned long long) STREAM_ARRAY_SIZE);
      printf ("*****  WARNING: ******\n");
#endif

      printf ("Array size = %llu (elements), Offset = %d (elements)\n",
	      (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
      printf ("Memory per array = %.1f MiB (= %.1f GiB).\n",
	      BytesPerWord * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.0),
	      BytesPerWord * ((double) STREAM_ARRAY_SIZE / 1024.0 / 1024.0 /
			      1024.0));
      printf ("Total memory required = %.1f MiB (= %.1f GiB).\n",
	      (3.0 * BytesPerWord) * ((double) STREAM_ARRAY_SIZE / 1024.0 /
				      1024.),
	      (3.0 * BytesPerWord) * ((double) STREAM_ARRAY_SIZE / 1024.0 /
				      1024. / 1024.));
      printf ("Each kernel will be executed %d times.\n", NTIMES);
      printf
	(" The *best* time for each kernel (excluding the first iteration)\n");
      printf (" will be used to compute the reported bandwidth.\n");
      printf ("Number of SHMEM PEs requested = %i\n", _world_size);
    }

  /* The block loops below assume the array is a whole number of blocks. */
  int blocksize = 10000;
  assert (STREAM_ARRAY_SIZE % blocksize == 0);

  /* Give all three arrays their starting values on every PE. */
  for (j = 0; j < STREAM_ARRAY_SIZE; j++)
    {
      a[j] = 1.0;
      b[j] = 2.0;
      c[j] = 0.0;
    }

  if (_world_rank == 0)
    {
      /* Separator and timer report are rank-0 output only. */
      printf (HLINE);

      if ((quantum = checktick ()) >= 1)
	printf ("Your clock granularity/precision appears to be "
		"%d microseconds.\n", quantum);
      else
	{
	  printf ("Your clock granularity appears to be "
		  "less than one microsecond.\n");
	  quantum = 1;
	}
    }

  /* Rough timing estimate: one block-distributed pass that doubles a[]. */
  time_start = mysecond ();
  next_p = shmem_int_fadd (&gcounter, 1, ROOT);
  for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize)
    {
      if (next_p == count_p)
	{
	  for (i = j; i < (j + blocksize); i++)
	    {
	      a[i] = 2.0E0 * a[i];
	    }
	  next_p = shmem_int_fadd (&gcounter, 1, ROOT);
	}
      count_p++;
    }
  time_end = mysecond ();
  clock_time_PE = time_end - time_start;
  /* NOTE(review): this estimate is the SUM of the per-PE times, as in the
     original code -- confirm that is the intended figure. */
  shmem_double_sum_to_all (&total_clock_time, &clock_time_PE, 1,
			   0, 0, _world_size, pWrk0, pSync0);

  if (_world_rank == 0)
    {
      printf ("Each test below will take on the order"
	      " of %d microseconds.\n", (int) (total_clock_time * 1.0E6));
      printf ("   (= %d clock ticks)\n",
	      (int) ((1.0E6 * total_clock_time) / quantum));
      printf ("Increase the size of the arrays if this shows that\n");
      printf ("you are not getting at least 20 clock ticks per test.\n");

      printf (HLINE);

      printf ("WARNING -- The above is only a rough guideline.\n");
      printf ("For best results, please be sure you know the\n");
      printf ("precision of your system timer.\n");
      printf (HLINE);
    }

  /*      --- MAIN LOOP --- repeat test cases NTIMES times --- */

  scalar = 3.0;

  for (k = 0; k < NTIMES; k++)
    {
      /* Re-initialize the arrays at the start of every repetition (untimed);
         the original notes this is required for correctness when
         NTIMES > 1. */
      for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize)
	{
	  if (next_p == count_p)
	    {
	      for (i = j; i < (j + blocksize); i++)
		{
		  a[i] = 1.0;
		  b[i] = 2.0;
		  c[i] = 0.0;
		  a[i] = 2.0E0 * a[i];
		}
	      next_p = shmem_int_fadd (&gcounter, 1, ROOT);
	    }
	  count_p++;
	  shmem_double_max_to_all (a + j, a + j, blocksize, 0,
				   0, _world_size, pWrk1, pSync1);
	}
      shmem_barrier_all ();

      /* kernel 1: Copy */
      time_start = mysecond ();
      for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize)
	{
	  if (next_p == count_p)
	    {
	      for (i = j; i < (j + blocksize); i++)
		{
		  c[i] = a[i];
		}
	      next_p = shmem_int_fadd (&gcounter, 1, ROOT);
	    }
	  count_p++;
	  shmem_double_max_to_all (c + j, c + j, blocksize, 0,
				   0, _world_size, pWrk1, pSync1);
	}
      shmem_barrier_all ();
      time_end = mysecond () - time_start;
      shmem_double_max_to_all (&times[0][k], &time_end, 1,
			       0, 0, _world_size, pWrk0, pSync0);

      /* kernel 2: Scale */
      time_start = mysecond ();
      for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize)
	{
	  if (next_p == count_p)
	    {
	      for (i = j; i < (j + blocksize); i++)
		{
		  b[i] = scalar * c[i];
		}
	      next_p = shmem_int_fadd (&gcounter, 1, ROOT);
	    }
	  count_p++;
	  shmem_double_max_to_all (b + j, b + j, blocksize, 0,
				   0, _world_size, pWrk1, pSync1);
	}
      shmem_barrier_all ();
      time_end = mysecond () - time_start;
      /* Use the slowest PE's elapsed time, exactly as for Copy.  Summing the
         per-PE times would scale the recorded time with the number of PEs
         and make this kernel's rate incomparable with Copy's. */
      shmem_double_max_to_all (&times[1][k], &time_end, 1,
			       0, 0, _world_size, pWrk0, pSync0);

      /* kernel 3: Add */
      time_start = mysecond ();
      for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize)
	{
	  if (next_p == count_p)
	    {
	      for (i = j; i < (j + blocksize); i++)
		{
		  c[i] = a[i] + b[i];
		}
	      next_p = shmem_int_fadd (&gcounter, 1, ROOT);
	    }
	  count_p++;
	  shmem_double_max_to_all (c + j, c + j, blocksize, 0,
				   0, _world_size, pWrk1, pSync1);
	}
      shmem_barrier_all ();
      time_end = mysecond () - time_start;
      shmem_double_max_to_all (&times[2][k], &time_end, 1,
			       0, 0, _world_size, pWrk0, pSync0);

      /* kernel 4: Triad */
      time_start = mysecond ();
      for (j = 0; j < STREAM_ARRAY_SIZE; j += blocksize)
	{
	  if (next_p == count_p)
	    {
	      for (i = j; i < (j + blocksize); i++)
		{
		  a[i] = b[i] + scalar * c[i];
		}
	      next_p = shmem_int_fadd (&gcounter, 1, ROOT);
	    }
	  count_p++;
	  shmem_double_max_to_all (a + j, a + j, blocksize, 0,
				   0, _world_size, pWrk1, pSync1);
	}
      shmem_barrier_all ();
      time_end = mysecond () - time_start;
      shmem_double_max_to_all (&times[3][k], &time_end, 1,
			       0, 0, _world_size, pWrk0, pSync0);
    }

  shmem_barrier_all ();

  /*      --- SUMMARY --- */

  for (k = 1; k < NTIMES; k++)	/* note -- skip first iteration */
    {
      for (j = 0; j < 4; j++)
	{
	  avgtime[j] = avgtime[j] + times[j][k];
	  mintime[j] = MIN (mintime[j], times[j][k]);
	  maxtime[j] = MAX (maxtime[j], times[j][k]);
	}
    }

  if (_world_rank == 0)
    {
      printf
	("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
      for (j = 0; j < 4; j++)
	{
	  avgtime[j] = avgtime[j] / (double) (NTIMES - 1);

	  printf ("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
		  1.0E-06 * bytes[j] / mintime[j],
		  avgtime[j], mintime[j], maxtime[j]);
	}
      printf (HLINE);
    }
  /* --- Check Results --- */
  if (_world_rank == 0)
    {
      checkSTREAMresults ();
      printf (HLINE);
    }
  return 0;
}
예제 #11
0
int
main()
    {
    int			quantum, checktick();
    int			BytesPerWord;
    int			i,k;
    ssize_t		j;
    STREAM_TYPE		scalar;
    double		t, times[4][NTIMES];
	double		*TimesByRank;
	double		t0,t1,tmin;
	int         rc, numranks, myrank;
	STREAM_TYPE	AvgError[3] = {0.0,0.0,0.0};
	STREAM_TYPE *AvgErrByRank;

    /* --- SETUP --- call MPI_Init() before anything else! --- */

    rc = MPI_Init(NULL, NULL);
	t0 = MPI_Wtime();
    if (rc != MPI_SUCCESS) {
       printf("ERROR: MPI Initialization failed with return code %d\n",rc);
       exit(1);
    }
	// if either of these fail there is something really screwed up!
	MPI_Comm_size(MPI_COMM_WORLD, &numranks);
	MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    /* --- NEW FEATURE --- distribute requested storage across MPI ranks --- */
	array_elements = STREAM_ARRAY_SIZE / numranks;		// don't worry about rounding vs truncation
    array_alignment = 64;						// Can be modified -- provides partial support for adjusting relative alignment

	// Dynamically allocate the three arrays using "posix_memalign()"
	// NOTE that the OFFSET parameter is not used in this version of the code!
    array_bytes = array_elements * sizeof(STREAM_TYPE);
    k = posix_memalign((void **)&a, array_alignment, array_bytes);
    if (k != 0) {
        printf("Rank %d: Allocation of array a failed, return code is %d\n",myrank,k);
		MPI_Abort(MPI_COMM_WORLD, 2);
        exit(1);
    }
    k = posix_memalign((void **)&b, array_alignment, array_bytes);
    if (k != 0) {
        printf("Rank %d: Allocation of array b failed, return code is %d\n",myrank,k);
		MPI_Abort(MPI_COMM_WORLD, 2);
        exit(1);
    }
    k = posix_memalign((void **)&c, array_alignment, array_bytes);
    if (k != 0) {
        printf("Rank %d: Allocation of array c failed, return code is %d\n",myrank,k);
		MPI_Abort(MPI_COMM_WORLD, 2);
        exit(1);
    }

	// Initial informational printouts -- rank 0 handles all the output
	if (myrank == 0) {
		printf(HLINE);
		printf("STREAM version $Revision: 1.7 $\n");
		printf(HLINE);
		BytesPerWord = sizeof(STREAM_TYPE);
		printf("This system uses %d bytes per array element.\n",
		BytesPerWord);

		printf(HLINE);
#ifdef N
		printf("*****  WARNING: ******\n");
		printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
		printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
		printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
		printf("*****  WARNING: ******\n");
#endif
		if (OFFSET != 0) {
			printf("*****  WARNING: ******\n");
			printf("   This version ignores the OFFSET parameter.\n");
			printf("*****  WARNING: ******\n");
		}

		printf("Total Aggregate Array size = %llu (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE);
		printf("Total Aggregate Memory per array = %.1f MiB (= %.1f GiB).\n", 
			BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
			BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
		printf("Total Aggregate memory required = %.1f MiB (= %.1f GiB).\n",
			(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
			(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
		printf("Data is distributed across %d MPI ranks\n",numranks);
		printf("   Array size per MPI rank = %llu (elements)\n" , (unsigned long long) array_elements);
		printf("   Memory per array per MPI rank = %.1f MiB (= %.1f GiB).\n", 
			BytesPerWord * ( (double) array_elements / 1024.0/1024.0),
			BytesPerWord * ( (double) array_elements / 1024.0/1024.0/1024.0));
		printf("   Total memory per MPI rank = %.1f MiB (= %.1f GiB).\n",
			(3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024.),
			(3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024./1024.));

		printf(HLINE);
		printf("Each kernel will be executed %d times.\n", NTIMES);
		printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
		printf(" will be used to compute the reported bandwidth.\n");
		printf("The SCALAR value used for this run is %f\n",SCALAR);

#ifdef _OPENMP
		printf(HLINE);
#pragma omp parallel 
		{
#pragma omp master
		{
			k = omp_get_num_threads();
			printf ("Number of Threads requested for each MPI rank = %i\n",k);
			}
		}
#endif

#ifdef _OPENMP
		k = 0;
#pragma omp parallel
#pragma omp atomic 
			k++;
		printf ("Number of Threads counted for rank 0 = %i\n",k);
#endif

	}

    /* --- SETUP --- initialize arrays and estimate precision of timer --- */

#pragma omp parallel for
    for (j=0; j<array_elements; j++) {
	    a[j] = 1.0;
	    b[j] = 2.0;
	    c[j] = 0.0;
	}

	// Rank 0 needs to allocate arrays to hold error data and timing data from
	// all ranks for analysis and output.
	// Allocate and instantiate the arrays here -- after the primary arrays 
	// have been instantiated -- so there is no possibility of having these 
	// auxiliary arrays mess up the NUMA placement of the primary arrays.

	if (myrank == 0) {
		// There are 3 average error values for each rank (using STREAM_TYPE).
		AvgErrByRank = (double *) malloc(3 * sizeof(STREAM_TYPE) * numranks);
		if (AvgErrByRank == NULL) {
			printf("Ooops -- allocation of arrays to collect errors on MPI rank 0 failed\n");
			MPI_Abort(MPI_COMM_WORLD, 2);
		}
		memset(AvgErrByRank,0,3*sizeof(STREAM_TYPE)*numranks);

		// There are 4*NTIMES timing values for each rank (always doubles)
		TimesByRank = (double *) malloc(4 * NTIMES * sizeof(double) * numranks);
		if (TimesByRank == NULL) {
			printf("Ooops -- allocation of arrays to collect timing data on MPI rank 0 failed\n");
			MPI_Abort(MPI_COMM_WORLD, 3);
		}
		memset(TimesByRank,0,4*NTIMES*sizeof(double)*numranks);
	}

	// Simple check for granularity of the timer being used
	if (myrank == 0) {
		printf(HLINE);

		if  ( (quantum = checktick()) >= 1) 
		printf("Your timer granularity/precision appears to be "
			"%d microseconds.\n", quantum);
		else {
		printf("Your timer granularity appears to be "
			"less than one microsecond.\n");
		quantum = 1;
		}
	}

    /* Get initial timing estimate to compare to timer granularity. */
	/* All ranks need to run this code since it changes the values in array a */
    t = MPI_Wtime();
#pragma omp parallel for
    for (j = 0; j < array_elements; j++)
		a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (MPI_Wtime() - t);

	if (myrank == 0) {
		printf("Each test below will take on the order"
		" of %d microseconds.\n", (int) t  );
		printf("   (= %d timer ticks)\n", (int) (t/quantum) );
		printf("Increase the size of the arrays if this shows that\n");
		printf("you are not getting at least 20 timer ticks per test.\n");

		printf(HLINE);

		printf("WARNING -- The above is only a rough guideline.\n");
		printf("For best results, please be sure you know the\n");
		printf("precision of your system timer.\n");
		printf(HLINE);
#ifdef VERBOSE
		t1 = MPI_Wtime();
		printf("VERBOSE: total setup time for rank 0 = %f seconds\n",t1-t0);
		printf(HLINE);
#endif
	}
    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    // This code has more barriers and timing calls than are actually needed, but
    // this should not cause a problem for arrays that are large enough to satisfy
    // the STREAM run rules.

    scalar = SCALAR;
    for (k=0; k<NTIMES; k++)
	{
		// kernel 1: Copy
		MPI_Barrier(MPI_COMM_WORLD);
		t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
		for (j=0; j<array_elements; j++)
			c[j] = a[j];
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		t1 = MPI_Wtime();
		times[0][k] = t1 - t0;

		// kernel 2: Scale
		MPI_Barrier(MPI_COMM_WORLD);
		t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
		for (j=0; j<array_elements; j++)
			b[j] = scalar*c[j];
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		t1 = MPI_Wtime();
		times[1][k] = t1-t0;
	
		// kernel 3: Add
		MPI_Barrier(MPI_COMM_WORLD);
		t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
		for (j=0; j<array_elements; j++)
			c[j] = a[j]+b[j];
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		t1 = MPI_Wtime();
		times[2][k] = t1-t0;
	
		// kernel 4: Triad
		MPI_Barrier(MPI_COMM_WORLD);
		t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
		for (j=0; j<array_elements; j++)
			a[j] = b[j]+scalar*c[j];
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		t1 = MPI_Wtime();
		times[3][k] = t1-t0;
	}

	t0 = MPI_Wtime();

    /*	--- SUMMARY --- */

	// Because of the MPI_Barrier() calls, the timings from any thread are equally valid. 
    // The best estimate of the maximum performance is the minimum of the "outside the barrier"
    // timings across all the MPI ranks.

	// Gather all timing data to MPI rank 0
	MPI_Gather(times, 4*NTIMES, MPI_DOUBLE, TimesByRank, 4*NTIMES, MPI_DOUBLE, 0, MPI_COMM_WORLD);

	// Rank 0 processes all timing data
	if (myrank == 0) {
		// for each iteration and each kernel, collect the minimum time across all MPI ranks
		// and overwrite the rank 0 "times" variable with the minimum so the original post-
		// processing code can still be used.
		for (k=0; k<NTIMES; k++) {
			for (j=0; j<4; j++) {
				tmin = 1.0e36;
				for (i=0; i<numranks; i++) {
					// printf("DEBUG: Timing: iter %d, kernel %lu, rank %d, tmin %f, TbyRank %f\n",k,j,i,tmin,TimesByRank[4*NTIMES*i+j*NTIMES+k]);
					tmin = MIN(tmin, TimesByRank[4*NTIMES*i+j*NTIMES+k]);
				}
				// printf("DEBUG: Final Timing: iter %d, kernel %lu, final tmin %f\n",k,j,tmin);
				times[j][k] = tmin;
			}
		}

	// Back to the original code, but now using the minimum global timing across all ranks
		for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
		{
		for (j=0; j<4; j++)
			{
			avgtime[j] = avgtime[j] + times[j][k];
			mintime[j] = MIN(mintime[j], times[j][k]);
			maxtime[j] = MAX(maxtime[j], times[j][k]);
			}
		}
    
		// note that "bytes[j]" is the aggregate array size, so no "numranks" is needed here
		printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
		for (j=0; j<4; j++) {
			avgtime[j] = avgtime[j]/(double)(NTIMES-1);

			printf("%s%11.1f  %11.6f  %11.6f  %11.6f\n", label[j],
			   1.0E-06 * bytes[j]/mintime[j],
			   avgtime[j],
			   mintime[j],
			   maxtime[j]);
		}
		printf(HLINE);
	}

    /* --- Every Rank Checks its Results --- */
#ifdef INJECTERROR
	a[11] = 100.0 * a[11];
#endif
	computeSTREAMerrors(&AvgError[0], &AvgError[1], &AvgError[2]);
	/* --- Collect the Average Errors for Each Array on Rank 0 --- */
	MPI_Gather(AvgError, 3, MPI_DOUBLE, AvgErrByRank, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);

	/* -- Combined averaged errors and report on Rank 0 only --- */
	if (myrank == 0) {
#ifdef VERBOSE
		for (k=0; k<numranks; k++) {
			printf("VERBOSE: rank %d, AvgErrors %e %e %e\n",k,AvgErrByRank[3*k+0],
				AvgErrByRank[3*k+1],AvgErrByRank[3*k+2]);
		}
#endif
		checkSTREAMresults(AvgErrByRank,numranks);
		printf(HLINE);
	}

#ifdef VERBOSE
	if (myrank == 0) {
		t1 = MPI_Wtime();
		printf("VERBOSE: total shutdown time for rank %d = %f seconds\n",myrank,t1-t0);
	}
#endif

	free(a);
	free(b);
	free(c);
	if (myrank == 0) {
		free(TimesByRank);
		free(AvgErrByRank);
	}

    MPI_Finalize();
	return(0);
}
예제 #12
0
파일: stream.c 프로젝트: CoryXie/Graphite
/*
 * Serial STREAM benchmark driver.
 *
 * Prints the configuration banner, fills the three arrays, reports the
 * timer granularity returned by checktick(), then times the Copy, Scale,
 * Add and Triad loops NTIMES times each.  The first repetition is left out
 * of the avg/min/max statistics.  Ends by calling checkSTREAMresults().
 *
 * Returns 0.
 */
int
main()
{
    int     tick_us, checktick();
    int     word_bytes;
    int     idx, rep, kernel;
    double  factor, t_begin, elapsed_us, samples[4][NTIMES];

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    printf("STREAM version $Revision: 5.9 $\n");
    printf(HLINE);
    word_bytes = sizeof(double);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n", word_bytes);

    printf(HLINE);
    printf("Array size = %llu, Offset = %d\n", (unsigned long long) N, OFFSET);

    printf("Total memory required = %.1f MB.\n", (3.0 * word_bytes) * ( (double) N / 1048576.0));
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

    printf(HLINE);

    /* Give every array element its starting value. */
    for (idx = 0; idx < N; idx++) {
        a[idx] = 1.0;
        b[idx] = 2.0;
        c[idx] = 0.0;
    }

    printf(HLINE);

    /* Report the timer resolution; anything below 1 us is treated as 1 us. */
    tick_us = checktick();
    if (tick_us >= 1) {
        printf("Your clock granularity/precision appears to be %d microseconds.\n", tick_us);
    } else {
        printf("Your clock granularity appears to be less than one microsecond.\n");
        tick_us = 1;
    }

    /* One pass over a[] as a rough estimate of the per-kernel run time. */
    t_begin = mysecond();
    for (idx = 0; idx < N; idx++)
        a[idx] = 2.0E0 * a[idx];
    elapsed_us = 1.0E6 * (mysecond() - t_begin);

    printf("Each test below will take on the order of %d microseconds.\n", (int) elapsed_us  );
    printf("   (= %d clock ticks)\n", (int) (elapsed_us/tick_us) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING -- The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);

    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

    printf("--- MAIN LOOP --- repeat test cases %d times ---\n", NTIMES);
    printf(HLINE);

    factor = 3.0;
    for (rep = 0; rep < NTIMES; rep++)
    {
        /* Copy */
        t_begin = mysecond();
        for (idx = 0; idx < N; idx++)
            c[idx] = a[idx];
        samples[0][rep] = mysecond() - t_begin;

        /* Scale */
        t_begin = mysecond();
        for (idx = 0; idx < N; idx++)
            b[idx] = factor*c[idx];
        samples[1][rep] = mysecond() - t_begin;

        /* Add */
        t_begin = mysecond();
        for (idx = 0; idx < N; idx++)
            c[idx] = a[idx]+b[idx];
        samples[2][rep] = mysecond() - t_begin;

        /* Triad */
        t_begin = mysecond();
        for (idx = 0; idx < N; idx++)
            a[idx] = b[idx]+factor*c[idx];
        samples[3][rep] = mysecond() - t_begin;
    }

    /* --- SUMMARY --- */

    printf("--- SUMMARY ---\n");
    printf(HLINE);

    printf("Function      Rate (MB/s)   Avg time     Min time     Max time\n");
    for (kernel = 0; kernel < 4; kernel++) {
        /* Fold this kernel's samples, leaving out repetition 0. */
        for (rep = 1; rep < NTIMES; rep++) {
            avgtime[kernel] += samples[kernel][rep];
            mintime[kernel] = MIN(mintime[kernel], samples[kernel][rep]);
            maxtime[kernel] = MAX(maxtime[kernel], samples[kernel][rep]);
        }
        avgtime[kernel] /= (double)(NTIMES-1);

        printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[kernel],
               1.0E-06 * bytes[kernel]/mintime[kernel],
               avgtime[kernel],
               mintime[kernel],
               maxtime[kernel]);
    }
    printf(HLINE);

    /* --- Check Results --- */

    printf("--- Check Results ---\n");
    printf(HLINE);

    checkSTREAMresults();
    printf(HLINE);

    return 0;
}
예제 #13
0
/*
 * STREAM benchmark driver with optional memkind-based allocation.
 *
 * When built with ENABLE_DYNAMIC_ALLOC, argv[1] may name a memkind kind
 * (defaulting to "memkind_default") from which the three arrays are
 * allocated; "--help" / "-h" prints the usage text and exits.  The four
 * kernels (Copy, Scale, Add, Triad) are timed NTIMES times each, the summary
 * table is printed and checkSTREAMresults() validates the arrays.
 *
 * Returns 0 on success.  With ENABLE_DYNAMIC_ALLOC it returns -1 if the kind
 * name cannot be resolved and -err if an allocation fails.
 */
int
main(int argc, char **argv)
{
    int         quantum, checktick();
    int         BytesPerWord;
    int         k;
    ssize_t     j;
    STREAM_TYPE scalar;
    double      t, times[4][NTIMES];
#ifdef ENABLE_DYNAMIC_ALLOC
    int err = 0;
    size_t alloc_bytes;
    memkind_t kind;
    char err_msg[ERR_MSG_SIZE];
    if (argc > 1 && (strncmp("--help", argv[1], strlen("--help")) == 0 ||
                     strncmp("-h", argv[1], strlen("-h")) == 0)) {
        printf("Usage: %s [memkind_default | memkind_hbw | memkind_hbw_hugetlb | \n"
               "    memkind_hbw_preferred | memkind_hbw_preferred_hugetlb | \n"
               "    memkind_hbw_gbtlb | memkind_hbw_preferred_gbtlb | memkind_gbtlb | \n"
               "    memkind_hbw_interleave | memkind_interleave]\n",
               argv[0]);
        return 0;
    }
#endif

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    printf("STREAM version $Revision: 5.10 $\n");
#ifdef ENABLE_DYNAMIC_ALLOC
    printf("Variant that uses the memkind library for dynamic memory allocation.\n");
#endif
    printf(HLINE);
    BytesPerWord = sizeof(STREAM_TYPE);
    printf("This system uses %d bytes per array element.\n",
        BytesPerWord);

    printf(HLINE);
#ifdef N
    printf("*****  WARNING: ******\n");
    printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
    printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
    printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
    printf("*****  WARNING: ******\n");
#endif

    printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
    printf("Memory per array = %.1f MiB (= %.1f GiB).\n",
        BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
        BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
    printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
        (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
        (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
    printf("Each kernel will be executed %d times.\n", NTIMES);
    printf(" The *best* time for each kernel (excluding the first iteration)\n");
    printf(" will be used to compute the reported bandwidth.\n");

#ifdef _OPENMP
    printf(HLINE);
#pragma omp parallel
    {
#pragma omp master
        {
            k = omp_get_num_threads();
            printf ("Number of Threads requested = %i\n",k);
        }
    }
#endif

#ifdef _OPENMP
    /* Count the threads that actually run a parallel region. */
    k = 0;
#pragma omp parallel
#pragma omp atomic
    k++;
    printf ("Number of Threads counted = %i\n",k);
#endif

#ifdef ENABLE_DYNAMIC_ALLOC
    if (argc > 1) {
        err = memkind_get_kind_by_name(argv[1], &kind);
    }
    else {
        err = memkind_get_kind_by_name("memkind_default", &kind);
    }
    if (err) {
        memkind_error_message(err, err_msg, ERR_MSG_SIZE);
        fprintf(stderr, "ERROR: %s\n", err_msg);
        return -1;
    }
    /* Compute the byte count in size_t: the plain int product overflows as
       soon as one array exceeds 2 GiB. */
    alloc_bytes = (size_t) BytesPerWord * ((size_t) STREAM_ARRAY_SIZE + OFFSET);
    /* Each array is requested with 2 MiB (2097152-byte) alignment. */
    err = memkind_posix_memalign(kind, (void **)&a, 2097152, alloc_bytes);
    if (err) {
        fprintf(stderr, "ERROR: Unable to allocate stream array a\n");
        return -err;
    }
    err = memkind_posix_memalign(kind, (void **)&b, 2097152, alloc_bytes);
    if (err) {
        fprintf(stderr, "ERROR: Unable to allocate stream array b\n");
        memkind_free(kind, a);  /* release what was already allocated */
        return -err;
    }
    err = memkind_posix_memalign(kind, (void **)&c, 2097152, alloc_bytes);
    if (err) {
        fprintf(stderr, "ERROR: Unable to allocate stream array c\n");
        memkind_free(kind, b);  /* release what was already allocated */
        memkind_free(kind, a);
        return -err;
    }

#endif
    /* Give every array element its starting value. */
#pragma omp parallel for
    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
        a[j] = 1.0;
        b[j] = 2.0;
        c[j] = 0.0;
    }

    printf(HLINE);

    /* Report the timer resolution; anything below 1 us is treated as 1 us. */
    if  ( (quantum = checktick()) >= 1)
        printf("Your clock granularity/precision appears to be "
            "%d microseconds.\n", quantum);
    else {
        printf("Your clock granularity appears to be "
            "less than one microsecond.\n");
        quantum = 1;
    }

    /* One pass over a[] as a rough estimate of the per-kernel run time. */
    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
        a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    printf("Each test below will take on the order"
        " of %d microseconds.\n", (int) t  );
    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING -- The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);

    /*  --- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
    {
        /* kernel 1: Copy */
        times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
        for (j=0; j<STREAM_ARRAY_SIZE; j++)
            c[j] = a[j];
#endif
        times[0][k] = mysecond() - times[0][k];

        /* kernel 2: Scale */
        times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
        for (j=0; j<STREAM_ARRAY_SIZE; j++)
            b[j] = scalar*c[j];
#endif
        times[1][k] = mysecond() - times[1][k];

        /* kernel 3: Add */
        times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
        for (j=0; j<STREAM_ARRAY_SIZE; j++)
            c[j] = a[j]+b[j];
#endif
        times[2][k] = mysecond() - times[2][k];

        /* kernel 4: Triad */
        times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
        for (j=0; j<STREAM_ARRAY_SIZE; j++)
            a[j] = b[j]+scalar*c[j];
#endif
        times[3][k] = mysecond() - times[3][k];
    }

    /*  --- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
    {
        for (j=0; j<4; j++)
        {
            avgtime[j] = avgtime[j] + times[j][k];
            mintime[j] = MIN(mintime[j], times[j][k]);
            maxtime[j] = MAX(maxtime[j], times[j][k]);
        }
    }

    printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
        avgtime[j] = avgtime[j]/(double)(NTIMES-1);

        printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
               1.0E-06 * bytes[j]/mintime[j],
               avgtime[j],
               mintime[j],
               maxtime[j]);
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

#ifdef ENABLE_DYNAMIC_ALLOC
    memkind_free(kind, c);
    memkind_free(kind, b);
    memkind_free(kind, a);
#endif
    return 0;
}