Ejemplos de tuned_STREAM_Add en C++ (Cpp)

Lenguaje de programación: C++ (Cpp)

Método / Función: tuned_STREAM_Add

Ejemplos en hotexamples.com: 8

C++ (Cpp) tuned_STREAM_Add - 8 ejemplos encontrados. Estos son los ejemplos en C++ (Cpp) del mundo real mejor valorados de tuned_STREAM_Add extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Ejemplo n.º 1

Mostrar archivo

Archivo: cStream.cpp Proyecto: van-smith/miniBench

void cStream::runTunedTests()
{
  register int k;
  /*  --- MAIN LOOP --- repeat test cases NTIMES times --- */
  for (k=0; k<NTIMES; k++)
	{
	  //times[0][k] = timeGetTime();
#ifdef vs2005
    QueryPerformanceCounter( ( LARGE_INTEGER *)&start );
#else
    start = clock();
#endif
    tuned_STREAM_Copy();
#ifdef vs2005
    QueryPerformanceCounter( ( LARGE_INTEGER *)&stop );
#else
    stop = clock();
#endif
    times[0][k] = stop - start;

#ifdef vs2005
    QueryPerformanceCounter( ( LARGE_INTEGER *)&start );
#else
    start = clock();
#endif
    tuned_STREAM_Scale(scalar);
#ifdef vs2005
    QueryPerformanceCounter( ( LARGE_INTEGER *)&stop );
#else
    stop = clock();
#endif
    times[1][k] = stop - start;

#ifdef vs2005
    QueryPerformanceCounter( ( LARGE_INTEGER *)&start );
#else
    start = clock();
#endif
    tuned_STREAM_Add();
#ifdef vs2005
    QueryPerformanceCounter( ( LARGE_INTEGER *)&stop );
#else
    stop = clock();
#endif
		times[2][k] = stop - start;

#ifdef vs2005
    QueryPerformanceCounter( ( LARGE_INTEGER *)&start );
#else
    start = clock();
#endif
    tuned_STREAM_Triad(scalar);
#ifdef vs2005
    QueryPerformanceCounter( ( LARGE_INTEGER *)&stop );
#else
    stop = clock();
#endif
	  times[3][k] = stop - start;
	} // for
} // void cStream::runTunedTests();

Ejemplo n.º 2

Mostrar archivo

Archivo: stream.c Proyecto: ecylmz/docker-rkt-kvm-comp

int
HPCC_Stream(HPCC_Params *params, int doIO, double *copyGBs, double *scaleGBs, double *addGBs,
  double *triadGBs, int *failure) {
    int   quantum;
    int   BytesPerWord;
    register int j, k;
    double  scalar, t, times[4][NTIMES];
    FILE *outFile;
    double GiBs = 1073741824.0, curGBs;

    if (doIO) {
      // outFile = fopen( params->outFname, "w+" );
	  outFile = stdout;
      if (! outFile) {
        outFile = stderr;
        fprintf( outFile, "Cannot open output file.\n" );
        return 1;
      }
    }

    // VectorSize = HPCC_LocalVectorSize( params, 3, sizeof(double), 0 ); /* Need 3 vectors */
	// HARDCODED VectorSize
    // params->StreamVectorSize = VectorSize;

    a = HPCC_XMALLOC( double, VectorSize );
    b = HPCC_XMALLOC( double, VectorSize );
    c = HPCC_XMALLOC( double, VectorSize );

    if (!a || !b || !c) {
      if (c) HPCC_free(c);
      if (b) HPCC_free(b);
      if (a) HPCC_free(a);
      if (doIO) {
        fprintf( outFile, "Failed to allocate memory (%lu).\n", VectorSize );
        fflush( outFile );
        fclose( outFile );
      }
      return 1;
    }

    /* --- SETUP --- determine precision and check timing --- */

    if (doIO) {
    fprintf (outFile, "Generated on %s\n", params->nowASCII);
    fprintf( outFile, HLINE);
    BytesPerWord = sizeof(double);
    fprintf( outFile, "This system uses %d bytes per DOUBLE PRECISION word.\n",
             BytesPerWord);

    fprintf( outFile, HLINE);
    fprintf( outFile, "Array size = %lu, Offset = %d\n" , VectorSize, OFFSET);
    fprintf( outFile, "Total memory required = %.4f GiB.\n",
             (3.0 * BytesPerWord) * ( (double) VectorSize / GiBs));
    fprintf( outFile, "Each test is run %d times, but only\n", NTIMES);
    fprintf( outFile, "the *best* time for each is used.\n");
	fflush ( outFile);
    }

#ifdef _OPENMP
    if (doIO) fprintf( outFile, HLINE);
#pragma omp parallel private(k)
    {
#pragma omp single nowait
      {
        k = omp_get_num_threads();
        if (doIO) fprintf( outFile, "Number of Threads requested = %i\n",k);
        params->StreamThreads = k;
      }
    }
#endif

    /* Get initial value for system clock. */
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j=0; j<VectorSize; j++) {
      a[j] = 1.0;
      b[j] = 2.0;
      c[j] = 0.0;
    }

    if (doIO) fprintf( outFile, HLINE);

    if  ( (quantum = checktick()) >= 1) {
      if (doIO) fprintf( outFile, "Your clock granularity/precision appears to be "
                         "%d microseconds.\n", quantum);
    } else {
      if (doIO) fprintf( outFile, "Your clock granularity appears to be "
                         "less than one microsecond.\n");
    }

    t = mysecond();
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j = 0; j < VectorSize; j++)
      a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    if (doIO) {
    fprintf( outFile, "Each test below will take on the order"
             " of %d microseconds.\n", (int) t  );
    fprintf( outFile, "   (= %d clock ticks)\n", (int) (t/quantum) );
    fprintf( outFile, "Increase the size of the arrays if this shows that\n");
    fprintf( outFile, "you are not getting at least 20 clock ticks per test.\n");

    fprintf( outFile, HLINE);

    fprintf( outFile, "WARNING -- The above is only a rough guideline.\n");
    fprintf( outFile, "For best results, please be sure you know the\n");
    fprintf( outFile, "precision of your system timer.\n");
    fprintf( outFile, HLINE);
    }

    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
    {
      times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          c[j] = a[j];
#endif
        times[0][k] = mysecond() - times[0][k];

        times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          b[j] = scalar*c[j];
#endif
        times[1][k] = mysecond() - times[1][k];

        times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          c[j] = a[j]+b[j];
#endif
        times[2][k] = mysecond() - times[2][k];

        times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          a[j] = b[j]+scalar*c[j];
#endif
        times[3][k] = mysecond() - times[3][k];
    }

    /* --- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
    {
      for (j=0; j<4; j++)
      {
        avgtime[j] = avgtime[j] + times[j][k];
        mintime[j] = Mmin(mintime[j], times[j][k]);
        maxtime[j] = Mmax(maxtime[j], times[j][k]);
      }
    }

    if (doIO)
    fprintf( outFile, "Function      Rate (GB/s)   Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
      avgtime[j] /= (double)(NTIMES - 1); /* note -- skip first iteration */

      /* make sure no division by zero */
      curGBs = (mintime[j] > 0.0 ? 1.0 / mintime[j] : -1.0);
      curGBs *= 1e-9 * bytes[j] * VectorSize;
        if (doIO)
          fprintf( outFile, "%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
                   curGBs,
                   avgtime[j],
                   mintime[j],
                   maxtime[j]);
        switch (j) {
          case 0: *copyGBs = curGBs; break;
          case 1: *scaleGBs = curGBs; break;
          case 2: *addGBs = curGBs; break;
          case 3: *triadGBs = curGBs; break;
        }
    }
    if (doIO) fprintf( outFile, HLINE);

    /* --- Check Results --- */
    checkSTREAMresults( outFile, doIO, failure );
    if (doIO) fprintf( outFile, HLINE);

    HPCC_free(c);
    HPCC_free(b);
    HPCC_free(a);

    if (doIO) {
      fflush( outFile );
      fclose( outFile );
    }

    return 0;
}

Ejemplo n.º 3

Mostrar archivo

Archivo: stream.cpp Proyecto: andreashappe/IncludeOS

int
main()
{
    int     checktick(void);
    int			quantum;
    int			BytesPerWord;
    int			k;
    ssize_t		j;
    STREAM_TYPE		scalar;
    double		t, times[4][NTIMES];

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    printf("STREAM version $Revision: 5.10 $\n");
    printf(HLINE);
    BytesPerWord = sizeof(STREAM_TYPE);
    printf("This system uses %d bytes per array element.\n",
	BytesPerWord);

    printf(HLINE);
#ifdef N
    printf("*****  WARNING: ******\n");
    printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
    printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
    printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
    printf("*****  WARNING: ******\n");
#endif

    printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
    printf("Memory per array = %.1f MiB (= %.1f GiB).\n", 
	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
    printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
    printf("Each kernel will be executed %d times.\n", NTIMES);
    printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
    printf(" will be used to compute the reported bandwidth.\n");

#ifdef _OPENMP
    printf(HLINE);
#pragma omp parallel 
    {
#pragma omp master
	{
	    k = omp_get_num_threads();
	    printf ("Number of Threads requested = %i\n",k);
        }
    }
#endif

#ifdef _OPENMP
	k = 0;
#pragma omp parallel
#pragma omp atomic 
		k++;
    printf ("Number of Threads counted = %i\n",k);
#endif

    /* Get initial value for system clock. */
#pragma omp parallel for
    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
	    a[j] = 1.0;
	    b[j] = 2.0;
	    c[j] = 0.0;
	}

    printf(HLINE);

    if  ( (quantum = checktick()) >= 1) 
	printf("Your clock granularity/precision appears to be "
	    "%d microseconds.\n", quantum);
    else {
	printf("Your clock granularity appears to be "
	    "less than one microsecond.\n");
	quantum = 1;
    }

    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
		a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    printf("Each test below will take on the order"
	" of %d microseconds.\n", (int) t  );
    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING -- The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);
    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];
	
	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];
	
	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];
	
	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
    
    printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
		avgtime[j] = avgtime[j]/(double)(NTIMES-1);

		printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       avgtime[j],
	       mintime[j],
	       maxtime[j]);
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

    return 0;
}

Ejemplo n.º 4

Mostrar archivo

Archivo: stream_omp.c Proyecto: HerbertJordan/insieme

int
main()
    {
    int			quantum, checktick();
    int			BytesPerWord;
    register int	j, k;
    double		scalar, t, times[4][NTIMES];

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    BytesPerWord = sizeof(double);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
	BytesPerWord);

    printf(HLINE);
    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
    printf("Total memory required = %.1f MB.\n",
	(3.0 * BytesPerWord) * ( (double) N / 1048576.0));
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

#ifdef _OPENMP
    printf(HLINE);
#pragma omp parallel private(k)
    {
//    k = omp_get_num_threads();
  //  printf ("Number of Threads requested = %i\n",k);
    }
#endif

    /* Get initial value for system clock. */
#pragma omp parallel for
    for (j=0; j<N; j++) {
	a[j] = 1.0;
	b[j] = 2.0;
	c[j] = 0.0;
	}

    printf(HLINE);

    if  ( (quantum = checktick()) >= 1) {
//	printf("Your clock granularity/precision appears to be "
//	    "%d microseconds.\n", quantum);
    } else {
//	printf("Your clock granularity appears to be "
//	    "less than one microsecond.\n");
    }

    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < N; j++)
	a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

//    printf("Each test below will take on the order"
//	" of %d microseconds.\n", (int) t  );
//    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING -- The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);
    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];
	
	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];
	
	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];
	
	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
    
    printf("Function      Rate (MB/s)   Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
	avgtime[j] = avgtime[j]/(double)(NTIMES-1);

/*	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       avgtime[j],
	       mintime[j],
	       maxtime[j]);*/
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

    return 0;
}

Ejemplo n.º 5

Mostrar archivo

Archivo: stream.c Proyecto: pousa/minas

int
main()
    {
    int			quantum, checktick();
    int			BytesPerWord;
    register int	j, k;
    double		scalar, t, times[4][NTIMES];

#ifdef MAI
    mai_init(NULL);
    a = mai_alloc_1D(N, sizeof(double),DOUBLE);
    b = mai_alloc_1D(N, sizeof(double),DOUBLE);
    c = mai_alloc_1D(N, sizeof(double),DOUBLE);
    mai_bind_columns(a);
    mai_bind_columns(b);
    mai_bind_columns(c);
#else
    a = malloc(N*sizeof(double));
    b = malloc(N*sizeof(double));
    c = malloc(N*sizeof(double));
#endif

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    printf("STREAM version $Revision: 5.9 $\n");
    printf(HLINE);
    BytesPerWord = sizeof(double);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
	BytesPerWord);

    printf(HLINE);

    printf("Total memory required = %.1f MB.\n",
	(3.0 * BytesPerWord) * ( (double) N / 1048576.0));
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

#ifdef _OPENMP
    printf(HLINE);
#pragma omp parallel 
    {
#pragma omp master
	{
	    k = omp_get_num_threads();
	    printf ("Number of Threads requested = %i\n",k);
        }
    }
#endif

    /* Get initial value for system clock. */
#pragma omp parallel for
    for (j=0; j<N; j++) {
	a[j] = 1.0;
	b[j] = 2.0;
	c[j] = 0.0;
	}

    printf(HLINE);

#ifdef MAI
    mai_cyclic(a);
    mai_cyclic(b);
    mai_cyclic(c);
#endif

    int chunk = 128;

    t = mysecond();
#pragma omp parallel for schedule(dynamic,chunk)
    for (j = 0; j < N; j++)
	a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for  schedule(dynamic,chunk)
	for (j=0; j<N; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];
	
	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for  schedule(dynamic,chunk)
	for (j=0; j<N; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];
	
	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for  schedule(dynamic,chunk)
	for (j=0; j<N; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];
	
	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for  schedule(dynamic,chunk)
	for (j=0; j<N; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
    
    printf("Function      Rate (MB/s)   Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
	avgtime[j] = avgtime[j]/(double)(NTIMES-1);

	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       avgtime[j],
	       mintime[j],
	       maxtime[j]);
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

    return 0;
}

Ejemplo n.º 6

Mostrar archivo

Archivo: stream.c Proyecto: raddirad/xeon_phi_kernel_integration

int
main()
    {
    int			quantum, checktick();
    int			BytesPerWord;
    int			k;
    ssize_t		j;
    STREAM_TYPE		scalar;
    double		t, times[4][NTIMES];

    /* --- SETUP --- determine precision and check timing --- */

    BytesPerWord = sizeof(STREAM_TYPE);

#ifdef _OPENMP
#pragma omp parallel 
    {
#pragma omp master
	{
	    k = omp_get_num_threads();
        }
    }
#endif

#ifdef _OPENMP
	k = 0;
#pragma omp parallel
#pragma omp atomic 
		k++;
#endif

    /* Get initial value for system clock. */
#pragma omp parallel for
    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
	    a[j] = 1.0;
	    b[j] = 2.0;
	    c[j] = 0.0;
	}


    if  ( (quantum = checktick()) >= 1) 
	printf("");
    else {
	quantum = 1;
    }

    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
		a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];
	
	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];
	
	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];
	
	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
    
    for (j=0; j<1; j++) {
		avgtime[j] = avgtime[j]/(double)(NTIMES-1);

		printf("%11.6f\n", 1.0E-06 * bytes[j]/mintime[j]);
    }

    /* --- Check Results --- */
    checkSTREAMresults();

    return 0;
}

Ejemplo n.º 7

Mostrar archivo

Archivo: stream_mpi.c Proyecto: konchalas/tools

int
main()
    {
    int			quantum, checktick();
    int			BytesPerWord;
    int			i,k;
    ssize_t		j;
    STREAM_TYPE		scalar;
    double		t, times[4][NTIMES];
	double		*TimesByRank;
	double		t0,t1,tmin;
	int         rc, numranks, myrank;
	STREAM_TYPE	AvgError[3] = {0.0,0.0,0.0};
	STREAM_TYPE *AvgErrByRank;

    /* --- SETUP --- call MPI_Init() before anything else! --- */

    rc = MPI_Init(NULL, NULL);
	t0 = MPI_Wtime();
    if (rc != MPI_SUCCESS) {
       printf("ERROR: MPI Initialization failed with return code %d\n",rc);
       exit(1);
    }
	// if either of these fail there is something really screwed up!
	MPI_Comm_size(MPI_COMM_WORLD, &numranks);
	MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

    /* --- NEW FEATURE --- distribute requested storage across MPI ranks --- */
	array_elements = STREAM_ARRAY_SIZE / numranks;		// don't worry about rounding vs truncation
    array_alignment = 64;						// Can be modified -- provides partial support for adjusting relative alignment

	// Dynamically allocate the three arrays using "posix_memalign()"
	// NOTE that the OFFSET parameter is not used in this version of the code!
    array_bytes = array_elements * sizeof(STREAM_TYPE);
    k = posix_memalign((void **)&a, array_alignment, array_bytes);
    if (k != 0) {
        printf("Rank %d: Allocation of array a failed, return code is %d\n",myrank,k);
		MPI_Abort(MPI_COMM_WORLD, 2);
        exit(1);
    }
    k = posix_memalign((void **)&b, array_alignment, array_bytes);
    if (k != 0) {
        printf("Rank %d: Allocation of array b failed, return code is %d\n",myrank,k);
		MPI_Abort(MPI_COMM_WORLD, 2);
        exit(1);
    }
    k = posix_memalign((void **)&c, array_alignment, array_bytes);
    if (k != 0) {
        printf("Rank %d: Allocation of array c failed, return code is %d\n",myrank,k);
		MPI_Abort(MPI_COMM_WORLD, 2);
        exit(1);
    }

	// Initial informational printouts -- rank 0 handles all the output
	if (myrank == 0) {
		printf(HLINE);
		printf("STREAM version $Revision: 1.7 $\n");
		printf(HLINE);
		BytesPerWord = sizeof(STREAM_TYPE);
		printf("This system uses %d bytes per array element.\n",
		BytesPerWord);

		printf(HLINE);
#ifdef N
		printf("*****  WARNING: ******\n");
		printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
		printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
		printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
		printf("*****  WARNING: ******\n");
#endif
		if (OFFSET != 0) {
			printf("*****  WARNING: ******\n");
			printf("   This version ignores the OFFSET parameter.\n");
			printf("*****  WARNING: ******\n");
		}

		printf("Total Aggregate Array size = %llu (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE);
		printf("Total Aggregate Memory per array = %.1f MiB (= %.1f GiB).\n", 
			BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
			BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
		printf("Total Aggregate memory required = %.1f MiB (= %.1f GiB).\n",
			(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
			(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
		printf("Data is distributed across %d MPI ranks\n",numranks);
		printf("   Array size per MPI rank = %llu (elements)\n" , (unsigned long long) array_elements);
		printf("   Memory per array per MPI rank = %.1f MiB (= %.1f GiB).\n", 
			BytesPerWord * ( (double) array_elements / 1024.0/1024.0),
			BytesPerWord * ( (double) array_elements / 1024.0/1024.0/1024.0));
		printf("   Total memory per MPI rank = %.1f MiB (= %.1f GiB).\n",
			(3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024.),
			(3.0 * BytesPerWord) * ( (double) array_elements / 1024.0/1024./1024.));

		printf(HLINE);
		printf("Each kernel will be executed %d times.\n", NTIMES);
		printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
		printf(" will be used to compute the reported bandwidth.\n");
		printf("The SCALAR value used for this run is %f\n",SCALAR);

#ifdef _OPENMP
		printf(HLINE);
#pragma omp parallel 
		{
#pragma omp master
		{
			k = omp_get_num_threads();
			printf ("Number of Threads requested for each MPI rank = %i\n",k);
			}
		}
#endif

#ifdef _OPENMP
		k = 0;
#pragma omp parallel
#pragma omp atomic 
			k++;
		printf ("Number of Threads counted for rank 0 = %i\n",k);
#endif

	}

    /* --- SETUP --- initialize arrays and estimate precision of timer --- */

#pragma omp parallel for
    for (j=0; j<array_elements; j++) {
	    a[j] = 1.0;
	    b[j] = 2.0;
	    c[j] = 0.0;
	}

	// Rank 0 needs to allocate arrays to hold error data and timing data from
	// all ranks for analysis and output.
	// Allocate and instantiate the arrays here -- after the primary arrays 
	// have been instantiated -- so there is no possibility of having these 
	// auxiliary arrays mess up the NUMA placement of the primary arrays.

	if (myrank == 0) {
		// There are 3 average error values for each rank (using STREAM_TYPE).
		AvgErrByRank = (double *) malloc(3 * sizeof(STREAM_TYPE) * numranks);
		if (AvgErrByRank == NULL) {
			printf("Ooops -- allocation of arrays to collect errors on MPI rank 0 failed\n");
			MPI_Abort(MPI_COMM_WORLD, 2);
		}
		memset(AvgErrByRank,0,3*sizeof(STREAM_TYPE)*numranks);

		// There are 4*NTIMES timing values for each rank (always doubles)
		TimesByRank = (double *) malloc(4 * NTIMES * sizeof(double) * numranks);
		if (TimesByRank == NULL) {
			printf("Ooops -- allocation of arrays to collect timing data on MPI rank 0 failed\n");
			MPI_Abort(MPI_COMM_WORLD, 3);
		}
		memset(TimesByRank,0,4*NTIMES*sizeof(double)*numranks);
	}

	// Simple check for granularity of the timer being used
	if (myrank == 0) {
		printf(HLINE);

		if  ( (quantum = checktick()) >= 1) 
		printf("Your timer granularity/precision appears to be "
			"%d microseconds.\n", quantum);
		else {
		printf("Your timer granularity appears to be "
			"less than one microsecond.\n");
		quantum = 1;
		}
	}

    /* Get initial timing estimate to compare to timer granularity. */
	/* All ranks need to run this code since it changes the values in array a */
    t = MPI_Wtime();
#pragma omp parallel for
    for (j = 0; j < array_elements; j++)
		a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (MPI_Wtime() - t);

	if (myrank == 0) {
		printf("Each test below will take on the order"
		" of %d microseconds.\n", (int) t  );
		printf("   (= %d timer ticks)\n", (int) (t/quantum) );
		printf("Increase the size of the arrays if this shows that\n");
		printf("you are not getting at least 20 timer ticks per test.\n");

		printf(HLINE);

		printf("WARNING -- The above is only a rough guideline.\n");
		printf("For best results, please be sure you know the\n");
		printf("precision of your system timer.\n");
		printf(HLINE);
#ifdef VERBOSE
		t1 = MPI_Wtime();
		printf("VERBOSE: total setup time for rank 0 = %f seconds\n",t1-t0);
		printf(HLINE);
#endif
	}
    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    // This code has more barriers and timing calls than are actually needed, but
    // this should not cause a problem for arrays that are large enough to satisfy
    // the STREAM run rules.

    scalar = SCALAR;
    for (k=0; k<NTIMES; k++)
	{
		// kernel 1: Copy
		MPI_Barrier(MPI_COMM_WORLD);
		t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
		for (j=0; j<array_elements; j++)
			c[j] = a[j];
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		t1 = MPI_Wtime();
		times[0][k] = t1 - t0;

		// kernel 2: Scale
		MPI_Barrier(MPI_COMM_WORLD);
		t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
		for (j=0; j<array_elements; j++)
			b[j] = scalar*c[j];
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		t1 = MPI_Wtime();
		times[1][k] = t1-t0;
	
		// kernel 3: Add
		MPI_Barrier(MPI_COMM_WORLD);
		t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
		for (j=0; j<array_elements; j++)
			c[j] = a[j]+b[j];
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		t1 = MPI_Wtime();
		times[2][k] = t1-t0;
	
		// kernel 4: Triad
		MPI_Barrier(MPI_COMM_WORLD);
		t0 = MPI_Wtime();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
		for (j=0; j<array_elements; j++)
			a[j] = b[j]+scalar*c[j];
#endif
		MPI_Barrier(MPI_COMM_WORLD);
		t1 = MPI_Wtime();
		times[3][k] = t1-t0;
	}

	t0 = MPI_Wtime();

    /*	--- SUMMARY --- */

	// Because of the MPI_Barrier() calls, the timings from any thread are equally valid. 
    // The best estimate of the maximum performance is the minimum of the "outside the barrier"
    // timings across all the MPI ranks.

	// Gather all timing data to MPI rank 0
	MPI_Gather(times, 4*NTIMES, MPI_DOUBLE, TimesByRank, 4*NTIMES, MPI_DOUBLE, 0, MPI_COMM_WORLD);

	// Rank 0 processes all timing data
	if (myrank == 0) {
		// for each iteration and each kernel, collect the minimum time across all MPI ranks
		// and overwrite the rank 0 "times" variable with the minimum so the original post-
		// processing code can still be used.
		for (k=0; k<NTIMES; k++) {
			for (j=0; j<4; j++) {
				tmin = 1.0e36;
				for (i=0; i<numranks; i++) {
					// printf("DEBUG: Timing: iter %d, kernel %lu, rank %d, tmin %f, TbyRank %f\n",k,j,i,tmin,TimesByRank[4*NTIMES*i+j*NTIMES+k]);
					tmin = MIN(tmin, TimesByRank[4*NTIMES*i+j*NTIMES+k]);
				}
				// printf("DEBUG: Final Timing: iter %d, kernel %lu, final tmin %f\n",k,j,tmin);
				times[j][k] = tmin;
			}
		}

	// Back to the original code, but now using the minimum global timing across all ranks
		for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
		{
		for (j=0; j<4; j++)
			{
			avgtime[j] = avgtime[j] + times[j][k];
			mintime[j] = MIN(mintime[j], times[j][k]);
			maxtime[j] = MAX(maxtime[j], times[j][k]);
			}
		}
    
		// note that "bytes[j]" is the aggregate array size, so no "numranks" is needed here
		printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
		for (j=0; j<4; j++) {
			avgtime[j] = avgtime[j]/(double)(NTIMES-1);

			printf("%s%11.1f  %11.6f  %11.6f  %11.6f\n", label[j],
			   1.0E-06 * bytes[j]/mintime[j],
			   avgtime[j],
			   mintime[j],
			   maxtime[j]);
		}
		printf(HLINE);
	}

    /* --- Every Rank Checks its Results --- */
#ifdef INJECTERROR
	a[11] = 100.0 * a[11];
#endif
	computeSTREAMerrors(&AvgError[0], &AvgError[1], &AvgError[2]);
	/* --- Collect the Average Errors for Each Array on Rank 0 --- */
	MPI_Gather(AvgError, 3, MPI_DOUBLE, AvgErrByRank, 3, MPI_DOUBLE, 0, MPI_COMM_WORLD);

	/* -- Combined averaged errors and report on Rank 0 only --- */
	if (myrank == 0) {
#ifdef VERBOSE
		for (k=0; k<numranks; k++) {
			printf("VERBOSE: rank %d, AvgErrors %e %e %e\n",k,AvgErrByRank[3*k+0],
				AvgErrByRank[3*k+1],AvgErrByRank[3*k+2]);
		}
#endif
		checkSTREAMresults(AvgErrByRank,numranks);
		printf(HLINE);
	}

#ifdef VERBOSE
	if (myrank == 0) {
		t1 = MPI_Wtime();
		printf("VERBOSE: total shutdown time for rank %d = %f seconds\n",myrank,t1-t0);
	}
#endif

	free(a);
	free(b);
	free(c);
	if (myrank == 0) {
		free(TimesByRank);
		free(AvgErrByRank);
	}

    MPI_Finalize();
	return(0);
}

Ejemplo n.º 8

Mostrar archivo

Archivo: stream_example.c Proyecto: ArturKoziej/memkind

int
main(int argc, char **argv)
    {
    int			quantum, checktick();
    int			BytesPerWord;
    int			k;
    ssize_t		j;
    STREAM_TYPE		scalar;
    double		t, times[4][NTIMES];
#ifdef ENABLE_DYNAMIC_ALLOC
    int	err = 0;
    memkind_t kind;
    char err_msg[ERR_MSG_SIZE];
    if (argc > 1 && (strncmp("--help", argv[1], strlen("--help")) == 0 ||
                     strncmp("-h", argv[1], strlen("-h")) == 0)) {
        printf("Usage: %s [memkind_default | memkind_hbw | memkind_hbw_hugetlb | \n" 
               "    memkind_hbw_preferred | memkind_hbw_preferred_hugetlb | \n"
               "    memkind_hbw_gbtlb | memkind_hbw_preferred_gbtlb | memkind_gbtlb | \n"
               "    memkind_hbw_interleave | memkind_interleave]\n",
               argv[0]);
        return 0;
    }
#endif

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    printf("STREAM version $Revision: 5.10 $\n");
#ifdef ENABLE_DYNAMIC_ALLOC
    printf("Variant that uses the memkind library for dynamic memory allocation.\n");
#endif
    printf(HLINE);
    BytesPerWord = sizeof(STREAM_TYPE);
    printf("This system uses %d bytes per array element.\n",
	BytesPerWord);

    printf(HLINE);
#ifdef N
    printf("*****  WARNING: ******\n");
    printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
    printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
    printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
    printf("*****  WARNING: ******\n");
#endif

    printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
    printf("Memory per array = %.1f MiB (= %.1f GiB).\n", 
	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
    printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
    printf("Each kernel will be executed %d times.\n", NTIMES);
    printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
    printf(" will be used to compute the reported bandwidth.\n");

#ifdef _OPENMP
    printf(HLINE);
#pragma omp parallel 
    {
#pragma omp master
	{
	    k = omp_get_num_threads();
	    printf ("Number of Threads requested = %i\n",k);
        }
    }
#endif

#ifdef _OPENMP
	k = 0;
#pragma omp parallel
#pragma omp atomic 
		k++;
    printf ("Number of Threads counted = %i\n",k);
#endif

#ifdef ENABLE_DYNAMIC_ALLOC
    if (argc > 1) {
        err = memkind_get_kind_by_name(argv[1], &kind);
    }
    else {
        err = memkind_get_kind_by_name("memkind_default", &kind);
    }
    if (err) {
        memkind_error_message(err, err_msg, ERR_MSG_SIZE);
        fprintf(stderr, "ERROR: %s\n", err_msg);
        return -1;
    }
    err = memkind_posix_memalign(kind, (void **)&a, 2097152, BytesPerWord * (STREAM_ARRAY_SIZE + OFFSET));
    if (err) {
        fprintf(stderr, "ERROR: Unable to allocate stream array a\n");
        return -err;
    }
    err = memkind_posix_memalign(kind, (void **)&b, 2097152, BytesPerWord * (STREAM_ARRAY_SIZE + OFFSET));
    if (err) {
        fprintf(stderr, "ERROR: Unable to allocate stream array b\n");
        return -err;
    }
    err = memkind_posix_memalign(kind, (void **)&c, 2097152, BytesPerWord * (STREAM_ARRAY_SIZE + OFFSET));
    if (err) {
        fprintf(stderr, "ERROR: Unable to allocate stream array c\n");
        return -err;
    }

#endif
    /* Get initial value for system clock. */
#pragma omp parallel for
    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
	    a[j] = 1.0;
	    b[j] = 2.0;
	    c[j] = 0.0;
	}

    printf(HLINE);

    if  ( (quantum = checktick()) >= 1) 
	printf("Your clock granularity/precision appears to be "
	    "%d microseconds.\n", quantum);
    else {
	printf("Your clock granularity appears to be "
	    "less than one microsecond.\n");
	quantum = 1;
    }

    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
		a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    printf("Each test below will take on the order"
	" of %d microseconds.\n", (int) t  );
    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING -- The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);
    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];
	
	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];
	
	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];
	
	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
    
    printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
		avgtime[j] = avgtime[j]/(double)(NTIMES-1);

		printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       avgtime[j],
	       mintime[j],
	       maxtime[j]);
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

#ifdef ENABLE_DYNAMIC_ALLOC
    memkind_free(kind, c);
    memkind_free(kind, b);
    memkind_free(kind, a);
#endif
    return 0;
}