Code example #1
File: stream.c  Project: ecylmz/docker-rkt-kvm-comp
static int
checktick()
    {
    int  i, minDelta, Delta;
    double t1, t2, timesfound[M];

/*  Collect a sequence of M unique time values from the system. */

    for (i = 0; i < M; i++) {
      t1 = mysecond();
      while( ((t2=mysecond()) - t1) < 1.0E-6 )
        ;
      timesfound[i] = t1 = t2;
    }

/*
 * Determine the minimum difference between these M values.
 * This result will be our estimate (in microseconds) for the
 * clock granularity.
 */

    minDelta = 1000000;
    for (i = 1; i < M; i++) {
      Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
      minDelta = Mmin(minDelta, Mmax(Delta,0));
    }

   return(minDelta);
    }
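All of the snippets on this page time work with mysecond(), but none of them shows its definition. Below is a minimal sketch, assuming the conventional gettimeofday()-based wall-clock timer that STREAM-style benchmarks typically bundle; it is not taken from any of the projects listed here, and a few of those projects appear to use a variant that returns microseconds rather than seconds.

#include <stddef.h>
#include <sys/time.h>

/* Hypothetical definition: wall-clock time in seconds since the Epoch. */
double mysecond(void)
{
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return (double) tp.tv_sec + (double) tp.tv_usec * 1.0e-6;
}

With a definition like this, checktick() above reports the measured clock granularity in whole microseconds.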
Code example #2
File: deps_shaping_02.c  Project: LindaLovelace/rose
void lu_dependencies( double* M[NB][NB] )
{
    float t_start,t_end;
    float time;
    t_start=mysecond();

    int ii, jj, kk;
    for (kk=0; kk<NB; kk++) {
        {
            double *diag = M[kk][kk];
#pragma omp task depend(inout: [BSIZE][BSIZE]diag)
            lu0(diag);
        }
        for (jj=kk+1; jj<NB; jj++)
            if (M[kk][jj] != NULL) {
                double *diag = M[kk][kk];
                double *col = M[kk][jj];
#pragma omp task depend(in: [BSIZE][BSIZE]diag) depend(inout: [BSIZE][BSIZE]col)
                fwd(diag, col);
            }
            
        for (ii=kk+1; ii<NB; ii++) {
            if (M[ii][kk] != NULL) {
                {
                    double *row = M[kk][kk];
                    double *diag = M[ii][kk];
#pragma omp task depend(in: [BSIZE][BSIZE]diag) depend(inout: [BSIZE][BSIZE]row)
                    bdiv (diag, row);
                }

                for (jj=kk+1; jj<NB; jj++) {
                    if (M[kk][jj] != NULL) {
                        if (M[ii][jj]==NULL)
                            M[ii][jj]=allocate_clean_block();
                        {
                            double *row = M[ii][kk];
                            double *col = M[kk][jj];
                            double *inner = M[ii][jj];
#pragma omp task depend(in: [BSIZE][BSIZE]row, [BSIZE][BSIZE]col) depend(inout: [BSIZE][BSIZE]inner)
                            bmod(row, col, inner);
                        }    
                    }
                }
            }
        }
    }

#pragma omp taskwait

    t_end=mysecond();
    time = t_end-t_start;
    printf("Dependencies time to compute = %f usec\n", time);
}
Code example #3
File: deps_shaping_02.c  Project: LindaLovelace/rose
void lu_serial( double* M[NB][NB] )
{
    float t_start,t_end;
    float time;
    t_start= mysecond();

    int ii, jj, kk;
    for (kk=0; kk<NB; kk++) {
        {
            double *diag = M[kk][kk];
            lu0(diag);
        }

        for (jj=kk+1; jj<NB; jj++)
            if (M[kk][jj] != NULL)
            {
                double *diag = M[kk][kk];
                double *col = M[kk][jj];
                fwd(diag, col);
            }

        for (ii=kk+1; ii<NB; ii++) {
            if (M[ii][kk] != NULL) {
                {
                    double *row = M[kk][kk];
                    double *diag = M[ii][kk];
                    bdiv (diag, row);
                }

                for (jj=kk+1; jj<NB; jj++) {
                    if (M[kk][jj] != NULL) {
                        if (M[ii][jj]==NULL)
                            M[ii][jj]=allocate_clean_block();
                        {
                            double *row = M[ii][kk];
                            double *col = M[kk][jj];
                            double *inner = M[ii][jj];
                            bmod(row, col, inner);
                        }
                    }
                }
            }
        }
    }

    t_end=mysecond();

    time = t_end-t_start;
    printf("Serial time to compute = %f usec\n", time);
}
Code example #4
// Prints the final result of the computation. Called as the last EDT.
ocrGuid_t finalPrintEdt(u32 paramc, u64 *paramv, u32 depc, ocrEdtDep_t depv[]) {
	int i;
	u64 N = paramv[0];
	bool verbose = paramv[1];
	bool printResults = paramv[2];
	float *data_in = (float*)depv[1].ptr;
	float *data_real = (float*)depv[2].ptr;
	float *data_imag = (float*)depv[3].ptr;
	float *x_in = (float*)data_in;
	float *X_real = (float*)(data_real);
	float *X_imag = (float*)(data_imag);
	double *startTime = (double*)(depv[4].ptr);

	if(verbose) {
		PRINTF("Final print EDT\n");
	}

	double endTime = mysecond();
	PRINTF("%f\n",endTime-*startTime);

	if(printResults) {
		PRINTF("Starting values:\n");
		for(i=0;i<N;i++) {
			PRINTF("%d [ %f ]\n",i,x_in[i]);
		}
		PRINTF("\n");

		PRINTF("Final result:\n");
		for(i=0;i<N;i++) {
			PRINTF("%d [%f + %fi]\n",i,X_real[i],X_imag[i]);
		}
	}
	ocrShutdown();
	return NULL_GUID;
}
Code example #5
File: dgemm.c  Project: ursache/HPC-hacks
double time_dgemm_blas(const int M, const unsigned N, const int K,
		const double alpha, const double *A, const int lda, 
		const double *B, const int ldb,
		const double beta, double *C, const int ldc)
{

	double mflops, mflop_s;
	double secs = -1;

	int num_iterations = NRUNS;
	int i;

	char transa = 'n';
	char transb = 'n';

	double* Ca = (double*) _mm_malloc(N*ldc*sizeof(double), 32);

	double cpu_time = 0;

	for (i = 0; i < num_iterations; ++i)
	{
		memcpy(Ca, C, N*ldc*sizeof(double));
		cpu_time -= mysecond();	
#ifdef PAPI
		PAPI_START;
#endif
		dgemm_(&transa, &transb, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, Ca, &ldc);
#ifdef PAPI
		PAPI_STOP;
		PAPI_PRINT;
#endif
		//dgemm (M, N, K, alpha, A, lda, B, ldb, beta, Ca, ldc);
		cpu_time += mysecond();
	}

	mflops  = 2.0*num_iterations*M*N*K/1.0e6;
	secs    = cpu_time;
	mflop_s = mflops/secs;

	memcpy(C, Ca, N*ldc*sizeof(double));
#ifdef PAPI
	PAPI_FLUSH;
#endif
	_mm_free(Ca);

	return mflop_s;
}
Code example #6
File: sub_copy.c  Project: avr-aics-riken/PMlib
void stream_copy()
{
    int			quantum, checktick();
    int			BytesPerWord;
    register int	j, k;
    double		scalar, times[4][NTIMES];

    k = 0;
#ifdef _OPENMP
    k = omp_get_max_threads();
#endif
    printf("Modified STREAM COPY, num_threads=%d, array size= %d\n", k, N);

#pragma omp parallel for
    for (j=0; j<N; j++) {
	a[j] = 1.0;
	b[j] = 2.0;
	c[j] = 0.0;
	}

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#pragma omp parallel for
	for (j=0; j<N; j++)
	    c[j] = a[j];
	times[0][k] = mysecond() - times[0][k];

	}

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
		j=0;
		avgtime[j] = avgtime[j] + times[j][k];
		mintime[j] = MIN(mintime[j], times[j][k]);
		maxtime[j] = MAX(maxtime[j], times[j][k]);
	}
    printf("Function    Rate (MB/s)   Avg time     Min time     Max time\n");
	{
    j=0;
	avgtime[j] = avgtime[j]/(double)(NTIMES-1);
	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);
    }
}
Code example #7
void showStatsFooter() {
	HASSERT(benchmark_start_time_stats != 0);
	double dur = (((double)(mysecond()-benchmark_start_time_stats))/1000000) * 1000; //msec
	if(upcxx::global_myrank() == 0) {
		print_topology_information();
	}
	runtime_statistics(dur);
}
Code example #8
File: ex01.c  Project: jeudyx/hpc_school_trieste
int main(int argc, char **argv) 
{
    int i, j;
    const int  n=1000, m=1000;
    double  a[n][m];
    double  t1,t2;

    t1=mysecond();
    for ( i=0; i<1000; ++i) {
        for ( j=0; j<1000; ++j) {
           a[i][j] = i+j;
        }
    }
    t2=mysecond();
  
    printf("time used %g\n", t2-t1);
    return 0;
}
Code example #9
File: mmc.c  Project: jbradt/extreme-scale
int main(int argc, char *argv[])
{
    double sum, tStart, tEnd, tLoop, rate, t;
    int    i, j, k, tests;

    /* Initialize the matrices */
    /* Note that this is *not* in the best order with respect to cache;
       this will be discussed later in the course. */
    for (i=0; i<matSize; i++)
	for (j=0; j<matSize; j++) {
	    matA[ind(i,j)] = 1.0 + i;
	    matB[ind(i,j)] = 1.0 + j;
	    matC[ind(i,j)] = 0.0;
	}

    tLoop = 1.0e10;
    for (tests=0; tests<maxTest; tests++) {
	tStart = mysecond();
	for (i=0; i<matSize; i++)
	    for (j=0; j<matSize; j++) {
		sum = 0.0;
		for (k=0; k<matSize; k++)
		    sum += matA[ind(i,k)] * matB[ind(k,j)];
		matC[ind(i,j)] = sum;
	    }
	tEnd = mysecond();
	t = tEnd - tStart;
	dummy(matA, matB, matC);
	if (t < tLoop) tLoop = t;
	if (matC[ind(0,0)] < 0) {
	    fprintf(stderr, "Failed matC sign test\n");
	}
    }

    /* Note that explicit formats are used to limit the number of
       significant digits printed (at most this many digits are significant) */
    printf("Matrix size = %d\n", matSize);
    printf("Time        = %.2e secs\n", tLoop);
    rate = (2.0 * matSize) * matSize * (matSize / tLoop);
    printf("Rate        = %.2e MFLOP/s\n", rate * 1.0e-6);

    return 0;
}
Code example #10
File: blockTranspose.c  Project: harterj/mth699
int main(int argc, char *argv[])
{
    /* Initialize random number generator */
    srand((int) time(&q));

    // Fill array with random numbers
    for (i = 0; i < m; ++i) {//row
      for (j = 0; j < n; ++j) {
        arrayOne[i][j] = rand() % (m * n);
      }
    }

      tLoop = 1.0e10;
      for (tests = 0; tests < maxTest; tests++) {
      tStart = mysecond(); // start timing outer loop
    // Perform the Transpose
      for (i = 0; i < m; i+=block) {
        for (j = 0; j < n; j+=block) {
          for (ii = i; ii < min(i+block-1,m); ii++) {
            for (jj = j; jj < min(j+block-1,n); jj++) {
                arrayTwo[ii][jj] = arrayOne[jj][ii];
              }
            }
          }
        }

      //
      tEnd = mysecond(); // end timing outer loop
        t = tEnd - tStart; // compute outer run time
        if (t < tLoop) tLoop = t; // set tLoop to t, the run time
      }

      /* Note that explicit formats are used to limit the number of
         significant digits printed (at most this many digits are significant) */
      printf("Matrix         = %d x %d\n", m, n);
      printf("Time           = %.2e secs\n", t);
      rate = (8.0 * m * n) / (t);
      printf("Rate               = %.2e MB/s\n", rate * 1.0e-6); // * 1.0e-6

  return 0;
}
Code example #11
void showStatsHeader() {
	if(upcxx::global_myrank() == 0) {
		cout << endl;
		cout << "-----" << endl;
		cout << "mkdir timedrun fake" << endl;
		cout << endl;
	}
	initialize_hcWorker();
	if(upcxx::global_myrank() == 0) {
		cout << endl;
		cout << "-----" << endl;
	}
	benchmark_start_time_stats = mysecond();
}
Code example #12
File: timing.cpp  Project: EBone/Faust
void endTiming(const char* msg)
{
    if (gTimingSwitch) {
        faustassert(gTimingIndex > 0);
        gEndTime[--gTimingIndex] = mysecond();
        if (gTimingLog) {
            *gTimingLog << msg << "\t" << gEndTime[gTimingIndex] - gStartTime[gTimingIndex] << endl;
            gTimingLog->flush();
        } else {
            tab(gTimingIndex, cerr);
            cerr << "end " << msg << " (duration : " << gEndTime[gTimingIndex] - gStartTime[gTimingIndex] << ")" << endl;
        }
    }
}
Code example #13
void stats_initTimelineEvents() {
	if(app_total_time_estimate) {
		assert(app_total_time_estimate != NULL);
		app_tTotal = atof(app_total_time_estimate);
		app_tStart = mysecond();
		double curr_tStep = 0;
		double tStep = app_tTotal/((double) MAX_TIMESTEPS);
		// calculate timesteps values
		for(int i=0; i<MAX_TIMESTEPS; i++) {
			curr_tStep += tStep;
			app_timesteps[i] = curr_tStep;
			fail_steals_timeline[i] = 0;
		}
		if(upcxx::global_myrank() == 0) {
			printf(">>> HCPP_APP_EXEC_TIME\t\t= %f seconds\n",app_tTotal);
		}
	}
}
Code example #14
int benchmark(size_t *i, double *time, size_t *count, double preferred_time)
{
    double timediff, now;
    ++*i;
    if (*i < *count) {
        return 1;
    }
    now = mysecond();
    timediff = now - *time;
    if (timediff < preferred_time) {
        /* if it's too short, double the number of repeats */
        *i = 0;
        *time = now;
        *count *= 2;
        return 1;
    }
    *time = timediff / *count;
    return 0;
}
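benchmark() above implements an adaptive repeat-doubling loop: the caller keeps invoking the code under test while benchmark() returns 1, the repeat count is doubled until one measurement window spans at least preferred_time, and on the final call *time is overwritten with the average seconds per repetition. A hypothetical driver is sketched below, assuming a workload() under test and the mysecond() sketch given earlier; the names and the 0.5 s target are illustrative, not from the source.

#include <stdio.h>

double mysecond(void);                                  /* see the sketch above */
int benchmark(size_t *i, double *time, size_t *count, double preferred_time);
void workload(void);                                    /* hypothetical code under test */

void time_workload(void)
{
    size_t i = (size_t) -1;      /* benchmark() pre-increments *i, so start one below zero */
    size_t count = 1;            /* repetitions per measurement window */
    double t = mysecond();       /* start of the current window */

    while (benchmark(&i, &t, &count, 0.5))
        workload();

    /* benchmark() returned 0: t now holds the average seconds per workload() call. */
    printf("%zu repeats, %g s per call\n", count, t);
}

This matches how fullbenchmark() (example #16 below) seeds subrepeat_index with (size_t)(-1) before entering the same loop.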
Code example #15
File: timing.cpp  Project: EBone/Faust
void startTiming(const char* msg)
{
    // timing
    gTimingLog = (getenv("FAUST_TIMING")) ? new ofstream("FAUST_TIMING_LOG", ios::app) : NULL;
    if (gTimingLog) {
        *gTimingLog << endl;
    }
    
    if (gTimingSwitch) {
        faustassert(gTimingIndex < 1023);
        if (gTimingLog) {
            tab(gTimingIndex, *gTimingLog);
            *gTimingLog << "start " << msg << endl;
        } else {
            tab(gTimingIndex, cerr);
            cerr << "start " << msg << endl;
        }
        gStartTime[gTimingIndex++] = mysecond();
    }
}
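startTiming() and endTiming() (example #12 above) are written to be used as a nested pair: gStartTime/gEndTime act as a stack indexed by gTimingIndex, so each phase is bracketed with the same message string and nested pairs print with deeper indentation via tab(gTimingIndex, ...). A hypothetical call site, assuming gTimingSwitch has been enabled elsewhere and with an illustrative message:

    startTiming("compile");
    /* ... the phase being measured; nested startTiming()/endTiming()
       pairs inside it are reported one tab level deeper ... */
    endTiming("compile");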
Code example #16
int fullbenchmark(struct fullbenchmark *self)
{
    if (self->first) {
        self->first = 0;
        goto first;
    }
inner:
    if (benchmark(&self->subrepeat_index, &self->time,
                  &self->num_subrepeats, PREFERRED_TIME)) {
        return 1;
    }
    statistics_update(&self->stats, self->time);
    ++self->repeat_index;
first:
    if (self->repeat_index < self->num_repeats) {
        self->subrepeat_index = (size_t)(-1);
        self->time = mysecond();
        goto inner;
    }
    return 0;
}
Code example #17
void record_failedSteal_timeline() {
	total_failed_steals++;
	if(app_total_time_estimate) {
		const double tDuration = ((double)(mysecond()-app_tStart))/1000000;
		bool found = false;
		for(int i=0; i<MAX_TIMESTEPS; i++) {
			if(tDuration < app_timesteps[i]) {
				fail_steals_timeline[i]++;
				found = true;
				break;
			}
		}
		if(!found) {
			/*
			 * This executes when the app has already exceeded its estimated
			 * total execution time. If it takes longer to finish than
			 * estimated, we simply add this event to the last timestep.
			 */
			fail_steals_timeline[MAX_TIMESTEPS-1]++;
		}
	}
}
Code example #18
File: stream.c  Project: pousa/minas
int
main()
    {
    int			quantum, checktick();
    int			BytesPerWord;
    register int	j, k;
    double		scalar, t, times[4][NTIMES];

#ifdef MAI
    mai_init(NULL);
    a = mai_alloc_1D(N, sizeof(double),DOUBLE);
    b = mai_alloc_1D(N, sizeof(double),DOUBLE);
    c = mai_alloc_1D(N, sizeof(double),DOUBLE);
    mai_bind_columns(a);
    mai_bind_columns(b);
    mai_bind_columns(c);
#else
    a = malloc(N*sizeof(double));
    b = malloc(N*sizeof(double));
    c = malloc(N*sizeof(double));
#endif

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    printf("STREAM version $Revision: 5.9 $\n");
    printf(HLINE);
    BytesPerWord = sizeof(double);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
	BytesPerWord);

    printf(HLINE);

    printf("Total memory required = %.1f MB.\n",
	(3.0 * BytesPerWord) * ( (double) N / 1048576.0));
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

#ifdef _OPENMP
    printf(HLINE);
#pragma omp parallel 
    {
#pragma omp master
	{
	    k = omp_get_num_threads();
	    printf ("Number of Threads requested = %i\n",k);
        }
    }
#endif

    /* Get initial value for system clock. */
#pragma omp parallel for
    for (j=0; j<N; j++) {
	a[j] = 1.0;
	b[j] = 2.0;
	c[j] = 0.0;
	}

    printf(HLINE);

#ifdef MAI
    mai_cyclic(a);
    mai_cyclic(b);
    mai_cyclic(c);
#endif

    int chunk = 128;

    t = mysecond();
#pragma omp parallel for schedule(dynamic,chunk)
    for (j = 0; j < N; j++)
	a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for  schedule(dynamic,chunk)
	for (j=0; j<N; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];
	
	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for  schedule(dynamic,chunk)
	for (j=0; j<N; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];
	
	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for  schedule(dynamic,chunk)
	for (j=0; j<N; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];
	
	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for  schedule(dynamic,chunk)
	for (j=0; j<N; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
    
    printf("Function      Rate (MB/s)   Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
	avgtime[j] = avgtime[j]/(double)(NTIMES-1);

	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       avgtime[j],
	       mintime[j],
	       maxtime[j]);
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

    return 0;
}
Code example #19
extern "C" ocrGuid_t mainEdt(u32 paramc, u64* paramv, u32 depc, ocrEdtDep_t depv[]) {
	u64 argc = getArgc(depv[0].ptr);
	int i;
	char *argv[argc];
	for(i=0;i<argc;i++) {
		argv[i] = getArgv(depv[0].ptr,i);
	}

	u64 N;
	u64 iterations;
	bool verify;
	bool verbose;
	bool printResults;
	u64 serialBlockSize = SERIAL_BLOCK_SIZE_DEFAULT;
	if(!parseOptions(argc,argv,&N,&verify,&iterations,&verbose,&printResults,&serialBlockSize)) {
		printHelp(argv,true);
		ocrShutdown();
		return NULL_GUID;
	}
	if(verbose) {
		for(i=0;i<argc;i++) {
			PRINTF("argv[%d]: %s\n",i,argv[i]);
		}
	}
	if(iterations > 1 && verbose) {
		PRINTF("Running %d iterations\n",iterations);
	}

	ocrGuid_t startTempGuid,endTempGuid,printTempGuid,endSlaveTempGuid,iterationTempGuid;
	ocrEdtTemplateCreate(&iterationTempGuid, &fftIterationEdt, 7, 4);
	ocrEdtTemplateCreate(&startTempGuid, &fftStartEdt, 9, 3);
	ocrEdtTemplateCreate(&endTempGuid, &fftEndEdt, 9, 5);
	ocrEdtTemplateCreate(&endSlaveTempGuid, &fftEndSlaveEdt, 5, 3);
	ocrEdtTemplateCreate(&printTempGuid, &finalPrintEdt, 3, 5);
	
	float *x_in;
	// Output for the FFT
	float *X_real;
	float *X_imag;
	ocrGuid_t dataInGuid,dataRealGuid,dataImagGuid,timeDataGuid;
	// TODO: OCR cannot handle large datablocks
	DBCREATE(&dataInGuid, (void **) &x_in, sizeof(float) * N, 0, NULL_GUID, NO_ALLOC);
	DBCREATE(&dataRealGuid, (void **) &X_real, sizeof(float) * N, 0, NULL_GUID, NO_ALLOC);
	DBCREATE(&dataImagGuid, (void **) &X_imag, sizeof(float) * N, 0, NULL_GUID, NO_ALLOC);
	if(verbose) {
		PRINTF("3 Datablocks of size %lu (N=%lu) created\n",sizeof(float)*N,N);
	}
	
	for(i=0;i<N;i++) {
		x_in[i] = 0;
		X_real[i] = 0;
		X_imag[i] = 0;
	}
	x_in[1] = 1;
	//x_in[3] = -1;
	//x_in[5] = 1;
	//x_in[7] = -1;

	
	// Create an EDT out of the EDT template
	ocrGuid_t edtGuid, edtPrevGuid, printEdtGuid, edtEventGuid, edtPrevEventGuid;
	//ocrEdtCreate(&edtGuid, startTempGuid, EDT_PARAM_DEF, edtParamv, EDT_PARAM_DEF, NULL_GUID, EDT_PROP_FINISH, NULL_GUID, &edtEventGuid);

	std::stack<ocrGuid_t> edtStack;
	std::stack<ocrGuid_t> eventStack;
	edtEventGuid = NULL_GUID;
	edtPrevEventGuid = NULL_GUID;

	for(i=1;i<=iterations;i++) {
		u64 edtParamv[7] = { startTempGuid, endTempGuid, endSlaveTempGuid, N, verbose, serialBlockSize, i };
		ocrEdtCreate(&edtGuid, iterationTempGuid, EDT_PARAM_DEF, edtParamv, EDT_PARAM_DEF, NULL_GUID, EDT_PROP_FINISH, NULL_GUID, &edtEventGuid);
		edtStack.push(edtGuid);
		eventStack.push(edtEventGuid);
	}

	edtEventGuid = eventStack.top();
	if(verify) {
		edtEventGuid = setUpVerify(dataInGuid, dataRealGuid, dataImagGuid, N, edtEventGuid);
	}
	double *startTime;
	DBCREATE(&timeDataGuid, (void **) &startTime, sizeof(double), 0, NULL_GUID, NO_ALLOC);
	*startTime = mysecond();
	u64 edtParamv[3] = { N, verbose, printResults };
	// Create finish EDT, with dependence on last EDT
	ocrGuid_t finishDependencies[5] = { edtEventGuid, dataInGuid, dataRealGuid, dataImagGuid, timeDataGuid };
	ocrEdtCreate(&printEdtGuid, printTempGuid, EDT_PARAM_DEF, edtParamv, EDT_PARAM_DEF, finishDependencies, EDT_PROP_NONE, NULL_GUID, NULL);
	eventStack.pop();	

	while(!edtStack.empty()) {
		edtGuid = edtStack.top();
		if(!eventStack.empty()) {
			edtEventGuid = eventStack.top();
		} else {
			edtEventGuid = NULL_GUID;
		}
		ocrAddDependence(dataInGuid, edtGuid, 0, DB_MODE_RO);
		ocrAddDependence(dataRealGuid, edtGuid, 1, DB_MODE_ITW);
		ocrAddDependence(dataImagGuid, edtGuid, 2, DB_MODE_ITW);
		ocrAddDependence(edtEventGuid, edtGuid, 3, DB_MODE_RO);
		edtStack.pop();
		eventStack.pop();
	}

	return NULL_GUID;
}
Code example #20
File: stream.c  Project: ecylmz/docker-rkt-kvm-comp
int
HPCC_Stream(HPCC_Params *params, int doIO, double *copyGBs, double *scaleGBs, double *addGBs,
  double *triadGBs, int *failure) {
    int   quantum;
    int   BytesPerWord;
    register int j, k;
    double  scalar, t, times[4][NTIMES];
    FILE *outFile;
    double GiBs = 1073741824.0, curGBs;

    if (doIO) {
      // outFile = fopen( params->outFname, "w+" );
	  outFile = stdout;
      if (! outFile) {
        outFile = stderr;
        fprintf( outFile, "Cannot open output file.\n" );
        return 1;
      }
    }

    // VectorSize = HPCC_LocalVectorSize( params, 3, sizeof(double), 0 ); /* Need 3 vectors */
	// HARDCODED VectorSize
    // params->StreamVectorSize = VectorSize;

    a = HPCC_XMALLOC( double, VectorSize );
    b = HPCC_XMALLOC( double, VectorSize );
    c = HPCC_XMALLOC( double, VectorSize );

    if (!a || !b || !c) {
      if (c) HPCC_free(c);
      if (b) HPCC_free(b);
      if (a) HPCC_free(a);
      if (doIO) {
        fprintf( outFile, "Failed to allocate memory (%lu).\n", VectorSize );
        fflush( outFile );
        fclose( outFile );
      }
      return 1;
    }

    /* --- SETUP --- determine precision and check timing --- */

    if (doIO) {
    fprintf (outFile, "Generated on %s\n", params->nowASCII);
    fprintf( outFile, HLINE);
    BytesPerWord = sizeof(double);
    fprintf( outFile, "This system uses %d bytes per DOUBLE PRECISION word.\n",
             BytesPerWord);

    fprintf( outFile, HLINE);
    fprintf( outFile, "Array size = %lu, Offset = %d\n" , VectorSize, OFFSET);
    fprintf( outFile, "Total memory required = %.4f GiB.\n",
             (3.0 * BytesPerWord) * ( (double) VectorSize / GiBs));
    fprintf( outFile, "Each test is run %d times, but only\n", NTIMES);
    fprintf( outFile, "the *best* time for each is used.\n");
	fflush ( outFile);
    }

#ifdef _OPENMP
    if (doIO) fprintf( outFile, HLINE);
#pragma omp parallel private(k)
    {
#pragma omp single nowait
      {
        k = omp_get_num_threads();
        if (doIO) fprintf( outFile, "Number of Threads requested = %i\n",k);
        params->StreamThreads = k;
      }
    }
#endif

    /* Get initial value for system clock. */
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j=0; j<VectorSize; j++) {
      a[j] = 1.0;
      b[j] = 2.0;
      c[j] = 0.0;
    }

    if (doIO) fprintf( outFile, HLINE);

    if  ( (quantum = checktick()) >= 1) {
      if (doIO) fprintf( outFile, "Your clock granularity/precision appears to be "
                         "%d microseconds.\n", quantum);
    } else {
      if (doIO) fprintf( outFile, "Your clock granularity appears to be "
                         "less than one microsecond.\n");
    }

    t = mysecond();
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j = 0; j < VectorSize; j++)
      a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    if (doIO) {
    fprintf( outFile, "Each test below will take on the order"
             " of %d microseconds.\n", (int) t  );
    fprintf( outFile, "   (= %d clock ticks)\n", (int) (t/quantum) );
    fprintf( outFile, "Increase the size of the arrays if this shows that\n");
    fprintf( outFile, "you are not getting at least 20 clock ticks per test.\n");

    fprintf( outFile, HLINE);

    fprintf( outFile, "WARNING -- The above is only a rough guideline.\n");
    fprintf( outFile, "For best results, please be sure you know the\n");
    fprintf( outFile, "precision of your system timer.\n");
    fprintf( outFile, HLINE);
    }

    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
    {
      times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          c[j] = a[j];
#endif
        times[0][k] = mysecond() - times[0][k];

        times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          b[j] = scalar*c[j];
#endif
        times[1][k] = mysecond() - times[1][k];

        times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          c[j] = a[j]+b[j];
#endif
        times[2][k] = mysecond() - times[2][k];

        times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
          a[j] = b[j]+scalar*c[j];
#endif
        times[3][k] = mysecond() - times[3][k];
    }

    /* --- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
    {
      for (j=0; j<4; j++)
      {
        avgtime[j] = avgtime[j] + times[j][k];
        mintime[j] = Mmin(mintime[j], times[j][k]);
        maxtime[j] = Mmax(maxtime[j], times[j][k]);
      }
    }

    if (doIO)
    fprintf( outFile, "Function      Rate (GB/s)   Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
      avgtime[j] /= (double)(NTIMES - 1); /* note -- skip first iteration */

      /* make sure no division by zero */
      curGBs = (mintime[j] > 0.0 ? 1.0 / mintime[j] : -1.0);
      curGBs *= 1e-9 * bytes[j] * VectorSize;
        if (doIO)
          fprintf( outFile, "%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
                   curGBs,
                   avgtime[j],
                   mintime[j],
                   maxtime[j]);
        switch (j) {
          case 0: *copyGBs = curGBs; break;
          case 1: *scaleGBs = curGBs; break;
          case 2: *addGBs = curGBs; break;
          case 3: *triadGBs = curGBs; break;
        }
    }
    if (doIO) fprintf( outFile, HLINE);

    /* --- Check Results --- */
    checkSTREAMresults( outFile, doIO, failure );
    if (doIO) fprintf( outFile, HLINE);

    HPCC_free(c);
    HPCC_free(b);
    HPCC_free(a);

    if (doIO) {
      fflush( outFile );
      fclose( outFile );
    }

    return 0;
}
Code example #21
void start_finish_spmd_timer() {
	finish_spmd_start = mysecond();
}
Code example #22
int main(int argc, const char ** argv)
{
	ArgumentParser parser(argc,argv);

	// get the number of threads
	int nthreads = 0;
	#pragma omp parallel
	{
	#pragma omp atomic
	nthreads += 1;
	}
	
	// getting parameters
	int iSize = (int)parser("-size").asDouble(1.e8);
	int iIteration = (int)parser("-iterations").asDouble(10.);
	
	// running benchmarks
	double * timeHOI = new double[iIteration];
	map<string, vector<double> > peakPerformance;

	double * s = new double;

	// initialize value for the polynomial evaluation
	s[0] = 1e-6;
	
	// run the benchmark iIteration times
	for (int i=0; i<iIteration; i++)
	{
		#pragma omp parallel
		{
		ComputePower(s,iSize);
		}
	}

	for (int i=0; i<iIteration; i++)
	{
		//double * s = new double;
		//double s;
	
		timeHOI[i] = mysecond();
		#pragma omp parallel
		{
		ComputePower(s,iSize);
		}
		timeHOI[i] = mysecond() - timeHOI[i];
	}
	
	cout << "\nSummary\n";
	
	// compute performance of each benchmark run
	for (int i=0; i<iIteration; i++)
		peakPerformance["HOI"].push_back(1.e-9*(double)iSize*4*2*8*nthreads / timeHOI[i]);
	
	sort(peakPerformance["HOI"].begin(), peakPerformance["HOI"].end());
	
	// percentiles to be selected
	const int I1 = iIteration*.1;
	const int I2 = iIteration*.5;
	const int I3 = iIteration*.9;
	
	// output 10th, 50th, 90th percentiles
	cout << I1 << " Ranked Peak Performance\t" << I2 << " Ranked Peak Performance (median)\t" << I3 << " Ranked Peak Performance\n";
	cout << peakPerformance["HOI"][I1] << " GFLOP/s\t" << peakPerformance["HOI"][I2] << " GFLOP/s\t" << peakPerformance["HOI"][I3] << " GFLOP/s\n";
}
Code example #23
File: stream.cpp  Project: andreashappe/IncludeOS
int
main()
{
    int     checktick(void);
    int			quantum;
    int			BytesPerWord;
    int			k;
    ssize_t		j;
    STREAM_TYPE		scalar;
    double		t, times[4][NTIMES];

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    printf("STREAM version $Revision: 5.10 $\n");
    printf(HLINE);
    BytesPerWord = sizeof(STREAM_TYPE);
    printf("This system uses %d bytes per array element.\n",
	BytesPerWord);

    printf(HLINE);
#ifdef N
    printf("*****  WARNING: ******\n");
    printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
    printf("      This version of the code uses the preprocessor variable STREAM_ARRAY_SIZE to control the array size\n");
    printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
    printf("*****  WARNING: ******\n");
#endif

    printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
    printf("Memory per array = %.1f MiB (= %.1f GiB).\n", 
	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
    printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
    printf("Each kernel will be executed %d times.\n", NTIMES);
    printf(" The *best* time for each kernel (excluding the first iteration)\n"); 
    printf(" will be used to compute the reported bandwidth.\n");

#ifdef _OPENMP
    printf(HLINE);
#pragma omp parallel 
    {
#pragma omp master
	{
	    k = omp_get_num_threads();
	    printf ("Number of Threads requested = %i\n",k);
        }
    }
#endif

#ifdef _OPENMP
	k = 0;
#pragma omp parallel
#pragma omp atomic 
		k++;
    printf ("Number of Threads counted = %i\n",k);
#endif

    /* Get initial value for system clock. */
#pragma omp parallel for
    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
	    a[j] = 1.0;
	    b[j] = 2.0;
	    c[j] = 0.0;
	}

    printf(HLINE);

    if  ( (quantum = checktick()) >= 1) 
	printf("Your clock granularity/precision appears to be "
	    "%d microseconds.\n", quantum);
    else {
	printf("Your clock granularity appears to be "
	    "less than one microsecond.\n");
	quantum = 1;
    }

    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
		a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    printf("Each test below will take on the order"
	" of %d microseconds.\n", (int) t  );
    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING -- The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);
    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];
	
	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];
	
	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];
	
	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
    
    printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
		avgtime[j] = avgtime[j]/(double)(NTIMES-1);

		printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       avgtime[j],
	       mintime[j],
	       maxtime[j]);
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

    return 0;
}
Code example #24
File: ex2.c  Project: feelpp/debian-petsc
int main(int argc,char *argv[])
{
  PetscErrorCode ierr;
  int            quantum, checktick();
  int            BytesPerWord;
  int            j, k;
  double         scalar=3.0, t, times[4][NTIMES];

  PetscInitialize(&argc,&argv,0,help);
  /* --- SETUP --- determine precision and check timing --- */

  /*printf(HLINE);
    printf("STREAM version $Revision: 5.9 $\n");
    printf(HLINE); */
  BytesPerWord = sizeof(double);
  printf("This system uses %d bytes per DOUBLE PRECISION word.\n",BytesPerWord);

  printf(HLINE);
#if defined(NO_LONG_LONG)
  printf("Array size = %d, Offset = %d\n", N, OFFSET);
#else
  printf("Array size = %llu, Offset = %d\n", (unsigned long long) N, OFFSET);
#endif

  printf("Total memory required = %.1f MB.\n",(3.0 * BytesPerWord) * ((double)N / 1048576.0));
  printf("Each test is run %d times, but only\n", NTIMES);
  printf("the *best* time for each is used.\n");

  printf(HLINE);

#if !STATIC_ALLOC
  a = malloc((N+OFFSET)*sizeof(double));
  b = malloc((N+OFFSET)*sizeof(double));
  c = malloc((N+OFFSET)*sizeof(double));
#endif

#if WITH_PTHREADS
  ierr = PetscThreadCommGetNThreads(PETSC_COMM_WORLD,&nworkThreads);CHKERRQ(ierr);
  ierr = PetscMalloc((nworkThreads+1)*sizeof(PetscInt),&trstarts);CHKERRQ(ierr);
  PetscInt  Q,R,nloc;
  PetscBool S;
  Q           = (N+OFFSET)/nworkThreads;
  R           = (N+OFFSET) - Q*nworkThreads;
  trstarts[0] = 0;
  for (j=0; j < nworkThreads; j++) {
    S             = (PetscBool)(j < R);
    nloc          = S ? Q+1 : Q;
    trstarts[j+1] = trstarts[j]+nloc;
  }
  ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Initialize_Kernel,1,&scalar);CHKERRQ(ierr);
  ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
# else
  for (j=0; j<N; j++) {
    a[j] = 1.0;
    b[j] = 2.0;
    c[j] = 0.0;
  }
#endif

  /*printf(HLINE);*/

  /* Get initial value for system clock. */
  if  ((quantum = checktick()) >= 1) ;
  /*      printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */
  else quantum = 1;
  /*   printf("Your clock granularity appears to be less than one microsecond.\n"); */

  t = mysecond();

#if WITH_PTHREADS
  ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_2A_Kernel,0);CHKERRQ(ierr);
  ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
  for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j];
#endif
  t = 1.0E6 * (mysecond() - t);

  /*    printf("Each test below will take on the order of %d microseconds.\n", (int)t);
  printf("   (= %d clock ticks)\n", (int) (t/quantum));
  printf("Increase the size of the arrays if this shows that\n");
  printf("you are not getting at least 20 clock ticks per test.\n");

  printf(HLINE);
  */
  /*  --- MAIN LOOP --- repeat test cases NTIMES times --- */

  for (k=0; k<NTIMES; k++) {
    times[0][k] = mysecond();
#if WITH_PTHREADS
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Copy_Kernel,0);CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
    for (j=0; j<N; j++) c[j] = a[j];
#endif
    times[0][k] = mysecond() - times[0][k];

    times[1][k] = mysecond();
#if WITH_PTHREADS
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Scale_Kernel,1,&scalar);CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
    for (j=0; j<N; j++) b[j] = scalar*c[j];
#endif
    times[1][k] = mysecond() - times[1][k];

    times[2][k] = mysecond();
#if WITH_PTHREADS
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Add_Kernel,0);CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
    for (j=0; j<N; j++) c[j] = a[j]+b[j];
#endif
    times[2][k] = mysecond() - times[2][k];

    times[3][k] = mysecond();
#if WITH_PTHREADS
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Triad_Kernel,1,&scalar);CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#else
    for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j];
#endif
    times[3][k] = mysecond() - times[3][k];
  }

  /*  --- SUMMARY --- */

  for (k=1; k<NTIMES; k++)     /* note -- skip first iteration */
    for (j=0; j<4; j++) {
      avgtime[j] = avgtime[j] + times[j][k];
      mintime[j] = MIN(mintime[j], times[j][k]);
      maxtime[j] = MAX(maxtime[j], times[j][k]);
    }

  printf("Function      Rate (MB/s) \n");
  for (j=0; j<4; j++) {
    avgtime[j] = avgtime[j]/(double)(NTIMES-1);

    printf("%s%11.4f  \n", label[j], 1.0E-06 * bytes[j]/mintime[j]);
  }
  /* printf(HLINE);*/
#if WITH_PTHREADS
  ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr);
#endif
  /* --- Check Results --- */
  checkSTREAMresults();
  /*    printf(HLINE);*/
  PetscFinalize();
  return 0;
}
Code example #25
File: stream_omp.c  Project: HerbertJordan/insieme
int
main()
    {
    int			quantum, checktick();
    int			BytesPerWord;
    register int	j, k;
    double		scalar, t, times[4][NTIMES];

    /* --- SETUP --- determine precision and check timing --- */

    printf(HLINE);
    BytesPerWord = sizeof(double);
    printf("This system uses %d bytes per DOUBLE PRECISION word.\n",
	BytesPerWord);

    printf(HLINE);
    printf("Array size = %d, Offset = %d\n" , N, OFFSET);
    printf("Total memory required = %.1f MB.\n",
	(3.0 * BytesPerWord) * ( (double) N / 1048576.0));
    printf("Each test is run %d times, but only\n", NTIMES);
    printf("the *best* time for each is used.\n");

#ifdef _OPENMP
    printf(HLINE);
#pragma omp parallel private(k)
    {
//    k = omp_get_num_threads();
  //  printf ("Number of Threads requested = %i\n",k);
    }
#endif

    /* Get initial value for system clock. */
#pragma omp parallel for
    for (j=0; j<N; j++) {
	a[j] = 1.0;
	b[j] = 2.0;
	c[j] = 0.0;
	}

    printf(HLINE);

    if  ( (quantum = checktick()) >= 1) {
//	printf("Your clock granularity/precision appears to be "
//	    "%d microseconds.\n", quantum);
    } else {
//	printf("Your clock granularity appears to be "
//	    "less than one microsecond.\n");
    }

    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < N; j++)
	a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

//    printf("Each test below will take on the order"
//	" of %d microseconds.\n", (int) t  );
//    printf("   (= %d clock ticks)\n", (int) (t/quantum) );
    printf("Increase the size of the arrays if this shows that\n");
    printf("you are not getting at least 20 clock ticks per test.\n");

    printf(HLINE);

    printf("WARNING -- The above is only a rough guideline.\n");
    printf("For best results, please be sure you know the\n");
    printf("precision of your system timer.\n");
    printf(HLINE);
    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];
	
	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];
	
	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];
	
	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
	for (j=0; j<N; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
    
    printf("Function      Rate (MB/s)   Avg time     Min time     Max time\n");
    for (j=0; j<4; j++) {
	avgtime[j] = avgtime[j]/(double)(NTIMES-1);

/*	printf("%s%11.4f  %11.4f  %11.4f  %11.4f\n", label[j],
	       1.0E-06 * bytes[j]/mintime[j],
	       avgtime[j],
	       mintime[j],
	       maxtime[j]);*/
    }
    printf(HLINE);

    /* --- Check Results --- */
    checkSTREAMresults();
    printf(HLINE);

    return 0;
}
Code example #26
void end_finish_spmd_timer() {
	finish_spmd_duration += (((double)(mysecond()-finish_spmd_start))/1000000) * 1000; //msec
}
Code example #27
void end_time()
{
	secs = mysecond() - secs;
}
Code example #28
File: cg_test.c  Project: shamouda/x10-applications
int main(int argc,char** argv)
{

	int myrank=0,nprocs=1;
	int latsize[4],localsize[4];
	int netSize[16],netPos[16],netDim;
	int i,j,t,npIn,nsite;
	int Niter = QCD_NITER;
	QCDSpinor* pSrc;
	QCDSpinor* pDest;
	QCDMatrix* pGauge;

	QCDReal Enorm = QCD_ENORM;
	QCDReal Cks = QCD_CKS;
	QCDReal* pCorr;
	double tstart,tend,ttotal;
	char* pStr;
	int ItimeS,NtimeS,ics,ids,is,ie,ipet,it,Nconv,cnt;
	double CorrF,Diff,rr;
	unsigned long flops;
	double tt;

	latsize[0] = 0;
	latsize[1] = 0;
	latsize[2] = 0;
	latsize[3] = 0;

	netDim = 4;
	netSize[0] = 0;
	netSize[1] = 0;
	netSize[2] = 0;
	netSize[3] = 0;

	for(i=1;i<argc;i++){
		if(argv[i][0] == 'L'){
			t = 0;
			for(j=1;j<strlen(argv[i]);j++){
				if(argv[i][j] == 'x'){
					t++;
				}
				else if(argv[i][j] >= '0' && argv[i][j] <= '9'){
					latsize[t] = 10*latsize[t] + (int)(argv[i][j] - '0');
				}
			}
		}
		else if(argv[i][0] == 'P'){
			t = 0;
			for(j=1;j<strlen(argv[i]);j++){
				if(argv[i][j] == 'x'){
					t++;
				}
				else if(argv[i][j] >= '0' && argv[i][j] <= '9'){
					netSize[t] = 10*netSize[t] + (int)(argv[i][j] - '0');
				}
			}
		}
	}

	t = 0;
	for(i=0;i<4;i++){
		if(latsize[0] == 0){
			t++;
		}
	}
	if(t > 0){
		latsize[0] = QCD_NX;
		latsize[1] = QCD_NY;
		latsize[2] = QCD_NZ;
		latsize[3] = QCD_NT;
	}

	MPI_Init(&argc,&argv);

	MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
	MPI_Comm_rank(MPI_COMM_WORLD,&myrank);

	npIn = 1;
	for(i=0;i<4;i++){
		npIn *= netSize[i];

		//debug
		/* printf("netSize[%d] == %d\n", i, netSize[i]); */

	}
	if(npIn != nprocs){
		if(myrank == 0){
			printf("Number of processes is invalid\n");
		}
		return 0;
	}

	nsite = 1;
	for(i=0;i<4;i++){
		localsize[i] = latsize[i] / netSize[i];
		nsite *= localsize[i];
	}

	t = myrank;
	for(i=0;i<4;i++){
		netPos[i] = t % netSize[i];
		t /= netSize[i];
	}

	QCDDopr_Init(localsize[0],localsize[1],localsize[2],localsize[3],netSize[0],netSize[1],netSize[2],netSize[3],myrank);

	if(myrank == 0){
		printf("=============================================\n");
		printf("QCD base MPI program\n");
		printf("          Lattice size = %dx%dx%dx%d\n",latsize[0],latsize[1],latsize[2],latsize[3]);
		printf("Decomposed by %d procs : %dx%dx%dx%d\n",nprocs,netSize[0],netSize[1],netSize[2],netSize[3]);
		printf("    Local Lattice size = %dx%dx%dx%d\n",localsize[0],localsize[1],localsize[2],localsize[3]);
		printf("\n Cks = %f\n",Cks);
		printf("=============================================\n");
	}

	//debug
	/* printf("xxx\n"); */

	pGauge = (QCDMatrix*)malloc(sizeof(QCDMatrix) * 4 * nsite + 512);
	uinit((double*)pGauge,latsize[0],latsize[1],latsize[2],latsize[3]);

	//debug
	/* printf("xxx\n"); */

	pSrc = (QCDSpinor*)malloc(sizeof(QCDSpinor) * nsite + 128);
	pDest = (QCDSpinor*)malloc(sizeof(QCDSpinor) * nsite + 128);

	pCorr = (QCDReal*)malloc(sizeof(QCDReal) * latsize[3]);
	for(i=0;i<latsize[3];i++){
		pCorr[i] = 0.0;
	}

	ttotal = 0.0;
	/* for(ics=0;ics<QCD_NCOL;ics++){ */
	/* 	for(ids=0;ids<QCD_ND;ids++){ */
	for(ics=0;ics<1;ics++){
		for(ids=0;ids<1;ids++){
			set_src(ids,ics,pSrc,0);

			MPI_Barrier(MPI_COMM_WORLD);
			tstart = mysecond();
			Solve_CG(pDest,pGauge,pSrc,Cks,Enorm,&Nconv,&Diff);
			MPI_Barrier(MPI_COMM_WORLD);
			tend = mysecond() - tstart;
			ttotal += tend;

			if(myrank == 0){
				printf(" %3d %3d  %6d %12.4e ... %f sec\n", ics, ids, Nconv, Diff,tend);
			}

			for(i=0;i<latsize[3];i++){
				ipet = i/localsize[3];
				it = i % localsize[3];
				if(ipet == netPos[3]){
					is = it*localsize[0]*localsize[1]*localsize[2];
					QCDLA_Norm(&CorrF,pDest + is,localsize[0]*localsize[1]*localsize[2]);
				}
				else{
					CorrF = 0.0;
				}
				MPI_Allreduce(&CorrF,&rr,1,MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD);
				pCorr[i] = pCorr[i] + rr;
			}
		}
    }

	if(myrank == 0){

		printf("\nPs meson correlator:\n");
		for(i=0;i<latsize[3];i++){
			printf("%d: %0.8E\n",i,pCorr[i]);
		}

		printf("\n Avg. Solver Time = %f [sec]\n",ttotal / 12);
	}

	MPI_Barrier(MPI_COMM_WORLD);


	//debug
	/* printf("finish\n"); */


	return 0;
}
Code example #29
void start_time()
{
	secs = mysecond();
}
Code example #30
int
main()
    {
    int			quantum, checktick();
    int			BytesPerWord;
    int			k;
    ssize_t		j;
    STREAM_TYPE		scalar;
    double		t, times[4][NTIMES];

    /* --- SETUP --- determine precision and check timing --- */

    BytesPerWord = sizeof(STREAM_TYPE);

#ifdef _OPENMP
#pragma omp parallel 
    {
#pragma omp master
	{
	    k = omp_get_num_threads();
        }
    }
#endif

#ifdef _OPENMP
	k = 0;
#pragma omp parallel
#pragma omp atomic 
		k++;
#endif

    /* Get initial value for system clock. */
#pragma omp parallel for
    for (j=0; j<STREAM_ARRAY_SIZE; j++) {
	    a[j] = 1.0;
	    b[j] = 2.0;
	    c[j] = 0.0;
	}


    if  ( (quantum = checktick()) >= 1) 
	printf("");
    else {
	quantum = 1;
    }

    t = mysecond();
#pragma omp parallel for
    for (j = 0; j < STREAM_ARRAY_SIZE; j++)
		a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    
    /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
	{
	times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j];
#endif
	times[0][k] = mysecond() - times[0][k];
	
	times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    b[j] = scalar*c[j];
#endif
	times[1][k] = mysecond() - times[1][k];
	
	times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    c[j] = a[j]+b[j];
#endif
	times[2][k] = mysecond() - times[2][k];
	
	times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#pragma omp parallel for
	for (j=0; j<STREAM_ARRAY_SIZE; j++)
	    a[j] = b[j]+scalar*c[j];
#endif
	times[3][k] = mysecond() - times[3][k];
	}

    /*	--- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
	{
	for (j=0; j<4; j++)
	    {
	    avgtime[j] = avgtime[j] + times[j][k];
	    mintime[j] = MIN(mintime[j], times[j][k]);
	    maxtime[j] = MAX(maxtime[j], times[j][k]);
	    }
	}
    
    for (j=0; j<1; j++) {
		avgtime[j] = avgtime[j]/(double)(NTIMES-1);

		printf("%11.6f\n", 1.0E-06 * bytes[j]/mintime[j]);
    }

    /* --- Check Results --- */
    checkSTREAMresults();

    return 0;
}