/*
 * Estimate the granularity of mysecond().
 *
 * Gathers M timestamps, each taken only after mysecond() has advanced by at
 * least 1.0E-6 relative to the previous reference reading, then returns the
 * smallest gap between consecutive samples scaled by 1.0E6 and truncated to
 * int.  Negative gaps are clamped to zero; the result never exceeds 1000000.
 */
static int checktick()
{
    double samples[M];
    int smallest_gap = 1000000;
    int idx;

    /* Collect M distinct clock readings. */
    for (idx = 0; idx < M; idx++) {
        double reference = mysecond();
        double current;

        /* Spin until the clock has visibly moved on. */
        do {
            current = mysecond();
        } while ((current - reference) < 1.0E-6);

        samples[idx] = current;
    }

    /* The minimum spacing between neighbours is the granularity estimate. */
    for (idx = 1; idx < M; idx++) {
        int gap = (int)(1.0E6 * (samples[idx] - samples[idx - 1]));
        int clamped = Mmax(gap, 0);

        smallest_gap = Mmin(smallest_gap, clamped);
    }

    return smallest_gap;
}
/*
 * Blocked sparse LU factorisation of the NB x NB block matrix M, expressed as
 * tasks ordered through depend() clauses.
 *
 * For every step kk: lu0() runs on the diagonal block, fwd() on each non-NULL
 * block of row kk, bdiv() for each non-NULL block of column kk, and bmod() on
 * the trailing blocks.  A missing M[ii][jj] that receives fill-in is created
 * with allocate_clean_block().  The elapsed mysecond() interval, measured
 * around the whole task graph including the final taskwait, is printed.
 *
 * Timestamps are kept in double: a float has a 24-bit significand, so an
 * epoch-based clock value would be rounded far more coarsely than the
 * interval being measured.
 */
void lu_dependencies( double* M[NB][NB] )
{
    double t_start, t_end, elapsed;
    int ii, jj, kk;

    t_start = mysecond();

    for (kk = 0; kk < NB; kk++) {
        {
            double *diag = M[kk][kk];
            #pragma omp task depend(inout: [BSIZE][BSIZE]diag)
            lu0(diag);
        }

        for (jj = kk + 1; jj < NB; jj++) {
            if (M[kk][jj] != NULL) {
                double *diag = M[kk][kk];
                double *col = M[kk][jj];
                #pragma omp task depend(in: [BSIZE][BSIZE]diag) depend(inout: [BSIZE][BSIZE]col)
                fwd(diag, col);
            }
        }

        for (ii = kk + 1; ii < NB; ii++) {
            if (M[ii][kk] != NULL) {
                {
                    /*
                     * NOTE(review): "row" is the diagonal block and "diag" is
                     * the column block here, so the diagonal is the inout
                     * operand.  Kept exactly as in the serial variant;
                     * confirm against bdiv()'s parameter order.
                     */
                    double *row = M[kk][kk];
                    double *diag = M[ii][kk];
                    #pragma omp task depend(in: [BSIZE][BSIZE]diag) depend(inout: [BSIZE][BSIZE]row)
                    bdiv (diag, row);
                }

                for (jj = kk + 1; jj < NB; jj++) {
                    if (M[kk][jj] != NULL) {
                        /* Fill-in: create the target block on demand. */
                        if (M[ii][jj] == NULL)
                            M[ii][jj] = allocate_clean_block();
                        {
                            double *row = M[ii][kk];
                            double *col = M[kk][jj];
                            double *inner = M[ii][jj];
                            #pragma omp task depend(in: [BSIZE][BSIZE]row, [BSIZE][BSIZE]col) depend(inout: [BSIZE][BSIZE]inner)
                            bmod(row, col, inner);
                        }
                    }
                }
            }
        }
    }

    /* Wait for every outstanding task before stopping the clock. */
    #pragma omp taskwait
    t_end = mysecond();

    elapsed = t_end - t_start;
    printf("Dependencies time to compute = %f usec\n", elapsed);
}
/*
 * Sequential blocked sparse LU factorisation of the NB x NB block matrix M.
 *
 * Performs the same lu0 / fwd / bdiv / bmod sweep as the task-based variant,
 * one call after another, allocating fill-in blocks with
 * allocate_clean_block(), and prints the elapsed mysecond() interval.
 *
 * Timestamps are kept in double: a float has a 24-bit significand, so an
 * epoch-based clock value would be rounded far more coarsely than the
 * interval being measured.
 */
void lu_serial( double* M[NB][NB] )
{
    double t_start, t_end, elapsed;
    int ii, jj, kk;

    t_start = mysecond();

    for (kk = 0; kk < NB; kk++) {
        {
            double *diag = M[kk][kk];
            lu0(diag);
        }

        for (jj = kk + 1; jj < NB; jj++) {
            if (M[kk][jj] != NULL) {
                double *diag = M[kk][kk];
                double *col = M[kk][jj];
                fwd(diag, col);
            }
        }

        for (ii = kk + 1; ii < NB; ii++) {
            if (M[ii][kk] != NULL) {
                {
                    /*
                     * NOTE(review): "row" is the diagonal block and "diag" is
                     * the column block; confirm against bdiv()'s parameter
                     * order.  Left unchanged.
                     */
                    double *row = M[kk][kk];
                    double *diag = M[ii][kk];
                    bdiv (diag, row);
                }

                for (jj = kk + 1; jj < NB; jj++) {
                    if (M[kk][jj] != NULL) {
                        /* Fill-in: create the target block on demand. */
                        if (M[ii][jj] == NULL)
                            M[ii][jj] = allocate_clean_block();
                        {
                            double *row = M[ii][kk];
                            double *col = M[kk][jj];
                            double *inner = M[ii][jj];
                            bmod(row, col, inner);
                        }
                    }
                }
            }
        }
    }

    t_end = mysecond();
    elapsed = t_end - t_start;
    printf("Serial time to compute = %f usec\n", elapsed);
}
// Prints the final result of the computation. Called as the last EDT. ocrGuid_t finalPrintEdt(u32 paramc, u64 *paramv, u32 depc, ocrEdtDep_t depv[]) { int i; u64 N = paramv[0]; bool verbose = paramv[1]; bool printResults = paramv[2]; float *data_in = (float*)depv[1].ptr; float *data_real = (float*)depv[2].ptr; float *data_imag = (float*)depv[3].ptr; float *x_in = (float*)data_in; float *X_real = (float*)(data_real); float *X_imag = (float*)(data_imag); double *startTime = (double*)(depv[4].ptr); if(verbose) { PRINTF("Final print EDT\n"); } double endTime = mysecond(); PRINTF("%f\n",endTime-*startTime); if(printResults) { PRINTF("Starting values:\n"); for(i=0;i<N;i++) { PRINTF("%d [ %f ]\n",i,x_in[i]); } PRINTF("\n"); PRINTF("Final result:\n"); for(i=0;i<N;i++) { PRINTF("%d [%f + %fi]\n",i,X_real[i],X_imag[i]); } } ocrShutdown(); }
/*
 * Time NRUNS calls of the Fortran BLAS dgemm_ ('n','n') and return the
 * achieved rate in MFLOP/s (2*M*N*K flops per call).
 *
 * Each run starts from a fresh copy of C in a 32-byte aligned scratch buffer;
 * only the dgemm_ call (and the PAPI hooks, when enabled) sits between the
 * two mysecond() readings.  After the last run the scratch result is copied
 * back into C.
 *
 * Returns -1.0 without touching C if the scratch buffer cannot be allocated.
 */
double time_dgemm_blas(const int M, const unsigned N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc)
{
    /* Widen before multiplying so N*ldc cannot wrap in 32-bit arithmetic. */
    const size_t c_bytes = (size_t)N * (size_t)ldc * sizeof(double);
    const int num_iterations = NRUNS;
    char transa = 'n';
    char transb = 'n';
    double cpu_time = 0;
    double mflops, mflop_s;
    double *Ca;
    int i;

    Ca = (double*) _mm_malloc(c_bytes, 32);
    if (Ca == NULL) {
        return -1.0;
    }

    for (i = 0; i < num_iterations; ++i) {
        /* Restore the original C so every run computes the same product. */
        memcpy(Ca, C, c_bytes);

        cpu_time -= mysecond();
#ifdef PAPI
        PAPI_START;
#endif
        dgemm_(&transa, &transb, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, Ca, &ldc);
#ifdef PAPI
        PAPI_STOP;
        PAPI_PRINT;
#endif
        cpu_time += mysecond();
    }

    mflops = 2.0*num_iterations*M*N*K/1.0e6;
    mflop_s = mflops/cpu_time;

    memcpy(C, Ca, c_bytes);
#ifdef PAPI
    PAPI_FLUSH;
#endif
    _mm_free(Ca);

    return mflop_s;
}
void stream_copy() { int quantum, checktick(); int BytesPerWord; register int j, k; double scalar, times[4][NTIMES]; k = 0; #ifdef _OPENMP k = omp_get_max_threads(); #endif printf("Modified STREAM COPY, num_threads=%d, array size= %d\n", k, N); #pragma omp parallel for for (j=0; j<N; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #pragma omp parallel for for (j=0; j<N; j++) c[j] = a[j]; times[0][k] = mysecond() - times[0][k]; } for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { j=0; avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } printf("Function Rate (MB/s) Avg time Min time Max time\n"); { j=0; avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]); } }
void showStatsFooter() { HASSERT(benchmark_start_time_stats != 0); double dur = (((double)(mysecond()-benchmark_start_time_stats))/1000000) * 1000; //msec if(upcxx::global_myrank() == 0) { print_topology_information(); } runtime_statistics(dur); }
/*
 * Time how long it takes to fill a 1000 x 1000 array of doubles with i+j and
 * print the elapsed mysecond() interval.
 *
 * The array has static storage: at 8 MB it would otherwise be a
 * variable-length array on the stack, which can exhaust common default stack
 * limits.  The loops use the array bounds rather than repeating the literal
 * 1000.
 */
int main(int argc, char **argv)
{
    enum { ROWS = 1000, COLS = 1000 };
    static double a[ROWS][COLS];
    double t1, t2;
    int i, j;

    (void)argc;
    (void)argv;

    t1 = mysecond();
    for (i = 0; i < ROWS; ++i) {
        for (j = 0; j < COLS; ++j) {
            a[i][j] = i + j;
        }
    }
    t2 = mysecond();

    printf("time used %g\n", t2 - t1);
    return 0;
}
/*
 * Dense matrix-multiply timing loop.
 *
 * Initialises matA, matB and matC, then computes matC = matA * matB with the
 * textbook triple loop maxTest times, keeping the fastest run.  dummy() is
 * called after each run and matC[ind(0,0)] is sign-checked.  Prints the
 * matrix size, the best time, and the rate derived from 2*matSize^3 flops.
 */
int main(int argc, char *argv[])
{
    double acc, began, ended, best, flops_per_sec, elapsed;
    int row, col, inner, trial;

    /* Initialise the matrices.  This order is not cache-friendly. */
    for (row = 0; row < matSize; row++) {
        for (col = 0; col < matSize; col++) {
            matA[ind(row,col)] = 1.0 + row;
            matB[ind(row,col)] = 1.0 + col;
            matC[ind(row,col)] = 0.0;
        }
    }

    best = 1.0e10;
    trial = 0;
    while (trial < maxTest) {
        began = mysecond();
        for (row = 0; row < matSize; row++) {
            for (col = 0; col < matSize; col++) {
                acc = 0.0;
                for (inner = 0; inner < matSize; inner++) {
                    acc += matA[ind(row,inner)] * matB[ind(inner,col)];
                }
                matC[ind(row,col)] = acc;
            }
        }
        ended = mysecond();
        elapsed = ended - began;

        dummy(matA, matB, matC);

        if (elapsed < best) {
            best = elapsed;
        }
        if (matC[ind(0,0)] < 0) {
            fprintf(stderr, "Failed matC sign test\n");
        }
        trial++;
    }

    /* Explicit formats limit the output to the digits that are significant. */
    printf("Matrix size = %d\n", matSize);
    printf("Time = %.2e secs\n", best);
    flops_per_sec = (2.0 * matSize) * matSize * (matSize / best);
    printf("Rate = %.2e MFLOP/s\n", flops_per_sec * 1.0e-6);

    return 0;
}
int main(int argc, char *argv[]) { /* Initialize random number generator */ srand((int) time(&q)); // Fill array with random numbers for (i = 0; i < m; ++i) {//row for (j = 0; j < n; ++j) { arrayOne[i][j] = rand() % (m * n); } } tLoop = 1.0e10; for (tests = 0; tests < maxTest; tests++) { tStart = mysecond(); // start timing outer loop // Perform the Transpose for (i = 0; i < m; i+=block) { for (j = 0; j < n; j+=block) { for (ii = i; ii < min(i+block-1,m); ii++) { for (jj = j; jj < min(j+block-1,n); jj++) { arrayTwo[ii][jj] = arrayOne[jj][ii]; } } } } // tEnd = mysecond(); // end timing outer loop t = tEnd - tStart; // compute outer run time if (t < tLoop) tLoop = t; // set tLoop to t, the run time } /* Note that explicit formats are used to limit the number of significant digits printed (at most this many digits are significant) */ printf("Matrix = %d x %d\n", m, n); printf("Time = %.2e secs\n", t); rate = (8.0 * m * n) / (t); printf("Rate = %.2e MB/s\n", rate * 1.0e-6); // * 1.0e-6 return 0; }
// Opens the statistics window: rank 0 prints the banner lines, the worker
// state is initialised through initialize_hcWorker(), rank 0 prints the
// closing separator, and the start timestamp is stored in
// benchmark_start_time_stats.
void showStatsHeader() {
    const bool banner_rank = (upcxx::global_myrank() == 0);

    if (banner_rank) {
        cout << endl
             << "-----" << endl
             << "mkdir timedrun fake" << endl
             << endl;
    }

    initialize_hcWorker();

    // The rank is queried again, as before, after worker initialisation.
    if (upcxx::global_myrank() == 0) {
        cout << endl
             << "-----" << endl;
    }

    benchmark_start_time_stats = mysecond();
}
// Closes the most recently opened timing scope.
//
// Does nothing unless gTimingSwitch is set.  Otherwise pops one level off the
// timing stack, records the end timestamp, and reports the duration either as
// "msg<TAB>duration" to *gTimingLog (flushed immediately) or, when no log
// stream is set, as an indented line on cerr.
void endTiming(const char* msg) {
    if (!gTimingSwitch) {
        return;
    }

    faustassert(gTimingIndex > 0);
    --gTimingIndex;
    gEndTime[gTimingIndex] = mysecond();

    if (gTimingLog) {
        *gTimingLog << msg << "\t"
                    << gEndTime[gTimingIndex] - gStartTime[gTimingIndex]
                    << endl;
        gTimingLog->flush();
        return;
    }

    tab(gTimingIndex, cerr);
    cerr << "end " << msg << " (duration : "
         << gEndTime[gTimingIndex] - gStartTime[gTimingIndex]
         << ")" << endl;
}
void stats_initTimelineEvents() { if(app_total_time_estimate) { assert(app_total_time_estimate != NULL); app_tTotal = atof(app_total_time_estimate); app_tStart = mysecond(); double curr_tStep = 0; double tStep = app_tTotal/((double) MAX_TIMESTEPS); // calculate timesteps values for(int i=0; i<MAX_TIMESTEPS; i++) { curr_tStep += tStep; app_timesteps[i] = curr_tStep; fail_steals_timeline[i] = 0; } if(upcxx::global_myrank() == 0) { printf(">>> HCPP_APP_EXEC_TIME\t\t= %f seconds\n",app_tTotal); } } }
/*
 * One step of a self-calibrating repeat loop.
 *
 * Increments *i and returns 1 while it is still below *count.  Once the batch
 * is complete, the time since *time is measured with mysecond():
 *   - shorter than preferred_time: *i is reset to 0, *time is set to the new
 *     timestamp, *count is doubled, and 1 is returned (run another batch);
 *   - otherwise: *time receives the average time per repeat
 *     (elapsed / *count) and 0 is returned.
 */
int benchmark(size_t *i, double *time, size_t *count, double preferred_time)
{
    double stamp;
    double elapsed;

    *i += 1;
    if (*i < *count) {
        return 1;
    }

    stamp = mysecond();
    elapsed = stamp - *time;

    if (!(elapsed < preferred_time)) {
        /* Batch was long enough: report the per-repeat average. */
        *time = elapsed / *count;
        return 0;
    }

    /* Too short to trust: double the batch size and start over. */
    *i = 0;
    *time = stamp;
    *count *= 2;
    return 1;
}
void startTiming(const char* msg) { // timing gTimingLog = (getenv("FAUST_TIMING")) ? new ofstream("FAUST_TIMING_LOG", ios::app) : NULL; if (gTimingLog) { *gTimingLog << endl; } if (gTimingSwitch) { faustassert(gTimingIndex < 1023); if (gTimingLog) { tab(gTimingIndex, *gTimingLog); *gTimingLog << "start " << msg << endl; } else { tab(gTimingIndex, cerr); cerr << "start " << msg << endl; } gStartTime[gTimingIndex++] = mysecond(); } }
/*
 * Resumable driver for num_repeats calibrated measurements.
 *
 * Intended to be polled in a loop: returns 1 while the caller should run the
 * measured code once more, 0 when all repeats are done.
 *
 * The first call (self->first set) only arms the first repeat.  Every later
 * call forwards to benchmark(); when that reports a finished repeat, the
 * per-repeat time is fed to statistics_update() and the next repeat is armed
 * by resetting subrepeat_index to (size_t)-1 and re-reading the clock.
 */
int fullbenchmark(struct fullbenchmark *self)
{
    if (self->first) {
        /* Initial call: nothing has been measured yet. */
        self->first = 0;
    } else {
        if (benchmark(&self->subrepeat_index, &self->time,
                      &self->num_subrepeats, PREFERRED_TIME)) {
            return 1;
        }
        statistics_update(&self->stats, self->time);
        ++self->repeat_index;
    }

    for (;;) {
        if (!(self->repeat_index < self->num_repeats)) {
            return 0;
        }

        /* Arm the next repeat; benchmark() pre-increments the index. */
        self->subrepeat_index = (size_t)(-1);
        self->time = mysecond();

        if (benchmark(&self->subrepeat_index, &self->time,
                      &self->num_subrepeats, PREFERRED_TIME)) {
            return 1;
        }
        statistics_update(&self->stats, self->time);
        ++self->repeat_index;
    }
}
// Counts one failed steal and, when a total-time estimate is configured,
// attributes it to the first timeline bucket whose upper bound exceeds the
// time elapsed since app_tStart (mysecond() delta divided by 1000000).
void record_failedSteal_timeline() {
    total_failed_steals++;

    if (!app_total_time_estimate) {
        return;
    }

    const double tDuration = ((double)(mysecond()-app_tStart))/1000000;

    // Default to the last bucket: if the run outlasts the estimate, the
    // overflow is simply booked there.
    int bucket = MAX_TIMESTEPS-1;
    for (int candidate = 0; candidate < MAX_TIMESTEPS; candidate++) {
        if (tDuration < app_timesteps[candidate]) {
            bucket = candidate;
            break;
        }
    }

    fail_steals_timeline[bucket]++;
}
int main() { int quantum, checktick(); int BytesPerWord; register int j, k; double scalar, t, times[4][NTIMES]; #ifdef MAI mai_init(NULL); a = mai_alloc_1D(N, sizeof(double),DOUBLE); b = mai_alloc_1D(N, sizeof(double),DOUBLE); c = mai_alloc_1D(N, sizeof(double),DOUBLE); mai_bind_columns(a); mai_bind_columns(b); mai_bind_columns(c); #else a = malloc(N*sizeof(double)); b = malloc(N*sizeof(double)); c = malloc(N*sizeof(double)); #endif /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); printf("STREAM version $Revision: 5.9 $\n"); printf(HLINE); BytesPerWord = sizeof(double); printf("This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); printf(HLINE); printf("Total memory required = %.1f MB.\n", (3.0 * BytesPerWord) * ( (double) N / 1048576.0)); printf("Each test is run %d times, but only\n", NTIMES); printf("the *best* time for each is used.\n"); #ifdef _OPENMP printf(HLINE); #pragma omp parallel { #pragma omp master { k = omp_get_num_threads(); printf ("Number of Threads requested = %i\n",k); } } #endif /* Get initial value for system clock. 
*/ #pragma omp parallel for for (j=0; j<N; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } printf(HLINE); #ifdef MAI mai_cyclic(a); mai_cyclic(b); mai_cyclic(c); #endif int chunk = 128; t = mysecond(); #pragma omp parallel for schedule(dynamic,chunk) for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #ifdef TUNED tuned_STREAM_Copy(); #else #pragma omp parallel for schedule(dynamic,chunk) for (j=0; j<N; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #ifdef TUNED tuned_STREAM_Scale(scalar); #else #pragma omp parallel for schedule(dynamic,chunk) for (j=0; j<N; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #ifdef TUNED tuned_STREAM_Add(); #else #pragma omp parallel for schedule(dynamic,chunk) for (j=0; j<N; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #ifdef TUNED tuned_STREAM_Triad(scalar); #else #pragma omp parallel for schedule(dynamic,chunk) for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } printf("Function Rate (MB/s) Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]); } printf(HLINE); /* --- Check Results --- */ checkSTREAMresults(); printf(HLINE); return 0; }
// Entry EDT of the FFT benchmark.
//
// Parses the command line carried in depv[0], creates the EDT templates and
// the three float datablocks (input, real output, imaginary output), seeds
// the input with a unit impulse at index 1, creates one iteration EDT per
// requested iteration, creates the final print EDT, and wires the
// dependences so that each iteration EDT waits on the output event of the
// one created before it.
//
// Fixes relative to the previous revision:
//  - the event stack holds one entry fewer than the EDT stack once the top
//    event has been consumed for the print EDT, so the wiring loop popped an
//    empty std::stack on its last pass (undefined behaviour); pops are now
//    guarded;
//  - with zero iterations, top()/pop() were called on an empty stack;
//  - "Running %d iterations" passed a u64 to %d; it now uses %lu like the
//    other u64 prints in this function;
//  - x_in[1] is only written when the datablock has at least two elements.
extern "C" ocrGuid_t mainEdt(u32 paramc, u64* paramv, u32 depc, ocrEdtDep_t depv[]) {
    u64 argc = getArgc(depv[0].ptr);
    int i;
    char *argv[argc];
    for(i=0;i<argc;i++) {
        argv[i] = getArgv(depv[0].ptr,i);
    }

    u64 N;
    u64 iterations;
    bool verify;
    bool verbose;
    bool printResults;
    u64 serialBlockSize = SERIAL_BLOCK_SIZE_DEFAULT;
    if(!parseOptions(argc,argv,&N,&verify,&iterations,&verbose,&printResults,&serialBlockSize)) {
        printHelp(argv,true);
        ocrShutdown();
        return NULL_GUID;
    }

    if(verbose) {
        for(i=0;i<argc;i++) {
            PRINTF("argv[%d]: %s\n",i,argv[i]);
        }
    }
    if(iterations > 1 && verbose) {
        PRINTF("Running %lu iterations\n",iterations);
    }

    ocrGuid_t startTempGuid,endTempGuid,printTempGuid,endSlaveTempGuid,iterationTempGuid;
    ocrEdtTemplateCreate(&iterationTempGuid, &fftIterationEdt, 7, 4);
    ocrEdtTemplateCreate(&startTempGuid, &fftStartEdt, 9, 3);
    ocrEdtTemplateCreate(&endTempGuid, &fftEndEdt, 9, 5);
    ocrEdtTemplateCreate(&endSlaveTempGuid, &fftEndSlaveEdt, 5, 3);
    ocrEdtTemplateCreate(&printTempGuid, &finalPrintEdt, 3, 5);

    float *x_in;
    // Output for the FFT
    float *X_real;
    float *X_imag;
    ocrGuid_t dataInGuid,dataRealGuid,dataImagGuid,timeDataGuid;
    // TODO: OCR cannot handle large datablocks
    DBCREATE(&dataInGuid, (void **) &x_in, sizeof(float) * N, 0, NULL_GUID, NO_ALLOC);
    DBCREATE(&dataRealGuid, (void **) &X_real, sizeof(float) * N, 0, NULL_GUID, NO_ALLOC);
    DBCREATE(&dataImagGuid, (void **) &X_imag, sizeof(float) * N, 0, NULL_GUID, NO_ALLOC);
    if(verbose) {
        PRINTF("3 Datablocks of size %lu (N=%lu) created\n",sizeof(float)*N,N);
    }

    for(i=0;i<N;i++) {
        x_in[i] = 0;
        X_real[i] = 0;
        X_imag[i] = 0;
    }
    // Unit impulse at index 1; skipped when the buffer is too short.
    if(N > 1) {
        x_in[1] = 1;
    }

    // Create the iteration EDTs out of the EDT template.
    ocrGuid_t edtGuid, edtPrevGuid, printEdtGuid, edtEventGuid, edtPrevEventGuid;
    std::stack<ocrGuid_t> edtStack;
    std::stack<ocrGuid_t> eventStack;
    edtEventGuid = NULL_GUID;
    edtPrevEventGuid = NULL_GUID;

    for(i=1;i<=iterations;i++) {
        u64 edtParamv[7] = { startTempGuid, endTempGuid, endSlaveTempGuid, N, verbose, serialBlockSize, i };
        ocrEdtCreate(&edtGuid, iterationTempGuid, EDT_PARAM_DEF, edtParamv, EDT_PARAM_DEF, NULL_GUID, EDT_PROP_FINISH, NULL_GUID, &edtEventGuid);
        edtStack.push(edtGuid);
        eventStack.push(edtEventGuid);
    }

    // Output event of the last iteration (NULL_GUID when none were created).
    edtEventGuid = eventStack.empty() ? NULL_GUID : eventStack.top();
    if(verify) {
        edtEventGuid = setUpVerify(dataInGuid, dataRealGuid, dataImagGuid, N, edtEventGuid);
    }

    double *startTime;
    DBCREATE(&timeDataGuid, (void **) &startTime, sizeof(double), 0, NULL_GUID, NO_ALLOC);
    *startTime = mysecond();

    u64 edtParamv[3] = { N, verbose, printResults };
    // Create finish EDT, with dependence on last EDT
    ocrGuid_t finishDependencies[5] = { edtEventGuid, dataInGuid, dataRealGuid, dataImagGuid, timeDataGuid };
    ocrEdtCreate(&printEdtGuid, printTempGuid, EDT_PARAM_DEF, edtParamv, EDT_PARAM_DEF, finishDependencies, EDT_PROP_NONE, NULL_GUID, NULL);

    // The last iteration's event has been consumed by the print EDT.
    if(!eventStack.empty()) {
        eventStack.pop();
    }

    // Wire each iteration EDT, newest first, to its predecessor's event.
    // The oldest EDT has no predecessor and gets NULL_GUID in slot 3.
    while(!edtStack.empty()) {
        edtGuid = edtStack.top();
        edtStack.pop();

        if(!eventStack.empty()) {
            edtEventGuid = eventStack.top();
            eventStack.pop();
        } else {
            edtEventGuid = NULL_GUID;
        }

        ocrAddDependence(dataInGuid, edtGuid, 0, DB_MODE_RO);
        ocrAddDependence(dataRealGuid, edtGuid, 1, DB_MODE_ITW);
        ocrAddDependence(dataImagGuid, edtGuid, 2, DB_MODE_ITW);
        ocrAddDependence(edtEventGuid, edtGuid, 3, DB_MODE_RO);
    }

    return NULL_GUID;
}
/*
 * HPCC flavour of the STREAM benchmark.
 *
 * Allocates three arrays of VectorSize doubles, runs Copy / Scale / Add /
 * Triad NTIMES times each, and stores the best rate of each kernel in
 * *copyGBs, *scaleGBs, *addGBs and *triadGBs (-1.0-based value when the
 * minimum time is not positive).  When doIO is non-zero a report is written
 * to the output stream.  checkSTREAMresults() sets *failure.
 *
 * Returns 0 on success, 1 if the output stream or the arrays are unavailable.
 *
 * Fixes relative to the previous revision:
 *  - the report goes to stdout, which was then passed to fclose() on both
 *    exit paths, breaking all later output of the process; the stream is now
 *    only closed when it is not stdout/stderr;
 *  - outFile is initialised, since it is handed to checkSTREAMresults() even
 *    when doIO is zero;
 *  - a clock granularity below one microsecond left quantum at 0 and the
 *    "clock ticks" line divided by it; quantum is clamped to 1, as the
 *    reference STREAM code does.
 */
int HPCC_Stream(HPCC_Params *params, int doIO, double *copyGBs, double *scaleGBs, double *addGBs, double *triadGBs, int *failure)
{
    int quantum;
    int BytesPerWord;
    register int j, k;
    double scalar, t, times[4][NTIMES];
    FILE *outFile = NULL;
    double GiBs = 1073741824.0, curGBs;

    if (doIO) {
        /* Report to stdout (the original opened params->outFname). */
        outFile = stdout;
        if (! outFile) {
            outFile = stderr;
            fprintf( outFile, "Cannot open output file.\n" );
            return 1;
        }
    }

    /* VectorSize is hard-coded; three vectors are needed. */
    a = HPCC_XMALLOC( double, VectorSize );
    b = HPCC_XMALLOC( double, VectorSize );
    c = HPCC_XMALLOC( double, VectorSize );

    if (!a || !b || !c) {
        if (c) HPCC_free(c);
        if (b) HPCC_free(b);
        if (a) HPCC_free(a);
        if (doIO) {
            fprintf( outFile, "Failed to allocate memory (%lu).\n", VectorSize );
            fflush( outFile );
            /* Never close the process-wide standard streams. */
            if (outFile != stdout && outFile != stderr)
                fclose( outFile );
        }
        return 1;
    }

    /* --- SETUP --- determine precision and check timing --- */

    if (doIO) {
        fprintf (outFile, "Generated on %s\n", params->nowASCII);
        fprintf( outFile, HLINE);
        BytesPerWord = sizeof(double);
        fprintf( outFile, "This system uses %d bytes per DOUBLE PRECISION word.\n",
                 BytesPerWord);

        fprintf( outFile, HLINE);
        fprintf( outFile, "Array size = %lu, Offset = %d\n" , VectorSize, OFFSET);
        fprintf( outFile, "Total memory required = %.4f GiB.\n",
                 (3.0 * BytesPerWord) * ( (double) VectorSize / GiBs));
        fprintf( outFile, "Each test is run %d times, but only\n", NTIMES);
        fprintf( outFile, "the *best* time for each is used.\n");
        fflush ( outFile);
    }

#ifdef _OPENMP
    if (doIO) fprintf( outFile, HLINE);
#pragma omp parallel private(k)
    {
#pragma omp single nowait
        {
            k = omp_get_num_threads();
            if (doIO) fprintf( outFile, "Number of Threads requested = %i\n",k);
            params->StreamThreads = k;
        }
    }
#endif

    /* Initialise the arrays. */
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j=0; j<VectorSize; j++) {
        a[j] = 1.0;
        b[j] = 2.0;
        c[j] = 0.0;
    }

    if (doIO) fprintf( outFile, HLINE);

    if ( (quantum = checktick()) >= 1) {
        if (doIO) fprintf( outFile, "Your clock granularity/precision appears to be "
                           "%d microseconds.\n", quantum);
    } else {
        if (doIO) fprintf( outFile, "Your clock granularity appears to be "
                           "less than one microsecond.\n");
        /* Avoid dividing by zero in the clock-tick estimate below. */
        quantum = 1;
    }

    t = mysecond();
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (j = 0; j < VectorSize; j++)
        a[j] = 2.0E0 * a[j];
    t = 1.0E6 * (mysecond() - t);

    if (doIO) {
        fprintf( outFile, "Each test below will take on the order"
                 " of %d microseconds.\n", (int) t );
        fprintf( outFile, " (= %d clock ticks)\n", (int) (t/quantum) );
        fprintf( outFile, "Increase the size of the arrays if this shows that\n");
        fprintf( outFile, "you are not getting at least 20 clock ticks per test.\n");

        fprintf( outFile, HLINE);

        fprintf( outFile, "WARNING -- The above is only a rough guideline.\n");
        fprintf( outFile, "For best results, please be sure you know the\n");
        fprintf( outFile, "precision of your system timer.\n");
        fprintf( outFile, HLINE);
    }

    /* --- MAIN LOOP --- repeat test cases NTIMES times --- */

    scalar = 3.0;
    for (k=0; k<NTIMES; k++)
    {
        times[0][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Copy();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
            c[j] = a[j];
#endif
        times[0][k] = mysecond() - times[0][k];

        times[1][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Scale(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
            b[j] = scalar*c[j];
#endif
        times[1][k] = mysecond() - times[1][k];

        times[2][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Add();
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
            c[j] = a[j]+b[j];
#endif
        times[2][k] = mysecond() - times[2][k];

        times[3][k] = mysecond();
#ifdef TUNED
        tuned_STREAM_Triad(scalar);
#else
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (j=0; j<VectorSize; j++)
            a[j] = b[j]+scalar*c[j];
#endif
        times[3][k] = mysecond() - times[3][k];
    }

    /* --- SUMMARY --- */

    for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
    {
        for (j=0; j<4; j++)
        {
            avgtime[j] = avgtime[j] + times[j][k];
            mintime[j] = Mmin(mintime[j], times[j][k]);
            maxtime[j] = Mmax(maxtime[j], times[j][k]);
        }
    }

    if (doIO)
        fprintf( outFile, "Function Rate (GB/s) Avg time Min time Max time\n");
    for (j=0; j<4; j++) {
        avgtime[j] /= (double)(NTIMES - 1); /* note -- skip first iteration */

        /* make sure no division by zero */
        curGBs = (mintime[j] > 0.0 ? 1.0 / mintime[j] : -1.0);
        curGBs *= 1e-9 * bytes[j] * VectorSize;
        if (doIO)
            fprintf( outFile, "%s%11.4f %11.4f %11.4f %11.4f\n", label[j],
                     curGBs,
                     avgtime[j],
                     mintime[j],
                     maxtime[j]);
        switch (j) {
            case 0: *copyGBs = curGBs; break;
            case 1: *scaleGBs = curGBs; break;
            case 2: *addGBs = curGBs; break;
            case 3: *triadGBs = curGBs; break;
        }
    }
    if (doIO) fprintf( outFile, HLINE);

    /* --- Check Results --- */
    checkSTREAMresults( outFile, doIO, failure );
    if (doIO) fprintf( outFile, HLINE);

    HPCC_free(c);
    HPCC_free(b);
    HPCC_free(a);

    if (doIO) {
        fflush( outFile );
        /* Never close the process-wide standard streams. */
        if (outFile != stdout && outFile != stderr)
            fclose( outFile );
    }

    return 0;
}
// Stamps the beginning of a finish-SPMD phase: stores the current
// mysecond() reading in finish_spmd_start.
void start_finish_spmd_timer()
{
    finish_spmd_start = mysecond();
}
int main(int argc, const char ** argv) { ArgumentParser parser(argc,argv); // get the number of threads int nthreads = 0; #pragma omp parallel { #pragma omp atomic nthreads += 1; } // getting parameters int iSize = (int)parser("-size").asDouble(1.e8); int iIteration = (int)parser("-iterations").asDouble(10.); // running benchmarks double * timeHOI = new double[iIteration]; map<string, vector<double> > peakPerformance; double * s = new double; // initialize value for the polynomial evaluation s[0] = 1e-6; // run the benchmark iIteration times for (int i=0; i<iIteration; i++) { #pragma omp parallel { ComputePower(s,iSize); } } for (int i=0; i<iIteration; i++) { //double * s = new double; //double s; timeHOI[i] = mysecond(); #pragma omp parallel { ComputePower(s,iSize); } timeHOI[i] = mysecond() - timeHOI[i]; } cout << "\nSummary\n"; // compute performance of each benchmark run for (int i=0; i<iIteration; i++) peakPerformance["HOI"].push_back(1.e-9*(double)iSize*4*2*8*nthreads / timeHOI[i]); sort(peakPerformance["HOI"].begin(), peakPerformance["HOI"].end()); // percentiles to be selected const int I1 = iIteration*.1; const int I2 = iIteration*.5; const int I3 = iIteration*.9; // output 10th, 50th, 90th percentiles cout << I1 << " Ranked Peak Performance\t" << I2 << " Ranked Peak Performance (median)\t" << I3 << " Ranked Peak Performance\n"; cout << peakPerformance["HOI"][I1] << " GFLOP/s\t" << peakPerformance["HOI"][I2] << " GFLOP/s\t" << peakPerformance["HOI"][I3] << " GFLOP/s\n"; }
int main() { int checktick(void); int quantum; int BytesPerWord; int k; ssize_t j; STREAM_TYPE scalar; double t, times[4][NTIMES]; /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); printf("STREAM version $Revision: 5.10 $\n"); printf(HLINE); BytesPerWord = sizeof(STREAM_TYPE); printf("This system uses %d bytes per array element.\n", BytesPerWord); printf(HLINE); #ifdef N printf("***** WARNING: ******\n"); printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); printf("***** WARNING: ******\n"); #endif printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); printf("Memory per array = %.1f MiB (= %.1f GiB).\n", BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0)); printf("Total memory required = %.1f MiB (= %.1f GiB).\n", (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.), (3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.)); printf("Each kernel will be executed %d times.\n", NTIMES); printf(" The *best* time for each kernel (excluding the first iteration)\n"); printf(" will be used to compute the reported bandwidth.\n"); #ifdef _OPENMP printf(HLINE); #pragma omp parallel { #pragma omp master { k = omp_get_num_threads(); printf ("Number of Threads requested = %i\n",k); } } #endif #ifdef _OPENMP k = 0; #pragma omp parallel #pragma omp atomic k++; printf ("Number of Threads counted = %i\n",k); #endif /* Get initial value for system clock. 
*/ #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } printf(HLINE); if ( (quantum = checktick()) >= 1) printf("Your clock granularity/precision appears to be " "%d microseconds.\n", quantum); else { printf("Your clock granularity appears to be " "less than one microsecond.\n"); quantum = 1; } t = mysecond(); #pragma omp parallel for for (j = 0; j < STREAM_ARRAY_SIZE; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); printf("Each test below will take on the order" " of %d microseconds.\n", (int) t ); printf(" (= %d clock ticks)\n", (int) (t/quantum) ); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); printf("WARNING -- The above is only a rough guideline.\n"); printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #ifdef TUNED tuned_STREAM_Copy(); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #ifdef TUNED tuned_STREAM_Scale(scalar); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #ifdef TUNED tuned_STREAM_Add(); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #ifdef TUNED tuned_STREAM_Triad(scalar); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = 
MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } printf("Function Best Rate MB/s Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%12.1f %11.6f %11.6f %11.6f\n", label[j], 1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]); } printf(HLINE); /* --- Check Results --- */ checkSTREAMresults(); printf(HLINE); return 0; }
int main(int argc,char *argv[]) { PetscErrorCode ierr; int quantum, checktick(); int BytesPerWord; int j, k; double scalar=3.0, t, times[4][NTIMES]; PetscInitialize(&argc,&argv,0,help); /* --- SETUP --- determine precision and check timing --- */ /*printf(HLINE); printf("STREAM version $Revision: 5.9 $\n"); printf(HLINE); */ BytesPerWord = sizeof(double); printf("This system uses %d bytes per DOUBLE PRECISION word.\n",BytesPerWord); printf(HLINE); #if defined(NO_LONG_LONG) printf("Array size = %d, Offset = %d\n", N, OFFSET); #else printf("Array size = %llu, Offset = %d\n", (unsigned long long) N, OFFSET); #endif printf("Total memory required = %.1f MB.\n",(3.0 * BytesPerWord) * ((double)N / 1048576.0)); printf("Each test is run %d times, but only\n", NTIMES); printf("the *best* time for each is used.\n"); printf(HLINE); #if !STATIC_ALLOC a = malloc((N+OFFSET)*sizeof(double)); b = malloc((N+OFFSET)*sizeof(double)); c = malloc((N+OFFSET)*sizeof(double)); #endif #if WITH_PTHREADS ierr = PetscThreadCommGetNThreads(PETSC_COMM_WORLD,&nworkThreads);CHKERRQ(ierr); ierr = PetscMalloc((nworkThreads+1)*sizeof(PetscInt),&trstarts);CHKERRQ(ierr); PetscInt Q,R,nloc; PetscBool S; Q = (N+OFFSET)/nworkThreads; R = (N+OFFSET) - Q*nworkThreads; trstarts[0] = 0; for (j=0; j < nworkThreads; j++) { S = (PetscBool)(j < R); nloc = S ? Q+1 : Q; trstarts[j+1] = trstarts[j]+nloc; } ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Initialize_Kernel,1,&scalar);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); # else for (j=0; j<N; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } #endif /*printf(HLINE);*/ /* Get initial value for system clock. 
*/ if ((quantum = checktick()) >= 1) ; /* printf("Your clock granularity/precision appears to be %d microseconds.\n", quantum); */ else quantum = 1; /* printf("Your clock granularity appears to be less than one microsecond.\n"); */ t = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_2A_Kernel,0);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; #endif t = 1.0E6 * (mysecond() - t); /* printf("Each test below will take on the order of %d microseconds.\n", (int)t); printf(" (= %d clock ticks)\n", (int) (t/quantum)); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); */ /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Copy_Kernel,0);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j=0; j<N; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Scale_Kernel,1,&scalar);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j=0; j<N; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Add_Kernel,0);CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j=0; j<N; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #if WITH_PTHREADS ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)tuned_STREAM_Triad_Kernel,1,&scalar);CHKERRQ(ierr); ierr = 
PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #else for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } printf("Function Rate (MB/s) \n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%s%11.4f \n", label[j], 1.0E-06 * bytes[j]/mintime[j]); } /* printf(HLINE);*/ #if WITH_PTHREADS ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);CHKERRQ(ierr); #endif /* --- Check Results --- */ checkSTREAMresults(); /* printf(HLINE);*/ PetscFinalize(); return 0; }
int main() { int quantum, checktick(); int BytesPerWord; register int j, k; double scalar, t, times[4][NTIMES]; /* --- SETUP --- determine precision and check timing --- */ printf(HLINE); BytesPerWord = sizeof(double); printf("This system uses %d bytes per DOUBLE PRECISION word.\n", BytesPerWord); printf(HLINE); printf("Array size = %d, Offset = %d\n" , N, OFFSET); printf("Total memory required = %.1f MB.\n", (3.0 * BytesPerWord) * ( (double) N / 1048576.0)); printf("Each test is run %d times, but only\n", NTIMES); printf("the *best* time for each is used.\n"); #ifdef _OPENMP printf(HLINE); #pragma omp parallel private(k) { // k = omp_get_num_threads(); // printf ("Number of Threads requested = %i\n",k); } #endif /* Get initial value for system clock. */ #pragma omp parallel for for (j=0; j<N; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } printf(HLINE); if ( (quantum = checktick()) >= 1) { // printf("Your clock granularity/precision appears to be " // "%d microseconds.\n", quantum); } else { // printf("Your clock granularity appears to be " // "less than one microsecond.\n"); } t = mysecond(); #pragma omp parallel for for (j = 0; j < N; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); // printf("Each test below will take on the order" // " of %d microseconds.\n", (int) t ); // printf(" (= %d clock ticks)\n", (int) (t/quantum) ); printf("Increase the size of the arrays if this shows that\n"); printf("you are not getting at least 20 clock ticks per test.\n"); printf(HLINE); printf("WARNING -- The above is only a rough guideline.\n"); printf("For best results, please be sure you know the\n"); printf("precision of your system timer.\n"); printf(HLINE); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #ifdef TUNED tuned_STREAM_Copy(); #else #pragma omp parallel for for (j=0; j<N; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #ifdef TUNED 
tuned_STREAM_Scale(scalar); #else #pragma omp parallel for for (j=0; j<N; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #ifdef TUNED tuned_STREAM_Add(); #else #pragma omp parallel for for (j=0; j<N; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #ifdef TUNED tuned_STREAM_Triad(scalar); #else #pragma omp parallel for for (j=0; j<N; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } printf("Function Rate (MB/s) Avg time Min time Max time\n"); for (j=0; j<4; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); /* printf("%s%11.4f %11.4f %11.4f %11.4f\n", label[j], 1.0E-06 * bytes[j]/mintime[j], avgtime[j], mintime[j], maxtime[j]);*/ } printf(HLINE); /* --- Check Results --- */ checkSTREAMresults(); printf(HLINE); return 0; }
/*
 * Stop the finish/SPMD timer.
 *
 * Takes the difference between the current mysecond() reading and
 * finish_spmd_start, divides it by 1000000, multiplies by 1000 and adds the
 * result to finish_spmd_duration.  The accumulated value is meant to be in
 * milliseconds; this presumes mysecond() reports microseconds in this file
 * -- TODO confirm against its definition.
 */
void end_finish_spmd_timer()
{
    const double elapsed = (double)(mysecond() - finish_spmd_start);

    finish_spmd_duration += (elapsed / 1000000) * 1000; /* msec */
}
/*
 * Stop the timer: replace `secs` (a variable declared outside this function)
 * with the current mysecond() reading minus its previous value.  When `secs`
 * held a start reading taken by start_time(), it holds the elapsed time
 * afterwards, in whatever unit mysecond() reports.
 */
void end_time() { secs = mysecond() - secs; }
int main(int argc,char** argv) { int myrank=0,nprocs=1; int latsize[4],localsize[4]; int netSize[16],netPos[16],netDim; int i,j,t,npIn,nsite; int Niter = QCD_NITER; QCDSpinor* pSrc; QCDSpinor* pDest; QCDMatrix* pGauge; QCDReal Enorm = QCD_ENORM; QCDReal Cks = QCD_CKS; QCDReal* pCorr; double tstart,tend,ttotal; char* pStr; int ItimeS,NtimeS,ics,ids,is,ie,ipet,it,Nconv,cnt; double CorrF,Diff,rr; unsigned long flops; double tt; latsize[0] = 0; latsize[1] = 0; latsize[2] = 0; latsize[3] = 0; netDim = 4; netSize[0] = 0; netSize[1] = 0; netSize[2] = 0; netSize[3] = 0; for(i=1;i<argc;i++){ if(argv[i][0] == 'L'){ t = 0; for(j=1;j<strlen(argv[i]);j++){ if(argv[i][j] == 'x'){ t++; } else if(argv[i][j] >= '0' && argv[i][j] <= '9'){ latsize[t] = 10*latsize[t] + (int)(argv[i][j] - '0'); } } } else if(argv[i][0] == 'P'){ t = 0; for(j=1;j<strlen(argv[i]);j++){ if(argv[i][j] == 'x'){ t++; } else if(argv[i][j] >= '0' && argv[i][j] <= '9'){ netSize[t] = 10*netSize[t] + (int)(argv[i][j] - '0'); } } } } t = 0; for(i=0;i<4;i++){ if(latsize[0] == 0){ t++; } } if(t > 0){ latsize[0] = QCD_NX; latsize[1] = QCD_NY; latsize[2] = QCD_NZ; latsize[3] = QCD_NT; } MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD,&nprocs); MPI_Comm_rank(MPI_COMM_WORLD,&myrank); npIn = 1; for(i=0;i<4;i++){ npIn *= netSize[i]; //debug /* printf("netSize[%d] == %d\n", i, netSize[i]); */ } if(npIn != nprocs){ if(myrank == 0){ printf("Number of processes is invalid\n"); } return 0; } nsite = 1; for(i=0;i<4;i++){ localsize[i] = latsize[i] / netSize[i]; nsite *= localsize[i]; } t = myrank; for(i=0;i<4;i++){ netPos[i] = t % netSize[i]; t /= netSize[i]; } QCDDopr_Init(localsize[0],localsize[1],localsize[2],localsize[3],netSize[0],netSize[1],netSize[2],netSize[3],myrank); if(myrank == 0){ printf("=============================================\n"); printf("QCD base MPI program\n"); printf(" Lattice size = %dx%dx%dx%d\n",latsize[0],latsize[1],latsize[2],latsize[3]); printf("Decomposed by %d procs : 
%dx%dx%dx%d\n",nprocs,netSize[0],netSize[1],netSize[2],netSize[3]); printf(" Local Lattice size = %dx%dx%dx%d\n",localsize[0],localsize[1],localsize[2],localsize[3]); printf("\n Cks = %f\n",Cks); printf("=============================================\n"); } //debug /* printf("xxx\n"); */ pGauge = (QCDMatrix*)malloc(sizeof(QCDMatrix) * 4 * nsite + 512); uinit((double*)pGauge,latsize[0],latsize[1],latsize[2],latsize[3]); //debug /* printf("xxx\n"); */ pSrc = (QCDSpinor*)malloc(sizeof(QCDSpinor) * nsite + 128); pDest = (QCDSpinor*)malloc(sizeof(QCDSpinor) * nsite + 128); pCorr = (QCDReal*)malloc(sizeof(QCDReal) * latsize[3]); for(i=0;i<latsize[3];i++){ pCorr[i] = 0.0; } ttotal = 0.0; /* for(ics=0;ics<QCD_NCOL;ics++){ */ /* for(ids=0;ids<QCD_ND;ids++){ */ for(ics=0;ics<1;ics++){ for(ids=0;ids<1;ids++){ set_src(ids,ics,pSrc,0); MPI_Barrier(MPI_COMM_WORLD); tstart = mysecond(); Solve_CG(pDest,pGauge,pSrc,Cks,Enorm,&Nconv,&Diff); MPI_Barrier(MPI_COMM_WORLD); tend = mysecond() - tstart; ttotal += tend; if(myrank == 0){ printf(" %3d %3d %6d %12.4e ... %f sec\n", ics, ids, Nconv, Diff,tend); } for(i=0;i<latsize[3];i++){ ipet = i/localsize[3]; it = i % localsize[3]; if(ipet == netPos[3]){ is = it*localsize[0]*localsize[1]*localsize[2]; QCDLA_Norm(&CorrF,pDest + is,localsize[0]*localsize[1]*localsize[2]); } else{ CorrF = 0.0; } MPI_Allreduce(&CorrF,&rr,1,MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD); pCorr[i] = pCorr[i] + rr; } } } if(myrank == 0){ printf("\nPs meson correlator:\n"); for(i=0;i<latsize[3];i++){ printf("%d: %0.8E\n",i,pCorr[i]); } printf("\n Avg. Solver Time = %f [sec]\n",ttotal / 12); } MPI_Barrier(MPI_COMM_WORLD); //debug /* printf("finish\n"); */ return 0; }
/*
 * Start the timer: store the current mysecond() reading in `secs`
 * (a variable declared outside this function).
 */
void start_time() { secs = mysecond(); }
int main() { int quantum, checktick(); int BytesPerWord; int k; ssize_t j; STREAM_TYPE scalar; double t, times[4][NTIMES]; /* --- SETUP --- determine precision and check timing --- */ BytesPerWord = sizeof(STREAM_TYPE); #ifdef _OPENMP #pragma omp parallel { #pragma omp master { k = omp_get_num_threads(); } } #endif #ifdef _OPENMP k = 0; #pragma omp parallel #pragma omp atomic k++; #endif /* Get initial value for system clock. */ #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) { a[j] = 1.0; b[j] = 2.0; c[j] = 0.0; } if ( (quantum = checktick()) >= 1) printf(""); else { quantum = 1; } t = mysecond(); #pragma omp parallel for for (j = 0; j < STREAM_ARRAY_SIZE; j++) a[j] = 2.0E0 * a[j]; t = 1.0E6 * (mysecond() - t); /* --- MAIN LOOP --- repeat test cases NTIMES times --- */ scalar = 3.0; for (k=0; k<NTIMES; k++) { times[0][k] = mysecond(); #ifdef TUNED tuned_STREAM_Copy(); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) c[j] = a[j]; #endif times[0][k] = mysecond() - times[0][k]; times[1][k] = mysecond(); #ifdef TUNED tuned_STREAM_Scale(scalar); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) b[j] = scalar*c[j]; #endif times[1][k] = mysecond() - times[1][k]; times[2][k] = mysecond(); #ifdef TUNED tuned_STREAM_Add(); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) c[j] = a[j]+b[j]; #endif times[2][k] = mysecond() - times[2][k]; times[3][k] = mysecond(); #ifdef TUNED tuned_STREAM_Triad(scalar); #else #pragma omp parallel for for (j=0; j<STREAM_ARRAY_SIZE; j++) a[j] = b[j]+scalar*c[j]; #endif times[3][k] = mysecond() - times[3][k]; } /* --- SUMMARY --- */ for (k=1; k<NTIMES; k++) /* note -- skip first iteration */ { for (j=0; j<4; j++) { avgtime[j] = avgtime[j] + times[j][k]; mintime[j] = MIN(mintime[j], times[j][k]); maxtime[j] = MAX(maxtime[j], times[j][k]); } } for (j=0; j<1; j++) { avgtime[j] = avgtime[j]/(double)(NTIMES-1); printf("%11.6f\n", 1.0E-06 * bytes[j]/mintime[j]); } /* --- Check Results 
--- */ checkSTREAMresults(); return 0; }