int main( int argc, char **argv ) { initCommunication( &argc, &argv ); // make up a simple test int size = read_int( argc, argv, "-s", 8 ); int r = read_int( argc, argv, "-r", 2 ); int P; MPI_Comm_size( MPI_COMM_WORLD, &P ); initSizes( P, r, size ); if( getRank() == 0 ) { if( P > (1<<r) ) printf("Need more recursive steps for this many processors\n"); if( P > (size/(1<<r))*(size/(1<<r)+1)/2) printf("Need a bigger matrix/fewer recursive steps for this many processors\n"); printf("-s %d -r %d -n %d\n", size, r, P); } int sizeSq = getSizeSq(r,P); int sizeTri = getSizeTri(r,P); double *X = (double*) malloc( sizeSq*sizeof(double) ); srand48(getRank()); fill(X,sizeSq); double *A = (double*) malloc( sizeTri*sizeof(double) ); if( getRank() == 0 ) printf("Generating a symmetric positive definite test matrix\n"); initTimers(); MPI_Barrier( MPI_COMM_WORLD ); double st2 = read_timer(); syrk( A, X, size, P, r, 0. ); MPI_Barrier( MPI_COMM_WORLD ); double et2 = read_timer(); if( getRank() == 0 ) printf("Generation time: %f\n", et2-st2); initTimers(); free(X); for( int i = 0; i < sizeTri; i++ ) A[i] = -A[i]; if( getRank() == 0 ) printf("Starting benchmark\n"); MPI_Barrier( MPI_COMM_WORLD ); double startTime = read_timer(); chol( A, size, P, r ); MPI_Barrier( MPI_COMM_WORLD ); double endTime = read_timer(); if( getRank() == 0 ) printf("Time: %f Gflop/s %f\n", endTime-startTime, size*1.*size*size/3./(endTime-startTime)/1.e9); free(A); printCounters(size); MPI_Finalize(); }
//TODO: Optimize this. How about this? define a inner clss Counter and //use a hashmap<TaskType,Counters> to unify the counters update void StateMachine::updateCounters( TaskType taskType, Status oldStatus, Status newStatus) { if ((oldStatus == Status::STAGING || oldStatus == Status::STARTING) && newStatus == Status::RUNNING) { switch (taskType) { case TaskType::MON: monRunningNum++; monStagingNum--; break; case TaskType::OSD: osdRunningNum++; osdStagingNum--; break; case TaskType::RADOSGW: radosgwRunningNum++; radosgwStagingNum--; break; } } if (oldStatus == Status::STAGING && newStatus == Status::FAILED) { switch (taskType) { case TaskType::MON: monStagingNum--; break; case TaskType::OSD: osdStagingNum--; break; case TaskType::RADOSGW: radosgwStagingNum--; break; } } if (oldStatus == Status::RUNNING && newStatus == Status::FAILED) { switch (taskType) { case TaskType::MON: monRunningNum--; break; case TaskType::OSD: osdRunningNum--; break; case TaskType::RADOSGW: radosgwRunningNum--; break; } } printCounters(); }
int main( int argc, char **argv ) { int randomize = read_int( argc, argv, "--random", 0 ); initCommunicationAlt(&argc, &argv, randomize); initTimers(); int rank = getRank(); MatDescriptor desc; double *A, *B, *C; //if( rank == 0 ) { // decide what desc should be int size = read_int( argc, argv, "-s", -1 ); if( size == -1 ) { if( rank == 0 ) printf("Must specify the matrix size desired with -s\n"); MPI_Finalize(); exit(-1); } int bsReq = read_int( argc, argv, "-b", 1 ); int nrecReq = read_int( argc, argv, "-r", -1 ); int mem = read_int( argc, argv, "-m", 1536 ); // number of megabytes available char *pattern = read_string( argc, argv, "-p", NULL ); mem = read_int( argc, argv, "-k", mem*1024 ); // or kilobytes setAvailableMemory(mem*128); // convert kilobytes to doubles int log7nProcs = getLog7nProcs(); if( nrecReq == -1 ) { int rsize = size / MIN_STRASSEN; nrecReq = 0; while( rsize > 1 ) { nrecReq += 1; rsize /= 2; } nrecReq = max(log7nProcs, nrecReq); if( getRank() == 0 ) printf("Setting nrec=%d\n", nrecReq); } if( getRank() == 0 ) printf("Benchmarking size %d\n", size); desc.lda = size; desc.nrec = nrecReq; desc.bs = bsReq; desc.nprocr = 1; desc.nprocc = 1; while( log7nProcs >= 2 ) { log7nProcs -= 2; desc.nprocr *= SEVEN; desc.nprocc *= SEVEN; } if( log7nProcs == 1 ) desc.nprocr *= SEVEN; desc.nproc = desc.nprocr*desc.nprocc; desc.nproc_summa = getPFactor(); // allocate the initial matrices A = allocate( numEntriesPerProc(desc) ); B = allocate( numEntriesPerProc(desc) ); C = allocate( numEntriesPerProc(desc) ); // fill the matrices with random data fill( A, numEntriesPerProc(desc) ); fill( B, numEntriesPerProc(desc) ); MPI_Barrier( MPI_COMM_WORLD ); double startTime = read_timer(); multiply( A, B, C, desc, pattern ); MPI_Barrier( MPI_COMM_WORLD ); double endTime = read_timer(); deallocate( A, numEntriesPerProc(desc) ); deallocate( B, numEntriesPerProc(desc) ); printCounters(endTime-startTime, desc); MPI_Finalize(); }
void kmp_stats_output_module::outputStats(const char* heading) { statistic allStats[TIMER_LAST]; statistic allCounters[COUNTER_LAST]; // stop all the explicit timers for all threads windupExplicitTimers(); FILE * eventsOut; FILE * statsOut = outputFileName ? fopen (outputFileName, "a+") : stderr; if (eventPrintingEnabled()) { eventsOut = fopen(eventsFileName, "w+"); } if (!statsOut) statsOut = stderr; fprintf(statsOut, "%s\n",heading); // Accumulate across threads. kmp_stats_list::iterator it; for (it = __kmp_stats_list.begin(); it != __kmp_stats_list.end(); it++) { int t = (*it)->getGtid(); // Output per thread stats if requested. if (perThreadPrintingEnabled()) { fprintf (statsOut, "Thread %d\n", t); printStats(statsOut, (*it)->getTimers(), true); printCounters(statsOut, (*it)->getCounters()); fprintf(statsOut,"\n"); } // Output per thread events if requested. if (eventPrintingEnabled()) { kmp_stats_event_vector events = (*it)->getEventVector(); printEvents(eventsOut, &events, t); } for (int s = 0; s<TIMER_LAST; s++) { // See if we should ignore this timer when aggregating if ((timeStat::masterOnly(timer_e(s)) && (t != 0)) || // Timer is only valid on the master and this thread is a worker (timeStat::workerOnly(timer_e(s)) && (t == 0)) || // Timer is only valid on a worker and this thread is the master timeStat::synthesized(timer_e(s)) // It's a synthesized stat, so there's no raw data for it. ) { continue; } statistic * threadStat = (*it)->getTimer(timer_e(s)); allStats[s] += *threadStat; } // Special handling for synthesized statistics. // These just have to be coded specially here for now. // At present we only have a few: // The total parallel work done in each thread. // The variance here makes it easy to see load imbalance over the whole program (though, of course, // it's possible to have a code with awful load balance in every parallel region but perfect load // balance oever the whole program.) // The time spent in barriers in each thread. allStats[TIMER_Total_work].addSample ((*it)->getTimer(TIMER_OMP_work)->getTotal()); // Time in explicit barriers. allStats[TIMER_Total_barrier].addSample ((*it)->getTimer(TIMER_OMP_barrier)->getTotal()); for (int c = 0; c<COUNTER_LAST; c++) { if (counter::masterOnly(counter_e(c)) && t != 0) continue; allCounters[c].addSample ((*it)->getCounter(counter_e(c))->getValue()); } } if (eventPrintingEnabled()) { printPloticusFile(); fclose(eventsOut); } fprintf (statsOut, "Aggregate for all threads\n"); printStats (statsOut, &allStats[0], true); fprintf (statsOut, "\n"); printStats (statsOut, &allCounters[0], false); if (statsOut != stderr) fclose(statsOut); }
void kmp_stats_output_module::outputStats(const char *heading) { // Stop all the explicit timers in all threads // Do this before declaring the local statistics because thay have // constructors so will take time to create. windupExplicitTimers(); statistic allStats[TIMER_LAST]; statistic totalStats[TIMER_LAST]; /* Synthesized, cross threads versions of normal timer stats */ statistic allCounters[COUNTER_LAST]; FILE *statsOut = !outputFileName.empty() ? fopen(outputFileName.c_str(), "a+") : stderr; if (!statsOut) statsOut = stderr; FILE *eventsOut; if (eventPrintingEnabled()) { eventsOut = fopen(eventsFileName, "w+"); } printHeaderInfo(statsOut); fprintf(statsOut, "%s\n", heading); // Accumulate across threads. kmp_stats_list::iterator it; for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); it++) { int t = (*it)->getGtid(); // Output per thread stats if requested. if (printPerThreadFlag) { fprintf(statsOut, "Thread %d\n", t); printTimerStats(statsOut, (*it)->getTimers(), 0); printCounters(statsOut, (*it)->getCounters()); fprintf(statsOut, "\n"); } // Output per thread events if requested. if (eventPrintingEnabled()) { kmp_stats_event_vector events = (*it)->getEventVector(); printEvents(eventsOut, &events, t); } // Accumulate timers. for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) { // See if we should ignore this timer when aggregating if ((timeStat::masterOnly(s) && (t != 0)) || // Timer only valid on master // and this thread is worker (timeStat::workerOnly(s) && (t == 0)) // Timer only valid on worker // and this thread is the master ) { continue; } statistic *threadStat = (*it)->getTimer(s); allStats[s] += *threadStat; // Add Total stats for timers that are valid in more than one thread if (!timeStat::noTotal(s)) totalStats[s].addSample(threadStat->getTotal()); } // Accumulate counters. for (counter_e c = counter_e(0); c < COUNTER_LAST; c = counter_e(c + 1)) { if (counter::masterOnly(c) && t != 0) continue; allCounters[c].addSample((*it)->getCounter(c)->getValue()); } } if (eventPrintingEnabled()) { printPloticusFile(); fclose(eventsOut); } fprintf(statsOut, "Aggregate for all threads\n"); printTimerStats(statsOut, &allStats[0], &totalStats[0]); fprintf(statsOut, "\n"); printCounterStats(statsOut, &allCounters[0]); if (statsOut != stderr) fclose(statsOut); }