Esempio n. 1
0
int main( int argc, char **argv ) {

  initCommunication( &argc, &argv );
  
  // make up a simple test
  int size = read_int( argc, argv, "-s", 8 );
  int r = read_int( argc, argv, "-r", 2 );
  int P;
  MPI_Comm_size( MPI_COMM_WORLD, &P );
  initSizes( P, r, size );
  if( getRank() == 0 ) {
    if( P > (1<<r) )
      printf("Need more recursive steps for this many processors\n");
    if( P > (size/(1<<r))*(size/(1<<r)+1)/2)
      printf("Need a bigger matrix/fewer recursive steps for this many processors\n");
    printf("-s %d -r %d -n %d\n", size, r, P);
  }
  int sizeSq = getSizeSq(r,P);
  int sizeTri = getSizeTri(r,P);
  double *X = (double*) malloc( sizeSq*sizeof(double) );
  srand48(getRank());
  fill(X,sizeSq);
  double *A = (double*) malloc( sizeTri*sizeof(double) );
  if( getRank() == 0 )
    printf("Generating a symmetric positive definite test matrix\n");
  initTimers();
  MPI_Barrier( MPI_COMM_WORLD );
  double st2 = read_timer();
  syrk( A, X, size, P, r, 0. );
  MPI_Barrier( MPI_COMM_WORLD );
  double et2 = read_timer();
  if( getRank() == 0 )
    printf("Generation time: %f\n", et2-st2);
  initTimers();
  free(X);
  for( int i = 0; i < sizeTri; i++ )
    A[i] = -A[i];

  if( getRank() == 0 )
    printf("Starting benchmark\n");
  MPI_Barrier( MPI_COMM_WORLD );
  double startTime = read_timer();
  chol( A, size, P, r );
  MPI_Barrier( MPI_COMM_WORLD );
  double endTime = read_timer();
  
  if( getRank() == 0 )
    printf("Time: %f Gflop/s %f\n", endTime-startTime, size*1.*size*size/3./(endTime-startTime)/1.e9);

  free(A);
  printCounters(size);
  MPI_Finalize();
}
Esempio n. 2
0
//TODO: Optimize this. One option: define an inner class Counter and
//use a hashmap<TaskType,Counter> to unify the counter updates.
// Adjusts the per-task-type running/staging counters for one status
// transition of a task, then prints the counters.
//
// Transitions that change the counters:
//   STAGING or STARTING -> RUNNING : running count +1, staging count -1
//   STAGING             -> FAILED  : staging count -1
//   RUNNING             -> FAILED  : running count -1
// Every other transition, and every task type other than MON, OSD and
// RADOSGW, leaves the counters unchanged. printCounters() is always called.
void StateMachine::updateCounters(
    TaskType taskType,
    Status oldStatus,
    Status newStatus)
{
  const bool nowRunning = newStatus == Status::RUNNING;
  const bool nowFailed = newStatus == Status::FAILED;

  // The three handled transitions are mutually exclusive.
  const bool startedRunning = nowRunning
      && (oldStatus == Status::STAGING || oldStatus == Status::STARTING);
  const bool failedWhileStaging = nowFailed && oldStatus == Status::STAGING;
  const bool failedWhileRunning = nowFailed && oldStatus == Status::RUNNING;

  // A task leaves the staging count when it starts running or fails there.
  const bool leftStaging = startedRunning || failedWhileStaging;

  switch (taskType) {
    case TaskType::MON:
      if (startedRunning) {
        monRunningNum++;
      }
      if (leftStaging) {
        monStagingNum--;
      }
      if (failedWhileRunning) {
        monRunningNum--;
      }
      break;
    case TaskType::OSD:
      if (startedRunning) {
        osdRunningNum++;
      }
      if (leftStaging) {
        osdStagingNum--;
      }
      if (failedWhileRunning) {
        osdRunningNum--;
      }
      break;
    case TaskType::RADOSGW:
      if (startedRunning) {
        radosgwRunningNum++;
      }
      if (leftStaging) {
        radosgwStagingNum--;
      }
      if (failedWhileRunning) {
        radosgwRunningNum--;
      }
      break;
  }

  printCounters();
}
Esempio n. 3
0
// Benchmark driver for the parallel matrix multiplication multiply().
//
// Command line options (parsed with read_int / read_string):
//   --random <n>  forwarded to initCommunicationAlt (default 0)
//   -s <n>        matrix size; mandatory
//   -b <n>        stored in desc.bs (default 1)
//   -r <n>        stored in desc.nrec; derived from the size when omitted
//   -m <n>        available memory in megabytes (default 1536)
//   -k <n>        available memory in kilobytes; overrides -m when given
//   -p <str>      pattern string forwarded to multiply() (default NULL)
//
// Allocates A, B and C, fills A and B, times multiply() between two
// barriers, and hands the elapsed time to printCounters().
int main( int argc, char **argv ) {
  int randomize = read_int( argc, argv, "--random", 0 );
  initCommunicationAlt(&argc, &argv, randomize);
  initTimers();

  int rank = getRank();

  MatDescriptor desc;
  double *A, *B, *C;

  // Every rank parses the command line itself; only rank 0 prints.
  int size = read_int( argc, argv, "-s", -1 );
  if( size == -1 ) {
    if( rank == 0 )
      printf("Must specify the matrix size desired with -s\n");
    MPI_Finalize();
    exit(-1);      
  }
  
  int bsReq = read_int( argc, argv, "-b", 1 );
  int nrecReq = read_int( argc, argv, "-r", -1 );
  int mem = read_int( argc, argv, "-m", 1536 ); // number of megabytes available
  char *pattern = read_string( argc, argv, "-p", NULL );
  mem = read_int( argc, argv, "-k", mem*1024 ); // or kilobytes
  setAvailableMemory(mem*128); // convert kilobytes to doubles
  
  int log7nProcs = getLog7nProcs();
  if( nrecReq == -1 ) {
    // No recursion depth requested: count how many times size/MIN_STRASSEN
    // can be halved before reaching 1, but use at least log7nProcs levels.
    int rsize = size / MIN_STRASSEN;
    nrecReq = 0;
    while( rsize > 1 ) {
      nrecReq += 1;
      rsize /= 2;
    }
    nrecReq = max(log7nProcs, nrecReq);
    if( rank == 0 )
      printf("Setting nrec=%d\n", nrecReq);
  }
  if( rank == 0 )
    printf("Benchmarking size %d\n", size);
  desc.lda = size;
  desc.nrec = nrecReq;
  desc.bs = bsReq;

  // Build the processor grid: every two units of log7nProcs multiply both
  // dimensions by SEVEN; a leftover single unit goes to the row dimension.
  desc.nprocr = 1;
  desc.nprocc = 1;
  while( log7nProcs >= 2 ) {
    log7nProcs -= 2;
    desc.nprocr *= SEVEN;
    desc.nprocc *= SEVEN;
  }
  if( log7nProcs == 1 )
    desc.nprocr *= SEVEN;
  desc.nproc = desc.nprocr*desc.nprocc;
  desc.nproc_summa = getPFactor();

  // allocate the initial matrices
  A = allocate( numEntriesPerProc(desc) );
  B = allocate( numEntriesPerProc(desc) );
  C = allocate( numEntriesPerProc(desc) );

  // fill the matrices with random data
  fill( A, numEntriesPerProc(desc) );
  fill( B, numEntriesPerProc(desc) );


  // Barriers on both sides so every rank is timed over the same interval.
  MPI_Barrier( MPI_COMM_WORLD );
  double startTime = read_timer();
  multiply( A, B, C, desc, pattern );
  MPI_Barrier( MPI_COMM_WORLD );
  double endTime = read_timer();

  // Release all three matrices; the result C is not used after the timing.
  deallocate( A, numEntriesPerProc(desc) );
  deallocate( B, numEntriesPerProc(desc) );
  deallocate( C, numEntriesPerProc(desc) );

  printCounters(endTime-startTime, desc);
  MPI_Finalize();
}
Esempio n. 4
0
// Prints the collected statistics under the given heading.
//
// Output goes to outputFileName (opened in append mode) when it is set and
// can be opened, otherwise to stderr. For every entry of __kmp_stats_list the
// timers and counters are optionally printed per thread and then folded into
// aggregate statistics, which are printed at the end. When event printing is
// enabled, per-thread events are written to eventsFileName; if that file
// cannot be opened a warning is printed and the events are skipped.
//
// heading: text printed on the first line of the statistics output.
void kmp_stats_output_module::outputStats(const char* heading) 
{
    statistic allStats[TIMER_LAST];
    statistic allCounters[COUNTER_LAST];

    // stop all the explicit timers for all threads
    windupExplicitTimers();

    FILE * statsOut = outputFileName ? fopen (outputFileName, "a+") : stderr;
    // Fall back to stderr if the requested statistics file cannot be opened.
    if (!statsOut)
        statsOut = stderr;

    // Stays NULL when event printing is disabled or the file cannot be
    // opened; every later use is guarded by a NULL check.
    FILE * eventsOut = NULL;
    if (eventPrintingEnabled()) {
        eventsOut = fopen(eventsFileName, "w+");
        if (!eventsOut)
            fprintf(stderr, "Warning: could not open the events file for writing; events will not be printed\n");
    }

    fprintf(statsOut, "%s\n",heading);
    // Accumulate across threads.
    kmp_stats_list::iterator it;
    for (it = __kmp_stats_list.begin(); it != __kmp_stats_list.end(); ++it) {
        int t = (*it)->getGtid();
        // Output per thread stats if requested.
        if (perThreadPrintingEnabled()) {
            fprintf (statsOut, "Thread %d\n", t);
            printStats(statsOut, (*it)->getTimers(), true);
            printCounters(statsOut, (*it)->getCounters());
            fprintf(statsOut,"\n");
        }
        // Output per thread events if requested and the events file is open.
        if (eventsOut) {
            kmp_stats_event_vector events = (*it)->getEventVector();
            printEvents(eventsOut, &events, t);
        }

        for (int s = 0; s<TIMER_LAST; s++) {
            // See if we should ignore this timer when aggregating
            if ((timeStat::masterOnly(timer_e(s)) && (t != 0)) || // Timer is only valid on the master and this thread is a worker
                (timeStat::workerOnly(timer_e(s)) && (t == 0)) || // Timer is only valid on a worker and this thread is the master
                timeStat::synthesized(timer_e(s))                 // It's a synthesized stat, so there's no raw data for it.
               )            
            {
                continue;
            }

            statistic * threadStat = (*it)->getTimer(timer_e(s));
            allStats[s] += *threadStat;
        }

        // Special handling for synthesized statistics.
        // These just have to be coded specially here for now. 
        // At present we only have a few: 
        // The total parallel work done in each thread.
        // The variance here makes it easy to see load imbalance over the whole program (though, of course,
        // it's possible to have a code with awful load balance in every parallel region but perfect load
        // balance over the whole program.)
        // The time spent in barriers in each thread.
        allStats[TIMER_Total_work].addSample ((*it)->getTimer(TIMER_OMP_work)->getTotal());

        // Time in explicit barriers.
        allStats[TIMER_Total_barrier].addSample ((*it)->getTimer(TIMER_OMP_barrier)->getTotal());

        for (int c = 0; c<COUNTER_LAST; c++) {
            // Counters that are only valid on the master are skipped for workers.
            if (counter::masterOnly(counter_e(c)) && t != 0)
                continue;
            allCounters[c].addSample ((*it)->getCounter(counter_e(c))->getValue());
        }
    }

    if (eventPrintingEnabled()) {
        printPloticusFile();
    }
    // Close the events file only if it was actually opened.
    if (eventsOut)
        fclose(eventsOut);

    fprintf (statsOut, "Aggregate for all threads\n");
    printStats (statsOut, &allStats[0], true);
    fprintf (statsOut, "\n");
    printStats (statsOut, &allCounters[0], false);

    // stderr is not ours to close.
    if (statsOut != stderr)
        fclose(statsOut);

}
Esempio n. 5
0
// Prints the collected statistics under the given heading.
//
// Output goes to outputFileName (opened in append mode) when it is non-empty
// and can be opened, otherwise to stderr. For every entry of __kmp_stats_list
// the timers and counters are optionally printed per thread and then folded
// into aggregate statistics, which are printed at the end. When event
// printing is enabled, per-thread events are written to eventsFileName; if
// that file cannot be opened a warning is printed and the events are skipped.
//
// heading: text printed after the header info, before the statistics.
void kmp_stats_output_module::outputStats(const char *heading) {
  // Stop all the explicit timers in all threads
  // Do this before declaring the local statistics because they have
  // constructors so will take time to create.
  windupExplicitTimers();

  statistic allStats[TIMER_LAST];
  statistic totalStats[TIMER_LAST]; /* Synthesized, cross threads versions of
                                       normal timer stats */
  statistic allCounters[COUNTER_LAST];

  FILE *statsOut =
      !outputFileName.empty() ? fopen(outputFileName.c_str(), "a+") : stderr;
  // Fall back to stderr if the requested statistics file cannot be opened.
  if (!statsOut)
    statsOut = stderr;

  // Stays NULL when event printing is disabled or the file cannot be opened;
  // every later use is guarded by a NULL check.
  FILE *eventsOut = NULL;
  if (eventPrintingEnabled()) {
    eventsOut = fopen(eventsFileName, "w+");
    if (!eventsOut)
      fprintf(stderr, "Warning: could not open the events file for writing; "
                      "events will not be printed\n");
  }

  printHeaderInfo(statsOut);
  fprintf(statsOut, "%s\n", heading);
  // Accumulate across threads.
  kmp_stats_list::iterator it;
  for (it = __kmp_stats_list->begin(); it != __kmp_stats_list->end(); ++it) {
    int t = (*it)->getGtid();
    // Output per thread stats if requested.
    if (printPerThreadFlag) {
      fprintf(statsOut, "Thread %d\n", t);
      printTimerStats(statsOut, (*it)->getTimers(), 0);
      printCounters(statsOut, (*it)->getCounters());
      fprintf(statsOut, "\n");
    }
    // Output per thread events if requested and the events file is open.
    if (eventsOut) {
      kmp_stats_event_vector events = (*it)->getEventVector();
      printEvents(eventsOut, &events, t);
    }

    // Accumulate timers.
    for (timer_e s = timer_e(0); s < TIMER_LAST; s = timer_e(s + 1)) {
      // See if we should ignore this timer when aggregating
      if ((timeStat::masterOnly(s) && (t != 0)) || // Timer only valid on master
          // and this thread is worker
          (timeStat::workerOnly(s) && (t == 0)) // Timer only valid on worker
          // and this thread is the master
          ) {
        continue;
      }

      statistic *threadStat = (*it)->getTimer(s);
      allStats[s] += *threadStat;

      // Add Total stats for timers that are valid in more than one thread
      if (!timeStat::noTotal(s))
        totalStats[s].addSample(threadStat->getTotal());
    }

    // Accumulate counters.
    for (counter_e c = counter_e(0); c < COUNTER_LAST; c = counter_e(c + 1)) {
      // Counters that are only valid on the master are skipped for workers.
      if (counter::masterOnly(c) && t != 0)
        continue;
      allCounters[c].addSample((*it)->getCounter(c)->getValue());
    }
  }

  if (eventPrintingEnabled()) {
    printPloticusFile();
  }
  // Close the events file only if it was actually opened.
  if (eventsOut)
    fclose(eventsOut);

  fprintf(statsOut, "Aggregate for all threads\n");
  printTimerStats(statsOut, &allStats[0], &totalStats[0]);
  fprintf(statsOut, "\n");
  printCounterStats(statsOut, &allCounters[0]);

  // stderr is not ours to close.
  if (statsOut != stderr)
    fclose(statsOut);
}