Example #1
File: main.c  Project: jons/life
int main (int argc, char **argv)
{
  int x, y, i = 0, k;
  grid_t a, b, *v, *w;

  init_grid(&a, SZ);
  init_grid(&b, SZ);

  // seed the initial live cells from the pattern array p
  // LIVE! LIVE, DAMN YOU
  for (x = 0; x < SZ; x++)
    for (y = 0; y < SZ; y++)
    {
      k = x * SZ + y;
      if (p[i] == k)
      {
        wcell(&a, x, y, 1);
        i++;
        if (i == PSZ) i = 0;
      }
    }

  // flip-flop next/current grid
  v = &a; w = &b; x = 0;
  for (;;) {
    draw(v);
    //
    if (v->i && !v->dt) break;
    //
    if (wait()) break;
    // compute the next generation into w from the current grid v
    next_grid(w, v);
    //
    if (x) { v = &a; w = &b; x = 0; }
    else   { v = &b; w = &a; x = 1; }
  }

  free_grid(&a);
  free_grid(&b);
  return 0;
}
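
The loop above alternates between two grid_t buffers: v always points at the grid being drawn and w at the one being computed, and the roles are swapped after every generation. Below is a minimal, self-contained sketch of that same flip-flop idea. The grid layout, the Conway rules in step(), and the glider seed are stand-ins chosen for illustration; they are not the actual grid_t, next_grid, or seed pattern p from jons/life.

/*
 * Minimal sketch (not the jons/life implementation) of the flip-flop
 * pattern: two fixed-size grids, one "current" and one "next",
 * swapped by pointer after every generation.
 */
#include <stdio.h>
#include <string.h>

#define SZ 8

typedef struct { unsigned char cell[SZ][SZ]; } grid_t;

/* Count the eight neighbors of (x, y), treating out-of-range cells as dead. */
static int neighbors(const grid_t *g, int x, int y)
{
  int dx, dy, n = 0;
  for (dx = -1; dx <= 1; dx++)
    for (dy = -1; dy <= 1; dy++) {
      int nx = x + dx, ny = y + dy;
      if ((dx || dy) && nx >= 0 && nx < SZ && ny >= 0 && ny < SZ)
        n += g->cell[nx][ny];
    }
  return n;
}

/* Write the next generation of src into dst (Conway's rules). */
static void step(grid_t *dst, const grid_t *src)
{
  int x, y;
  for (x = 0; x < SZ; x++)
    for (y = 0; y < SZ; y++) {
      int n = neighbors(src, x, y);
      dst->cell[x][y] = src->cell[x][y] ? (n == 2 || n == 3) : (n == 3);
    }
}

static void draw(const grid_t *g)
{
  int x, y;
  for (x = 0; x < SZ; x++) {
    for (y = 0; y < SZ; y++)
      putchar(g->cell[x][y] ? '#' : '.');
    putchar('\n');
  }
  putchar('\n');
}

int main(void)
{
  grid_t a, b, *cur = &a, *next = &b, *tmp;
  int gen;

  memset(&a, 0, sizeof a);
  memset(&b, 0, sizeof b);

  /* Seed a glider. */
  a.cell[1][2] = a.cell[2][3] = a.cell[3][1] = a.cell[3][2] = a.cell[3][3] = 1;

  for (gen = 0; gen < 10; gen++) {
    draw(cur);
    step(next, cur);
    tmp = cur; cur = next; next = tmp;   /* flip-flop the two grids */
  }
  return 0;
}
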
Example #2
//
//  benchmarking program
//
int main(int argc, char **argv) {
  if( find_option( argc, argv, "-h" ) >= 0 )
  {
    printf( "Options:\n" );
    printf( "-h to see this help\n" );
    printf( "-n <int> to set number of particles\n" );
    printf( "-o <filename> to specify the output file name\n" );
    printf( "-s <filename> to specify a summary file name\n" );
    printf( "-no turns off all correctness checks and particle output\n");
    printf( "-p <int> to set the (maximum) number of threads used\n");
    return 0;
  }

  const int n = read_int( argc, argv, "-n", 1000 );
  const bool fast = (find_option( argc, argv, "-no" ) != -1);
  const char *savename = read_string( argc, argv, "-o", NULL );
  const char *sumname = read_string( argc, argv, "-s", NULL );
  const int num_threads_override = read_int( argc, argv, "-p", 0);


  FILE *fsave = ((!fast) && savename) ? fopen( savename, "w" ) : NULL;
  FILE *fsum = sumname ? fopen ( sumname, "a" ) : NULL;

  const double size = set_size( n );
  // We need to set the size of a grid square so that the average number of
  // particles per grid square is constant.  The simulation already ensures
  // that the average number of particles in an arbitrary region is constant
  // and proportional to the area.  So this is just a constant.
  const double grid_square_size = sqrt(0.0005) + 0.000001;
  const int num_grid_squares_per_side = size / grid_square_size;
  printf("Using %d grid squares of side-length %f for %d particles.\n", num_grid_squares_per_side*num_grid_squares_per_side, grid_square_size, n);
  std::unique_ptr<std::vector<particle_t> > particles = init_particles(n);

  if (num_threads_override > 0) {
    omp_set_dynamic(0);   // fixed number of threads
    omp_set_num_threads(num_threads_override);  // assign number of threads
  }

  //
  //  simulate a number of time steps
  //
  double simulation_time = read_timer( );

  int max_num_threads = omp_get_max_threads();
  int num_actual_threads;

  // User-defined reductions aren't available in the version of OMP we're
  // using.  Instead, we accumulate per-thread stats in this global array
  // and reduce manually when we're done.
  Stats per_thread_stats[max_num_threads];

  // Shared across threads.
  std::unique_ptr<OmpThreadsafeGrid> old_grid(new OmpThreadsafeGrid(size, num_grid_squares_per_side));
  std::unique_ptr<OmpThreadsafeGrid> next_grid(new OmpThreadsafeGrid(size, num_grid_squares_per_side));


  #pragma omp parallel
  {
    #pragma omp atomic write
    num_actual_threads = omp_get_num_threads();   // record the actual number of threads

    int thread_idx = omp_get_thread_num();    
    Stats thread_stats;
    for (int step = 0; step < 1000; step++) {
      // If this is the first step, we must initialize the grid here
      // without respecting cache locality.  Since we cannot use the existing
      // grid, we have to just divide the particles arbitrarily.  This
      // means that the subsequent code for simulating forces and movement
      // will have almost no cache locality on the first iteration: Each thread
      // has picked up an arbitrary subset of the particles to insert into the
      // grid, and then the threads are responsible for simulating a different,
      // mostly-disjoint subset of the particles.  On subsequent iterations,
      // only the particles that have moved will cause cache misses, so we
      // should have much better locality.  If we want to really optimize,
      // it may be worth rethinking how we store particles and communicate among
      // threads.  But at that point we might as well write distributed-memory
      // code.
      if (step == 0) {
        #pragma omp for
        for (int i = 0; i < n; i++) {
          next_grid->add((*particles)[i]);
        }
      }

      // Here we are building the grid that maps locations to sets of
      // particles.  This step does O(n) work, so it is a bottleneck if done
      // serially.  For performance comparisons, we have two versions of the
      // grid-formation code.  The second simply forms the grid serially, in a
      // single arbitrary thread.  The first is parallel and attempts
      // some cache locality.  Each thread is responsible for re-inserting
      // the grid elements that previously lay in its subgrid.  For that reason
      // we need to keep around the old grid while we are building the new one;
      // this is why we have old_grid and next_grid.

      // NOTE: We could instead re-insert each particle right after moving it.
      // This would be faster, but it would require us to think about
      // simultaneous parallel delete and add, while the current scheme needs
      // only support parallel add.  (Deleting the entire grid at once is an
      // O(1) operation, so we can do it in one thread with a barrier.)
      // (The actual simulation operations are read-only on the grid structure
      // and write to each particle only once, so we can simply use two
      // barriers to protect them.)
      #pragma omp single
      {
        old_grid.swap(next_grid);
        next_grid.reset(new OmpThreadsafeGrid(size, num_grid_squares_per_side));
      }

      // Now insert each particle into the new grid.
      {
        std::unique_ptr<SimpleIterator<particle_t&> > particles_to_insert = old_grid->subgrid(thread_idx, num_actual_threads);
        while (particles_to_insert->hasNext()) {
          particle_t& p = particles_to_insert->next();
          next_grid->add(p);
        }
      }

      // Now we compute forces for particles.  Each thread handles its assigned
      // subgrid.  We first need a barrier to ensure that everyone sees all
      // the particles in next_grid.
      #pragma omp barrier

      {
        std::unique_ptr<SimpleIterator<particle_t&> > particles_to_force = next_grid->subgrid(thread_idx, num_actual_threads);
        while (particles_to_force->hasNext()) {
          particle_t& p = particles_to_force->next();
          p.ax = p.ay = 0;
          std::unique_ptr<SimpleIterator<particle_t&> > neighbors = next_grid->neighbor_iterator(p);
          while (neighbors->hasNext()) {
            particle_t& neighbor = neighbors->next();
            apply_force(p, neighbor, thread_stats);
          }
        }
      }

      // The barrier here ensures that no particle is moved before it is used
      // in apply_force above.
      #pragma omp barrier

      // Now we move each particle.
      std::unique_ptr<SimpleIterator<particle_t&> > particles_to_move = next_grid->subgrid(thread_idx, num_actual_threads);
      while (particles_to_move->hasNext()) {
        particle_t& p = particles_to_move->next();
        move(p);
      }

      // This barrier is probably unnecessary unless save() is going to happen.
      #pragma omp barrier

      if (!fast) {
        //
        //  save if necessary
        //
        #pragma omp master
        if( fsave && (step%SAVEFREQ) == 0 ) {
          save( fsave, n, (*particles).data() );
        }
      }

      // This barrier is probably unnecessary unless save() happened.
      #pragma omp barrier
    }

    #pragma omp critical
    per_thread_stats[thread_idx] = thread_stats;
  }
  simulation_time = read_timer( ) - simulation_time;

  // Could do a tree reduce here, but it seems unnecessary.
  Stats overall_stats;
  for (int thread_idx = 0; thread_idx < max_num_threads; thread_idx++) {
    overall_stats.aggregate_left(per_thread_stats[thread_idx]);
  }

  printf( "n = %d,threads = %d, simulation time = %g seconds", n,num_actual_threads, simulation_time);

  if (!fast) {
    //
    //  - absmin is the minimum distance between two particles during the run of the simulation.
    //  - A correct simulation will have particles stay greater than 0.4 (of cutoff), with typical values between 0.7 and 0.8.
    //  - A simulation where particles don't interact correctly will be below 0.4 (of cutoff), with typical values between 0.01 and 0.05.
    //
    //  - The average distance absavg is ~0.95 when most particles are interacting correctly and ~0.66 when no particles are interacting.
    //
    printf( ", absmin = %lf, absavg = %lf", overall_stats.min, overall_stats.avg);
    if (overall_stats.min < 0.4) printf ("\nThe minimum distance is below 0.4 meaning that some particle is not interacting");
    if (overall_stats.avg < 0.8) printf ("\nThe average distance is below 0.8 meaning that most particles are not interacting");
  }
  printf("\n");

  //
  // Printing summary data
  //
  if( fsum)
    fprintf(fsum,"%d %d %g\n",n,num_actual_threads, simulation_time);

  //
  // Cleaning up
  //
  if( fsum )
    fclose( fsum );

  if( fsave )
    fclose( fsave );

  return 0;
}
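
As the comments in the program note, per-thread statistics are accumulated privately and reduced by hand after the parallel region because user-defined OpenMP reductions were not available. The standalone sketch below shows that pattern in isolation. The fields of Stats, its observe() helper, and the synthetic observations are assumptions made for illustration; only the overall shape (a private accumulator per thread, a shared array indexed by thread id, and a serial aggregate_left pass at the end) mirrors the benchmark above.

/*
 * Minimal sketch of the manual per-thread reduction, assuming Stats
 * tracks a running minimum and average.  The Stats layout, observe(),
 * and the synthetic data are illustrative guesses, not the benchmark's
 * actual definitions.
 */
#include <algorithm>
#include <cstdio>
#include <vector>
#include <omp.h>

struct Stats {
  double min = 1e9;   // smallest value observed
  double avg = 0.0;   // running mean of observed values
  long   n   = 0;     // number of observations

  // Record one observation.
  void observe(double d) {
    min = std::min(min, d);
    avg += (d - avg) / ++n;
  }

  // Fold another thread's accumulator into this one.
  void aggregate_left(const Stats &other) {
    min = std::min(min, other.min);
    long total = n + other.n;
    if (total > 0)
      avg = (avg * n + other.avg * other.n) / total;
    n = total;
  }
};

int main() {
  int max_num_threads = omp_get_max_threads();
  std::vector<Stats> per_thread_stats(max_num_threads);

  #pragma omp parallel
  {
    Stats thread_stats;                      // private accumulator
    #pragma omp for
    for (int i = 1; i <= 1000; i++)
      thread_stats.observe(1.0 / i);         // stand-in for the distances seen by apply_force
    per_thread_stats[omp_get_thread_num()] = thread_stats;
  }

  Stats overall;                             // serial reduce, as in the benchmark
  for (const Stats &s : per_thread_stats)
    overall.aggregate_left(s);

  std::printf("min = %g, avg = %g, n = %ld\n", overall.min, overall.avg, overall.n);
  return 0;
}
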