示例#1
0
文件: pic.c 项目: ParRes/Kernels
int main(int argc, char ** argv) {

  int         args_used = 1;     // keeps track of # consumed arguments
  uint64_t    L;                 // dimension of grid in cells
  uint64_t    iterations;        // total number of simulation steps
  uint64_t    n;                 // total number of particles in the simulation
  char        *init_mode;        // particle initialization mode (char)
  uint64_t    particle_mode;     // particle initialization mode (int)
  double      rho;               // attenuation factor for geometric particle distribution
  int64_t     k, m;              // determine initial horizontal and vertical velocity of
                                 // particles-- (2*k)+1 cells per time step
  double      alpha, beta;       // slope and offset values for linear particle distribution
  bbox_t      grid_patch,        // whole grid
              init_patch;        // subset of grid used for localized initialization
  int         correctness = 1;   // determines whether simulation was correct
  double      *Qgrid;            // field of fixed charges
  particle_t  *particles, *p;    // the particles array
  uint64_t    iter, i;           // dummies
  double      fx, fy, ax, ay;    // forces and accelerations
#if UNUSED
  int         particles_per_cell;// number of particles per cell to be injected
  int         error=0;           // used for graceful exit after error
#endif
  double      avg_time, pic_time;// timing parameters
  int         nthread_input,     // thread parameters
              nthread;
  int         num_error=0;       // flag that signals that requested and obtained
                                 // numbers of threads are the same
  random_draw_t dice;

  printf("Parallel Research Kernels Version %s\n", PRKVERSION);
  printf("OpenMP Particle-in-Cell execution on 2D grid\n");

  /*******************************************************************************
  ** process and test input parameters
  ********************************************************************************/

  if (argc<7) {
    printf("Usage: %s <#threads> <#simulation steps> <grid size> <#particles> <k (particle charge semi-increment)> ", argv[0]);
    printf("<m (vertical particle velocity)>\n");
    printf("          <init mode> <init parameters>]\n");
    printf("   init mode \"GEOMETRIC\"  parameters: <attenuation factor>\n");
    printf("             \"SINUSOIDAL\" parameters: none\n");
    printf("             \"LINEAR\"     parameters: <negative slope> <constant offset>\n");
    printf("             \"PATCH\"      parameters: <xleft> <xright>  <ybottom> <ytop>\n");
    exit(SUCCESS);
  }

  /* Take number of threads to request from command line */
  nthread_input = atoi(*++argv);

  if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) {
    printf("ERROR: Invalid number of threads: %d\n", nthread_input);
    exit(EXIT_FAILURE);
  }

  omp_set_num_threads(nthread_input);

  iterations = atol(*++argv);  args_used++;
  if (iterations<1) {
    printf("ERROR: Number of time steps must be positive: %" PRIu64 "\n", iterations);
    exit(FAILURE);
  }
  L = atol(*++argv);  args_used++;
  if (L<1 || L%2) {
    printf("ERROR: Number of grid cells must be positive and even: %" PRIu64 "\n", L);
    exit(FAILURE);
  }

  grid_patch = (bbox_t){0, L+1, 0, L+1};
  n = atol(*++argv);  args_used++;
  if (n<1) {
    printf("ERROR: Number of particles must be positive: %" PRIu64 "\n", n);
    exit(FAILURE);
  }

  particle_mode  = UNDEFINED;
  k = atoi(*++argv);   args_used++;
  if (k<0) {
    printf("ERROR: Particle semi-charge must be non-negative: %" PRIu64 "\n", k);
    exit(FAILURE);
  }
  m = atoi(*++argv);   args_used++;
  init_mode = *++argv; args_used++;

  /* Initialize particles with geometric distribution */
  if (strcmp(init_mode, "GEOMETRIC") == 0) {
    if (argc<args_used+1) {
      printf("ERROR: Not enough arguments for GEOMETRIC\n");
      exit(FAILURE);
    }
    particle_mode = GEOMETRIC;
    rho = atof(*++argv);   args_used++;
  }

  /* Initialize with a sinusoidal particle distribution (single period) */
  if (strcmp(init_mode, "SINUSOIDAL") == 0) {
    particle_mode = SINUSOIDAL;
  }

  /* Initialize particles with linear distribution */
  /* The linear function is f(x) = -alpha * x + beta , x in [0,1]*/
  if (strcmp(init_mode, "LINEAR") == 0) {
    if (argc<args_used+2) {
      printf("ERROR: Not enough arguments for LINEAR initialization\n");
      exit(EXIT_FAILURE);
    }
    particle_mode = LINEAR;
    alpha = atof(*++argv); args_used++;
    beta  = atof(*++argv); args_used++;
    if (beta <0 || beta<alpha) {
      printf("ERROR: linear profile gives negative particle density\n");
      exit(EXIT_FAILURE);
    }
  }

  /* Initialize particles uniformly within a "patch" */
  if (strcmp(init_mode, "PATCH") == 0) {
    if (argc<args_used+4) {
      printf("ERROR: Not enough arguments for PATCH initialization\n");
      exit(FAILURE);
    }
    particle_mode = PATCH;
    init_patch.left   = atoi(*++argv); args_used++;
    init_patch.right  = atoi(*++argv); args_used++;
    init_patch.bottom = atoi(*++argv); args_used++;
    init_patch.top    = atoi(*++argv); args_used++;
    if (bad_patch(&init_patch, &grid_patch)) {
      printf("ERROR: inconsistent initial patch\n");
      exit(FAILURE);
    }
  }

  #pragma omp parallel
  {

  #pragma omp master
  {
  nthread = omp_get_num_threads();

  if (nthread != nthread_input) {
    num_error = 1;
    printf("ERROR: number of requested threads %d does not equal ",
           nthread_input);
    printf("number of spawned threads %d\n", nthread);
  }
  else {
    printf("Number of threads              = %d\n",nthread_input);
    printf("Grid size                      = %lld\n", L);
    printf("Number of particles requested  = %lld\n", n);
    printf("Number of time steps           = %lld\n", iterations);
    printf("Initialization mode            = %s\n", init_mode);

    switch(particle_mode) {
    case GEOMETRIC: printf("  Attenuation factor           = %lf\n", rho);    break;
    case SINUSOIDAL:                                                          break;
    case LINEAR:    printf("  Negative slope               = %lf\n", alpha);
                    printf("  Offset                       = %lf\n", beta);   break;
    case PATCH:     printf("  Bounding box                 = %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "\n",
                           init_patch.left, init_patch.right,
                           init_patch.bottom, init_patch.top);                break;
    default:        printf("ERROR: Unsupported particle initializating mode\n");
                     exit(FAILURE);
    }
    printf("Particle charge semi-increment = %"PRIu64"\n", k);
    printf("Vertical velocity              = %"PRIu64"\n", m);

    /* Initialize grid of charges and particles */
    Qgrid = initializeGrid(L);

    LCG_init(&dice);
    switch(particle_mode) {
    case GEOMETRIC:  particles = initializeGeometric(n, L, rho, k, m, &n, &dice);      break;
    case SINUSOIDAL: particles = initializeSinusoidal(n, L, k, m, &n, &dice);          break;
    case LINEAR:     particles = initializeLinear(n, L, alpha, beta, k, m, &n, &dice); break;
    case PATCH:      particles = initializePatch(n, L, init_patch, k, m, &n, &dice);   break;
    default:         printf("ERROR: Unsupported particle distribution\n");  exit(FAILURE);
    }

    printf("Number of particles placed     = %lld\n", n);
  }
  }
  bail_out(num_error);
  }

  for (iter=0; iter<=iterations; iter++) {

    /* start the timer after one warm-up time step */
    if (iter==1) {
      pic_time = wtime();
    }

    /* Calculate forces on particles and update positions */
    #pragma omp parallel for private(i, p, fx, fy, ax, ay)
    for (i=0; i<n; i++) {
      p = particles;
      fx = 0.0;
      fy = 0.0;
      computeTotalForce(p[i], L, Qgrid, &fx, &fy);
      ax = fx * MASS_INV;
      ay = fy * MASS_INV;

      /* Update particle positions, taking into account periodic boundaries */
      p[i].x = fmod(p[i].x + p[i].v_x*DT + 0.5*ax*DT*DT + L, L);
      p[i].y = fmod(p[i].y + p[i].v_y*DT + 0.5*ay*DT*DT + L, L);

      /* Update velocities */
      p[i].v_x += ax * DT;
      p[i].v_y += ay * DT;
    }
  }

  pic_time = wtime() - pic_time;

  /* Run the verification test */
  for (i=0; i<n; i++) {
    correctness *= verifyParticle(particles[i], iterations, Qgrid, L);
  }

  if (correctness) {
    printf("Solution validates\n");
#ifdef VERBOSE
    printf("Simulation time is %lf seconds\n", pic_time);
#endif
    avg_time = n*iterations/pic_time;
    printf("Rate (Mparticles_moved/s): %lf\n", 1.0e-6*avg_time);
  } else {
    printf("Solution does not validate\n");
  }

  return(EXIT_SUCCESS);
}
示例#2
0
文件: pic.c 项目: afanfa/Kernels
int main(int argc, char ** argv) {
   
  int             Num_procs;         // number of ranks 
  int             Num_procsx, 
                  Num_procsy;        // number of ranks in each coord direction      
  int             args_used = 1;     // keeps track of # consumed arguments
  int             my_ID;             // MPI rank
  int             my_IDx, my_IDy;    // coordinates of rank in rank grid                  
  int             root = 0;          // master rank
  uint64_t        L;                 // dimension of grid in cells
  uint64_t        iterations ;       // total number of simulation steps
  uint64_t        n;                 // total number of particles requested in the simulation
  uint64_t        actual_particles,  // actual number of particles owned by my rank
                  total_particles;   // total number of generated particles
  char            *init_mode;        // particle initialization mode (char)
  double          rho ;              // attenuation factor for geometric particle distribution
  uint64_t        k, m;              // determine initial horizontal and vertical velocity of 
                                 // particles-- (2*k)+1 cells per time step 
  double          *grid;             // the grid is represented as an array of charges
  uint64_t        iter, i;           // dummies
  double          fx, fy, ax, ay;    // particle forces and accelerations
  int             error=0;           // used for graceful exit after error
  uint64_t        correctness=0;     // boolean indicating correct particle displacements
  uint64_t        istart, jstart, iend, jend, particles_size, particles_count;
  bbox_t          grid_patch,        // whole grid
                  init_patch,        // subset of grid used for localized initialization
                  my_tile;           // subset of grid owner by my rank
  particle_t      *particles, *p;    // array of particles owned by my rank
  uint64_t        *cur_counts;       //
  uint64_t        ptr_my;            //
  uint64_t        owner;             // owner (rank) of a particular particle
  double          pic_time, local_pic_time, avg_time;
  uint64_t        my_checksum = 0, tot_checksum = 0, correctness_checksum = 0;
  uint64_t        width, height;     // minimum dimensions of grid tile owned by my rank
  int             particle_mode;     // type of initialization
  double          alpha, beta;       // negative slope and offset for linear initialization
  int             nbr[8];            // topological neighbor ranks
  int             icrit, jcrit;      // global grid indices where tile size drops to minimum
  find_owner_type find_owner;
  int             ileftover, jleftover;// excess grid points divided among "fat" tiles
  uint64_t        to_send[8], to_recv[8];// 
  int             procsize;          // number of ranks per OS process                      
   
  MPI_Status  status[16];
  MPI_Request requests[16];
   
  /* Initialize the MPI environment */
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

  /* FIXME: This can be further improved */
  /* Create MPI data type for particle_t */
  MPI_Datatype PARTICLE;
  MPI_Type_contiguous(sizeof(particle_t)/sizeof(double), MPI_DOUBLE, &PARTICLE);
  MPI_Type_commit( &PARTICLE );

  if (my_ID==root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("FG_MPI Particle-in-Cell execution on 2D grid\n");

    if (argc<6) {
      printf("Usage: %s <#simulation steps> <grid size> <#particles> <k (particle charge semi-increment)> ", argv[0]);
      printf("<m (vertical particle velocity)>\n");
      printf("          <init mode> <init parameters>]\n");
      printf("   init mode \"GEOMETRIC\"  parameters: <attenuation factor>\n");
      printf("             \"SINUSOIDAL\" parameters: none\n");
      printf("             \"LINEAR\"     parameters: <negative slope> <constant offset>\n");
      printf("             \"PATCH\"      parameters: <xleft> <xright>  <ybottom> <ytop>\n");
      error = 1;
      goto ENDOFTESTS;
    }

    iterations = atol(*++argv);  args_used++;   
    if (iterations<1) {
      printf("ERROR: Number of time steps must be positive: %llu\n", iterations);
      error = 1;
      goto ENDOFTESTS;  
    }

    L = atol(*++argv);  args_used++;   
    if (L<1 || L%2) {
      printf("ERROR: Number of grid cells must be positive and even: %llu\n", L);
      error = 1;
      goto ENDOFTESTS;  
    }
    n = atol(*++argv);  args_used++;   
    if (n<1) {
      printf("ERROR: Number of particles must be positive: %llu\n", n);
      error = 1;
      goto ENDOFTESTS;  
    }

    particle_mode  = UNDEFINED;
    k = atoi(*++argv);   args_used++; 
    if (k<0) {
      printf("ERROR: Particle semi-charge must be non-negative: %llu\n", k);
      error = 1;
      goto ENDOFTESTS;  
    }
    m = atoi(*++argv);   args_used++; 
    init_mode = *++argv; args_used++;  

    ENDOFTESTS:;  

  } // done with standard initialization parameters
  bail_out(error);

  MPI_Bcast(&iterations, 1, MPI_UINT64_T, root, MPI_COMM_WORLD);
  MPI_Bcast(&L,          1, MPI_UINT64_T, root, MPI_COMM_WORLD);
  MPI_Bcast(&n,          1, MPI_UINT64_T, root, MPI_COMM_WORLD);
  MPI_Bcast(&k,          1, MPI_UINT64_T, root, MPI_COMM_WORLD);
  MPI_Bcast(&m,          1, MPI_UINT64_T, root, MPI_COMM_WORLD);

  grid_patch = (bbox_t){0, L+1, 0, L+1};
   
  if (my_ID==root) { // process initialization parameters
    /* Initialize particles with geometric distribution */
    if (strcmp(init_mode, "GEOMETRIC") == 0) {
      if (argc<args_used+1) {
        printf("ERROR: Not enough arguments for GEOMETRIC\n"); 
        error = 1;
        goto ENDOFTESTS2;  
      }
      particle_mode = GEOMETRIC;
      rho = atof(*++argv);   args_used++;
    }
   
    /* Initialize with a sinusoidal particle distribution (single period) */
    if (strcmp(init_mode, "SINUSOIDAL") == 0) {
      particle_mode = SINUSOIDAL;
    }
   
    /* Initialize particles with linear distribution */
    /* The linear function is f(x) = -alpha * x + beta , x in [0,1]*/
    if (strcmp(init_mode, "LINEAR") == 0) {
      if (argc<args_used+2) {
        printf("ERROR: Not enough arguments for LINEAR initialization\n");
        error = 1;
        goto ENDOFTESTS2;  
        exit(EXIT_FAILURE);
      }
      particle_mode = LINEAR;
      alpha = atof(*++argv); args_used++; 
      beta  = atof(*++argv); args_used++;
      if (beta <0 || beta<alpha) {
        printf("ERROR: linear profile gives negative particle density\n");
        error = 1;
        goto ENDOFTESTS2;  
      }
    }
   
    /* Initialize particles uniformly within a "patch" */
    if (strcmp(init_mode, "PATCH") == 0) {
      if (argc<args_used+4) {
        printf("ERROR: Not enough arguments for PATCH initialization\n");
        error = 1;
        goto ENDOFTESTS2;  
      }
      particle_mode = PATCH;
      init_patch.left   = atoi(*++argv); args_used++;
      init_patch.right  = atoi(*++argv); args_used++;
      init_patch.bottom = atoi(*++argv); args_used++;
      init_patch.top    = atoi(*++argv); args_used++;
      if (bad_patch(&init_patch, &grid_patch)) {
        printf("ERROR: inconsistent initial patch\n");
        error = 1;
        goto ENDOFTESTS2;  
      }
    }
    ENDOFTESTS2:;  

  } //done with processing initializaton parameters, now broadcast

  bail_out(error);

  MPI_Bcast(&particle_mode, 1, MPI_INT, root, MPI_COMM_WORLD);
  switch (particle_mode) {
  case GEOMETRIC:  MPI_Bcast(&rho,               1, MPI_DOUBLE,  root, MPI_COMM_WORLD);
                   break;
  case SINUSOIDAL: break;
  case LINEAR:     MPI_Bcast(&alpha,             1, MPI_DOUBLE,  root, MPI_COMM_WORLD);
                   MPI_Bcast(&beta,              1, MPI_DOUBLE,  root, MPI_COMM_WORLD);
                   break;
  case PATCH:      MPI_Bcast(&init_patch.left,   1, MPI_INT64_T, root, MPI_COMM_WORLD);
                   MPI_Bcast(&init_patch.right,  1, MPI_INT64_T, root, MPI_COMM_WORLD);
                   MPI_Bcast(&init_patch.bottom, 1, MPI_INT64_T, root, MPI_COMM_WORLD);
                   MPI_Bcast(&init_patch.top,    1, MPI_INT64_T, root, MPI_COMM_WORLD);
                   break;
  }
   
  /* determine best way to create a 2D grid of ranks (closest to square, for 
     best surface/volume ratio); we do this brute force for now                        */

  for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) {
    if (!(Num_procs%Num_procsx)) {
      Num_procsy = Num_procs/Num_procsx;
      break;
    }
  }      
  my_IDx = my_ID%Num_procsx;
  my_IDy = my_ID/Num_procsx;

  if (my_ID == root) {
    MPIX_Get_collocated_size(&procsize);
    printf("Number of ranks                    = %llu\n", Num_procs);
    printf("Number of ranks/process            = %d\n", procsize);
    printf("Load balancing                     = None\n");
    printf("Grid size                          = %llu\n", L);
    printf("Tiles in x/y-direction             = %d/%d\n", Num_procsx, Num_procsy);
    printf("Number of particles requested      = %llu\n", n); 
    printf("Number of time steps               = %llu\n", iterations);
    printf("Initialization mode                = %s\n",   init_mode);
    switch(particle_mode) {
    case GEOMETRIC: printf("  Attenuation factor               = %lf\n", rho);    break;
    case SINUSOIDAL:                                                              break;
    case LINEAR:    printf("  Negative slope                   = %lf\n", alpha);
                    printf("  Offset                           = %lf\n", beta);   break;
    case PATCH:     printf("  Bounding box                     = %llu, %llu, %llu, %llu\n",
                           init_patch.left, init_patch.right, 
                           init_patch.bottom, init_patch.top);                    break;
    default:        printf("ERROR: Unsupported particle initializating mode\n");
                    error = 1;
    }
    printf("Particle charge semi-increment (k) = %llu\n", k);
    printf("Vertical velocity              (m) = %llu\n", m);
  }
  bail_out(error);

  /* The processes collectively create the underlying grid following a 2D block decomposition;
     unlike in the stencil code, successive blocks share an overlap vertex                   */
  width = L/Num_procsx;
  if (width < 2*k) {
    if (my_ID==0) printf("k-value too large: %llu, must be no greater than %llu\n", k, width/2);
    bail_out(1);
  }
  ileftover = L%Num_procsx;
  if (my_IDx<ileftover) {
    istart = (width+1) * my_IDx; 
    iend = istart + width + 1;
  }
  else {
    istart = (width+1) * ileftover + width * (my_IDx-ileftover);
    iend = istart + width;
  }
  icrit = (width+1) * ileftover;

  height = L/Num_procsy;
  if (height < m) {
    if (my_ID==0) printf("m-value too large: %llu, must be no greater than %llu\n", m, height);
    bail_out(1);
  }

  jleftover = L%Num_procsy;
  if (my_IDy<jleftover) {
    jstart = (height+1) * my_IDy; 
    jend = jstart + height + 1;
  }
  else {
    jstart = (height+1) * jleftover + height * (my_IDy-jleftover);
    jend = jstart + height;
  }
  jcrit = (height+1) * jleftover;

  /* if the problem can be divided evenly among ranks, use the simple owner finding function */
  if (icrit==0 && jcrit==0) {
    find_owner = find_owner_simple;
    if (my_ID==root) printf("Rank search mode used              = simple\n");
  }
  else {
    find_owner = find_owner_general;
    if (my_ID==root) printf("Rank search mode used              = general\n");
  }

  /* define bounding box for tile owned by my rank for convenience */
  my_tile = (bbox_t){istart,iend,jstart,jend};

  /* Find neighbors. Indexing: left=0, right=1, bottom=2, top=3, 
                               bottom-left=4, bottom-right=5, top-left=6, top-right=7 */

  /* These are IDs in the global communicator */
  nbr[0] = (my_IDx == 0           ) ? my_ID  + Num_procsx - 1         : my_ID  - 1;
  nbr[1] = (my_IDx == Num_procsx-1) ? my_ID  - Num_procsx + 1         : my_ID  + 1;
  nbr[2] = (my_IDy == Num_procsy-1) ? my_ID  + Num_procsx - Num_procs : my_ID  + Num_procsx;
  nbr[3] = (my_IDy == 0           ) ? my_ID  - Num_procsx + Num_procs : my_ID  - Num_procsx;
  nbr[4] = (my_IDy == Num_procsy-1) ? nbr[0] + Num_procsx - Num_procs : nbr[0] + Num_procsx;
  nbr[5] = (my_IDy == Num_procsy-1) ? nbr[1] + Num_procsx - Num_procs : nbr[1] + Num_procsx;
  nbr[6] = (my_IDy == 0           ) ? nbr[0] - Num_procsx + Num_procs : nbr[0] - Num_procsx;
  nbr[7] = (my_IDy == 0           ) ? nbr[1] - Num_procsx + Num_procs : nbr[1] - Num_procsx;

  grid = initializeGrid(my_tile);

  switch(particle_mode){
  case GEOMETRIC: 
    particles = initializeGeometric(n, L, rho, my_tile, k, m,
                                     &particles_count, &particles_size);
    break;
  case LINEAR:
    particles = initializeLinear(n, L, alpha, beta, my_tile, k, m, 
                                     &particles_count, &particles_size);
    break;
  case SINUSOIDAL:
    particles = initializeSinusoidal(n, L, my_tile, k, m, 
                                     &particles_count, &particles_size);
    break;
  case PATCH:
    particles = initializePatch(n, L, init_patch, my_tile, k, m,
                                     &particles_count, &particles_size);
  }

  if (!particles) {
    printf("ERROR: Rank %d could not allocate space for %llu particles\n", my_ID, particles_size);
    error=1;
  }
  bail_out(error);

#if VERBOSE
  for (i=0; i<Num_procs; i++) {
    MPI_Barrier(MPI_COMM_WORLD);
    if (i == my_ID)  printf("Rank %d has %llu particles\n", my_ID, particles_count);
  }
#endif
  if (my_ID==root) {
    MPI_Reduce(&particles_count, &total_particles, 1, MPI_UINT64_T, MPI_SUM, root, MPI_COMM_WORLD);
    printf("Number of particles placed         = %llu\n", total_particles);
  }
  else {
    MPI_Reduce(&particles_count, &total_particles, 1, MPI_UINT64_T, MPI_SUM, root, MPI_COMM_WORLD);
  }

  /* Allocate space for communication buffers. Adjust appropriately as the simulation proceeds */
  
  uint64_t sendbuf_size[8], recvbuf_size[8];
  particle_t *sendbuf[8], *recvbuf[8];
  error=0;
  for (i=0; i<8; i++) {
    sendbuf_size[i] = MAX(1,n/(MEMORYSLACK*Num_procs));
    recvbuf_size[i] = MAX(1,n/(MEMORYSLACK*Num_procs));
    sendbuf[i] = (particle_t*) prk_malloc(sendbuf_size[i] * sizeof(particle_t));
    recvbuf[i] = (particle_t*) prk_malloc(recvbuf_size[i] * sizeof(particle_t));
    if (!sendbuf[i] || !recvbuf[i]) error++;
  }
  if (error) printf("Rank %d could not allocate communication buffers\n", my_ID);
  bail_out(error);
    
  /* Run the simulation */
  for (iter=0; iter<=iterations; iter++) {

    /* start timer after a warmup iteration */
    if (iter == 1) { 
      MPI_Barrier(MPI_COMM_WORLD);
      local_pic_time = wtime();
    }

    ptr_my = 0;
    for (i=0; i<8; i++) to_send[i]=0;
      
    /* Process own particles */
    p = particles;

    for (i=0; i < particles_count; i++) {
      fx = 0.0;
      fy = 0.0;
      computeTotalForce(p[i], my_tile, grid, &fx, &fy);

      ax = fx * MASS_INV;
      ay = fy * MASS_INV;

      /* Update particle positions, taking into account periodic boundaries */
      p[i].x = fmod(p[i].x + p[i].v_x*DT + 0.5*ax*DT*DT + L, L);
      p[i].y = fmod(p[i].y + p[i].v_y*DT + 0.5*ay*DT*DT + L, L);

      /* Update velocities */
      p[i].v_x += ax * DT;
      p[i].v_y += ay * DT;

      /* Check if particle stayed in same subdomain or moved to another */
      owner = find_owner(p[i], width, height, Num_procsx, icrit, jcrit, ileftover, jleftover);
      if (owner==my_ID) {
        add_particle_to_buffer(p[i], &p, &ptr_my, &particles_size);
      /* Add particle to the appropriate communication buffer */
      } else if (owner == nbr[0]) {
        add_particle_to_buffer(p[i], &sendbuf[0], &to_send[0], &sendbuf_size[0]);
      } else if (owner == nbr[1]) {
        add_particle_to_buffer(p[i], &sendbuf[1], &to_send[1], &sendbuf_size[1]);
      } else if (owner == nbr[2]) {
        add_particle_to_buffer(p[i], &sendbuf[2], &to_send[2], &sendbuf_size[2]);
      } else if (owner == nbr[3]) {
        add_particle_to_buffer(p[i], &sendbuf[3], &to_send[3], &sendbuf_size[3]);
      } else if (owner == nbr[4]) {
        add_particle_to_buffer(p[i], &sendbuf[4], &to_send[4], &sendbuf_size[4]);
      } else if (owner == nbr[5]) {
        add_particle_to_buffer(p[i], &sendbuf[5], &to_send[5], &sendbuf_size[5]);
      } else if (owner == nbr[6]) {
        add_particle_to_buffer(p[i], &sendbuf[6], &to_send[6], &sendbuf_size[6]);
      } else if (owner == nbr[7]) {
        add_particle_to_buffer(p[i], &sendbuf[7], &to_send[7], &sendbuf_size[7]);
      } else {
        printf("Could not find neighbor owner of particle %llu in tile %llu\n", 
        i, owner);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
      }
    }

    /* Communicate the number of particles to be sent/received */
    for (i=0; i<8; i++) {
      MPI_Isend(&to_send[i], 1, MPI_UINT64_T, nbr[i], 0, MPI_COMM_WORLD, &requests[i]);
      MPI_Irecv(&to_recv[i], 1, MPI_UINT64_T, nbr[i], 0, MPI_COMM_WORLD, &requests[8+i]);
    }
    MPI_Waitall(16, requests, status);
      
    /* Resize receive buffers if need be */
    for (i=0; i<8; i++) {
      resize_buffer(&recvbuf[i], &recvbuf_size[i], to_recv[i]);
    }
      
    /* Communicate the particles */
    for (i=0; i<8; i++) {
      MPI_Isend(sendbuf[i], to_send[i], PARTICLE, nbr[i], 0, MPI_COMM_WORLD, &requests[i]);
      MPI_Irecv(recvbuf[i], to_recv[i], PARTICLE, nbr[i], 0, MPI_COMM_WORLD, &requests[8+i]);
    }
    MPI_Waitall(16, requests, status);
     
    /* Attach received particles to particles buffer */
    for (i=0; i<4; i++) {
      attach_received_particles(&particles, &ptr_my, &particles_size, recvbuf[2*i], to_recv[2*i], 
                                recvbuf[2*i+1], to_recv[2*i+1]);
    }    
    particles_count = ptr_my;
  }
   
  local_pic_time = MPI_Wtime() - local_pic_time;
  MPI_Reduce(&local_pic_time, &pic_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);
   
  /* Run the verification test */
  /* First verify own particles */
  for (i=0; i < particles_count; i++) {
    correctness += verifyParticle(particles[i], (double)L, iterations);
    my_checksum += (uint64_t)particles[i].ID;
  }

  /* Gather total checksum of particles */
  MPI_Reduce(&my_checksum, &tot_checksum, 1, MPI_UINT64_T, MPI_SUM, root, MPI_COMM_WORLD);
  /* Gather total checksum of correctness flags */
  MPI_Reduce(&correctness, &correctness_checksum, 1, MPI_UINT64_T, MPI_SUM, root, MPI_COMM_WORLD);

  if ( my_ID == root) {
    if (correctness_checksum != total_particles ) {
      printf("ERROR: there are %llu miscalculated locations\n", total_particles-correctness_checksum);
    }
    else {
      if (tot_checksum != (total_particles*(total_particles+1))/2) {
        printf("ERROR: Particle checksum incorrect\n");
      }
      else {
        avg_time = total_particles*iterations/pic_time;
        printf("Solution validates\n");
        printf("Rate (Mparticles_moved/s): %lf\n", 1.0e-6*avg_time);
      }
    }
  }
   
#if VERBOSE
  for (i=0; i<Num_procs; i++) {
    MPI_Barrier(MPI_COMM_WORLD);
    if (i == my_ID)  printf("Rank %d has %llu particles\n", my_ID, particles_count);
  }
#endif

  MPI_Finalize();
   
  return 0;
}