示例#1
0
文件: pic.c 项目: ParRes/Kernels
/*
 * Create particles spread uniformly over the cells of a rectangular "patch"
 * inside an L x L grid of cells.
 *
 *   n_input  - requested particle count; divided by the number of patch cells
 *              to obtain the mean handed to random_draw for every cell
 *   L        - number of cells along each grid dimension
 *   patch    - cell bounds of the patch; all four bounds are inclusive
 *   k, m     - values stored in the k and m fields of every particle
 *   n_placed - output: number of particles actually created
 *   parm     - random number generator state used by LCG_init / random_draw
 *
 * Returns the particle array obtained from prk_malloc. The program is
 * terminated with EXIT_FAILURE when prk_malloc returns NULL.
 */
particle_t *initializePatch(uint64_t n_input, uint64_t L, bbox_t patch,
                            double k, double m, uint64_t *n_placed,
                            random_draw_t *parm){
  particle_t *cloud;
  uint64_t    col, row, slot, drawn, count, cells_in_patch;
  double      mean_per_cell;

  /* the patch bounds are inclusive, hence the +1 in each direction */
  cells_in_patch = (patch.right - patch.left+1)*(patch.top - patch.bottom+1);
  mean_per_cell  = (double) n_input/cells_in_patch;

  /* Pass 1: count the particles so that the array can be sized exactly.
     A draw is made for every cell of the grid; draws for cells that lie
     outside the patch are discarded.                                        */
  LCG_init(parm);
  count = 0;
  for (col=0; col<L; col++) {
    for (row=0; row<L; row++) {
      drawn = random_draw(mean_per_cell, parm);
      if (col>=patch.left && col<=patch.right &&
          row>=patch.bottom && row<=patch.top) {
        count += drawn;
      }
    }
  }
  *n_placed = count;

  cloud = (particle_t*) prk_malloc((*n_placed) * sizeof(particle_t));
  if (cloud == NULL) {
    printf("ERROR: Could not allocate space for particles\n");
    exit(EXIT_FAILURE);
  }

  /* Pass 2: restart the generator and issue the same sequence of
     random_draw calls as pass 1, this time filling in the particles.        */
  LCG_init(parm);
  slot = 0;
  for (col=0; col<L; col++) {
    for (row=0; row<L; row++) {
      drawn = random_draw(mean_per_cell, parm);
      if (col<patch.left || col>patch.right ||
          row<patch.bottom || row>patch.top) {
        continue;
      }
      for (; drawn>0; drawn--, slot++) {
        cloud[slot].x = col + REL_X;
        cloud[slot].y = row + REL_Y;
        cloud[slot].k = k;
        cloud[slot].m = m;
      }
    }
  }
  finish_distribution((*n_placed), cloud);

  return cloud;
}
示例#2
0
文件: pic.c 项目: ParRes/Kernels
/*
 * Create particles whose per-column density follows the linear profile
 * f(x) = -alpha * x + beta, x in [0,1], sampled at x = column/L.
 *
 *   n_input     - requested particle count used to scale the per-cell mean
 *   L           - number of cells along each grid dimension
 *   alpha, beta - negative slope and offset of the linear profile
 *   k, m        - values stored in the k and m fields of every particle
 *   n_placed    - output: number of particles actually created
 *   parm        - random number generator state used by LCG_init / random_draw
 *
 * Returns the particle array obtained from prk_malloc. The program is
 * terminated with EXIT_FAILURE when prk_malloc returns NULL.
 */
particle_t *initializeLinear(uint64_t n_input, uint64_t L, double alpha, double beta,
                             double k, double m, uint64_t *n_placed, 
                             random_draw_t *parm){
  particle_t *cloud;
  uint64_t    col, row, slot, drawn, count;
  double      dx, weight_sum, mean;

  dx = 1.0/L;
  /* sum of the column weights (beta - alpha*dx*column) over all L columns;
     used to normalize the number of particles                               */
  weight_sum = beta*L-alpha*0.5*dx*L*(L-1);

  /* Pass 1: count the particles so that the array can be sized exactly */
  LCG_init(parm);
  count = 0;
  for (col=0; col<L; col++) {
    /* every cell in a column shares the same mean */
    mean = n_input * ((beta - alpha * dx * ((double) col))/weight_sum)/L;
    for (row=0; row<L; row++) {
      count += random_draw(mean, parm);
    }
  }
  *n_placed = count;

  cloud = (particle_t*) prk_malloc((*n_placed) * sizeof(particle_t));
  if (cloud == NULL) {
    printf("ERROR: Could not allocate space for particles\n");
    exit(EXIT_FAILURE);
  }

  /* Pass 2: restart the generator, repeat the same draws and fill the array */
  LCG_init(parm);
  slot = 0;
  for (col=0; col<L; col++) {
    mean = n_input * ((beta - alpha * dx * ((double) col))/weight_sum)/L;
    for (row=0; row<L; row++) {
      for (drawn = random_draw(mean, parm); drawn>0; drawn--, slot++) {
        cloud[slot].x = col + REL_X;
        cloud[slot].y = row + REL_Y;
        cloud[slot].k = k;
        cloud[slot].m = m;
      }
    }
  }
  finish_distribution((*n_placed), cloud);

  return cloud;
}
示例#3
0
文件: pic.c 项目: ParRes/Kernels
/*
 * Create particles with a geometric distribution across grid columns: each
 * cell in column i is given a mean of A * rho^i particles.
 *
 *   n_input  - requested particle count used to derive the scale factor A
 *   L        - number of cells along each grid dimension
 *   rho      - attenuation factor between neighbouring columns
 *   k, m     - values stored in the k and m fields of every particle
 *   n_placed - output: number of particles actually created
 *   parm     - random number generator state used by LCG_init / random_draw
 *
 * Returns the particle array obtained from prk_malloc. The program is
 * terminated with EXIT_FAILURE when prk_malloc returns NULL.
 */
particle_t *initializeGeometric(uint64_t n_input, uint64_t L, double rho,
                                double k, double m, uint64_t *n_placed, 
                                random_draw_t *parm){
  particle_t  *particles;
  uint64_t    x, y, p, pi, actual_particles;
  double      A, column_mean;

  /* initialize random number generator */
  LCG_init(parm);

  /* first determine total number of particles, then allocate and place them   */

  /* Each cell in the i-th column of cells contains p(i) = A * rho^i particles.
     A is chosen so that the means of all L*L cells add up to n_input:
     A * L * (1-rho^L)/(1-rho) = n_input. For rho == 1 that expression is 0/0
     (NaN); the series then has L equal terms, which gives A = n_input/(L*L).  */
  if (rho == 1.0) {
    A = (double) n_input / (double) L / (double) L;
  }
  else {
    A = n_input * ((1.0-rho) / (1.0-pow(rho,L))) / (double)L;
  }

  for (*n_placed=0,x=0; x<L; x++) {
    /* all cells of a column share the same mean */
    column_mean = A * pow(rho, x);
    for (y=0; y<L; y++) {
      (*n_placed) += random_draw(column_mean, parm);
    }
  }

  particles = (particle_t*) prk_malloc((*n_placed) * sizeof(particle_t));
  if (particles == NULL) {
    printf("ERROR: Could not allocate space for particles\n");
    exit(EXIT_FAILURE);
  }

  /* Re-initialize random number generator so that the placement pass repeats
     the draws made by the counting pass                                       */
  LCG_init(parm);

  for (pi=0,x=0; x<L; x++) {
    column_mean = A * pow(rho, x);
    for (y=0; y<L; y++) {
      actual_particles = random_draw(column_mean, parm);
      for (p=0; p<actual_particles; p++,pi++) {
        particles[pi].x = x + REL_X;
        particles[pi].y = y + REL_Y;
        particles[pi].k = k;
        particles[pi].m = m;
      }
    }
  }
  finish_distribution((*n_placed), particles);

  return particles;
}
示例#4
0
文件: pic.c 项目: ParRes/Kernels
/*
 * Create particles whose per-column density follows a squared-cosine profile:
 * every cell in column x is given a mean of
 * 2 * cos^2(x * PRK_M_PI / L) * n_input / (L*L) particles.
 *
 *   n_input  - requested particle count used to scale the per-cell mean
 *   L        - number of cells along each grid dimension
 *   k, m     - values stored in the k and m fields of every particle
 *   n_placed - output: number of particles actually created
 *   parm     - random number generator state used by LCG_init / random_draw
 *
 * Returns the particle array obtained from prk_malloc. The program is
 * terminated with EXIT_FAILURE when prk_malloc returns NULL.
 */
particle_t *initializeSinusoidal(uint64_t n_input, uint64_t L,
                                 double k, double m, uint64_t *n_placed, 
                                random_draw_t *parm){
  particle_t *cloud;
  uint64_t    col, row, slot, drawn, count;
  double      dtheta, c, mean;

  dtheta = PRK_M_PI/L;

  /* Pass 1: count the particles so that the array can be sized exactly */
  LCG_init(parm);
  count = 0;
  for (col=0; col<L; col++) {
    /* every cell in a column shares the same mean */
    c    = cos(col*dtheta);
    mean = 2.0*c*c*n_input/(L*L);
    for (row=0; row<L; row++) {
      count += random_draw(mean, parm);
    }
  }
  *n_placed = count;

  cloud = (particle_t*) prk_malloc((*n_placed) * sizeof(particle_t));
  if (cloud == NULL) {
    printf("ERROR: Could not allocate space for particles\n");
    exit(EXIT_FAILURE);
  }

  /* Pass 2: restart the generator, repeat the same draws and fill the array */
  LCG_init(parm);
  slot = 0;
  for (col=0; col<L; col++) {
    c    = cos(col*dtheta);
    mean = 2.0*c*c*n_input/(L*L);
    for (row=0; row<L; row++) {
      for (drawn = random_draw(mean, parm); drawn>0; drawn--, slot++) {
        cloud[slot].x = col + REL_X;
        cloud[slot].y = row + REL_Y;
        cloud[slot].k = k;
        cloud[slot].m = m;
      }
    }
  }
  finish_distribution((*n_placed), cloud);

  return cloud;
}
示例#5
0
文件: pic.c 项目: afanfa/Kernels
/*
 * Create the particles of one tile for the linear profile
 * f(x) = -alpha * x + beta, x in [0,1], sampled at x = column/(L-1).
 *
 *   n_input     - requested particle count used to scale the per-cell mean
 *   L           - number of cells along each grid dimension
 *   alpha, beta - negative slope and offset of the linear profile
 *   tile        - cells handled by this call: columns [tile.left, tile.right)
 *                 and rows [tile.bottom, tile.top)
 *   k, m        - values stored in the k and m fields of every particle
 *   n_placed    - output: number of particles created for the tile
 *   n_size      - output: number of array elements allocated, computed as
 *                 n_placed*(1+MEMORYSLACK)/MEMORYSLACK
 *
 * Returns the particle array obtained from prk_malloc, or NULL when the
 * allocation fails (n_placed and n_size have been written in either case).
 */
particle_t *initializeLinear(uint64_t n_input, uint64_t L, double alpha, double beta, 
                             bbox_t tile, double k, double m, 
                             uint64_t *n_placed, uint64_t *n_size) {
  particle_t *cloud;
  double      dx, weight_sum, mean;
  uint64_t    col, row, slot, drawn, count, first_cell;

  LCG_init();

  /* sum of the column weights (beta - alpha*dx*column) over all L columns;
     used to normalize the number of particles                               */
  dx         = 1.0/(L-1);
  weight_sum = beta*L-alpha*0.5*dx*L*(L-1);

  /* Pass 1: count the particles of this tile */
  count = 0;
  for (col=tile.left; col<tile.right; col++) {
    /* every cell in a column shares the same mean */
    mean = n_input * ((beta - alpha * dx * ((double) col))/weight_sum) / L;
    /* reposition the generator at the start of each column, based on the
       index of the column's first cell within the tile                      */
    first_cell = tile.bottom+col*L;
    LCG_jump(2*first_cell, 0);
    for (row=tile.bottom; row<tile.top; row++) {
      count += random_draw(mean);
    }
  }
  *n_placed = count;

  /* allocate with some slack to avoid fine-grain memory management */
  (*n_size) = ((*n_placed)*(1+MEMORYSLACK))/MEMORYSLACK;
  cloud = (particle_t*) prk_malloc((*n_size) * sizeof(particle_t));
  if (cloud == NULL) return(cloud);

  /* Pass 2: repeat the same jumps and draws, this time filling the array */
  slot = 0;
  for (col=tile.left; col<tile.right; col++) {
    mean = n_input * ((beta - alpha * dx * ((double) col))/weight_sum) / L;
    first_cell = tile.bottom+col*L;
    LCG_jump(2*first_cell, 0);
    for (row=tile.bottom; row<tile.top; row++) {
      for (drawn = random_draw(mean); drawn>0; drawn--, slot++) {
        cloud[slot].x = col + REL_X;
        cloud[slot].y = row + REL_Y;
        cloud[slot].k = k;
        cloud[slot].m = m;
      }
    }
  }
  finishParticlesInitialization((*n_placed), cloud);
  return cloud;
}
示例#6
0
文件: pic.c 项目: afanfa/Kernels
/*
 * Create the particles of one tile for the geometric distribution: each cell
 * in column i is given a mean of A * rho^i particles.
 *
 *   n_input  - requested particle count used to derive the scale factor A
 *   L        - number of cells along each grid dimension
 *   rho      - attenuation factor between neighbouring columns
 *   tile     - cells handled by this call: columns [tile.left, tile.right)
 *              and rows [tile.bottom, tile.top)
 *   k, m     - values stored in the k and m fields of every particle
 *   n_placed - output: number of particles created for the tile
 *   n_size   - output: number of array elements allocated, computed as
 *              n_placed*(1+MEMORYSLACK)/MEMORYSLACK
 *
 * Returns the particle array obtained from prk_malloc, or NULL when the
 * allocation fails (n_placed and n_size have been written in either case).
 */
particle_t *initializeGeometric(uint64_t n_input, uint64_t L, double rho, 
                                bbox_t tile, double k, double m,
		                uint64_t *n_placed, uint64_t *n_size) {
  particle_t  *particles;
  double      A, column_mean;
  uint64_t    x, y, p, pi, actual_particles, start_index;

  /* initialize random number generator */
  LCG_init();  
   
  /* first determine total number of particles, then allocate and place them.
     Each cell in the i-th column of cells contains p(i) = A * rho^i particles.
     A is chosen so that the means of all L*L cells add up to n_input:
     A * L * (1-rho^L)/(1-rho) = n_input. For rho == 1 that expression is 0/0
     (NaN); the series then has L equal terms, which gives A = n_input/(L*L).  */
  if (rho == 1.0) {
    A = (double) n_input / (double) L / (double) L;
  }
  else {
    A = n_input * ((1.0-rho) / (1.0-pow(rho, L))) / (double) L;
  }

  for (*n_placed=0,x=tile.left; x<tile.right; x++) {
    /* at start of each grid column we jump into sequence of random numbers */
    start_index = tile.bottom+x*L;
    LCG_jump(2*start_index, 0);
    /* all cells of a column share the same mean */
    column_mean = A * pow(rho, x);
    for (y=tile.bottom; y<tile.top; y++) {
      (*n_placed) += random_draw(column_mean);
    }
  }

  /* use some slack in allocating memory to avoid fine-grain memory management */
  (*n_size) = ((*n_placed)*(1+MEMORYSLACK))/MEMORYSLACK;
  particles = (particle_t*) prk_malloc((*n_size) * sizeof(particle_t));
  if (particles == NULL) return(particles);

  /* second pass repeats the jumps and draws of the counting pass */
  for (pi=0,x=tile.left; x<tile.right; x++) {
    /* at start of each grid column we jump into sequence of random numbers */
    start_index = tile.bottom+x*L;
    LCG_jump(2*start_index, 0);
    column_mean = A * pow(rho, x);
    for (y=tile.bottom; y<tile.top; y++) {
      actual_particles = random_draw(column_mean);
      for (p=0; p<actual_particles; p++) {
        particles[pi].x = x + REL_X;
        particles[pi].y = y + REL_Y;
        particles[pi].k = k;
        particles[pi].m = m;
        pi++;
      }
    }
  }
  finishParticlesInitialization((*n_placed), particles);
   
  return particles;
}
示例#7
0
文件: pic.c 项目: afanfa/Kernels
/*
 * Create the particles of one tile for a uniform distribution restricted to
 * a "patch" of the grid.
 *
 *   n_input  - requested particle count; divided by the number of patch cells
 *              to obtain the mean handed to random_draw
 *   L        - number of cells along each grid dimension
 *   patch    - region that receives particles; membership of a cell is decided
 *              by contain(x,y,patch)
 *   tile     - cells handled by this call: columns [tile.left, tile.right)
 *              and rows [tile.bottom, tile.top)
 *   k, m     - values stored in the k and m fields of every particle
 *   n_placed - output: number of particles created for the tile
 *   n_size   - output: number of array elements allocated, computed as
 *              n_placed*(1+MEMORYSLACK)/MEMORYSLACK
 *
 * Returns the particle array obtained from prk_malloc, or NULL when the
 * allocation fails (n_placed and n_size have been written in either case).
 */
particle_t *initializePatch(uint64_t n_input, uint64_t L, bbox_t patch, 
                            bbox_t tile, double k, double m,
                            uint64_t *n_placed, uint64_t *n_size) {
  particle_t *particles;
  uint64_t   x, y, total_cells, pi, p, actual_particles, start_index;
  double     particles_per_cell;
   
  /* initialize random number generator */
  LCG_init();  

  total_cells  = (patch.right - patch.left+1)*(patch.top - patch.bottom+1);
  particles_per_cell = (double) n_input/total_cells;
   
  /* Loop over columns of cells and count particles for cells inside the patch.
     The counting pass must make exactly the same decision per cell as the
     placement pass below: draw with particles_per_cell for every cell and
     discard the draw when the cell is outside the patch. Counting a separate
     random_draw(0.0) for outside cells would let *n_placed disagree with the
     number of particles that are actually initialized.                        */
  for (*n_placed=0,x=tile.left; x<tile.right; x++) {
    /* at start of each grid column we jump into sequence of random numbers */
    start_index = tile.bottom+x*L;
    LCG_jump(2*start_index, 0);
    for (y=tile.bottom; y<tile.top; y++) {
      actual_particles = random_draw(particles_per_cell);
      if (!contain(x,y,patch)) actual_particles = 0;
      (*n_placed) += actual_particles;
    }
  }

  /* use some slack in allocating memory to avoid fine-grain memory management */
  (*n_size) = ((*n_placed)*(1+MEMORYSLACK))/MEMORYSLACK;
  particles = (particle_t*) prk_malloc((*n_size) * sizeof(particle_t));
  if (particles == NULL) return(particles);

  /* placement pass: same jumps and draws as the counting pass */
  for (pi=0,x=tile.left; x<tile.right; x++) {
    start_index = tile.bottom+x*L;
    LCG_jump(2*start_index,0);
    for (y=tile.bottom; y<tile.top; y++) {
      actual_particles = random_draw(particles_per_cell);
      if (!contain(x,y,patch)) actual_particles = 0;
      for (p=0; p<actual_particles; p++) {
        particles[pi].x = x + REL_X;
        particles[pi].y = y + REL_Y;
        particles[pi].k = k;
        particles[pi].m = m;
        pi++;
      }
    }
  }
  finishParticlesInitialization((*n_placed), particles);
  return particles;
}
示例#8
0
文件: pic.c 项目: afanfa/Kernels
/*
 * Create the particles of one tile for a squared-cosine density profile:
 * every cell in column x is given a mean of
 * 2 * cos^2(x * PRK_M_PI / L) * n_input / (L*L) particles.
 *
 *   n_input  - requested particle count used to scale the per-cell mean
 *   L        - number of cells along each grid dimension
 *   tile     - cells handled by this call: columns [tile.left, tile.right)
 *              and rows [tile.bottom, tile.top)
 *   k, m     - values stored in the k and m fields of every particle
 *   n_placed - output: number of particles created for the tile
 *   n_size   - output: number of array elements allocated, computed as
 *              n_placed*(1+MEMORYSLACK)/MEMORYSLACK
 *
 * Returns the particle array obtained from prk_malloc, or NULL when the
 * allocation fails (n_placed and n_size have been written in either case).
 */
particle_t *initializeSinusoidal(uint64_t n_input, uint64_t L, 
                                 bbox_t tile, double k, double m,
                                 uint64_t *n_placed, uint64_t *n_size) {
  particle_t *cloud;
  double      dtheta, c, mean;
  uint64_t    col, row, slot, drawn, count, first_cell;

  LCG_init();

  dtheta = PRK_M_PI/L;

  /* Pass 1: count the particles of this tile */
  count = 0;
  for (col=tile.left; col<tile.right; col++) {
    /* reposition the generator at the start of each column, based on the
       index of the column's first cell within the tile                      */
    first_cell = tile.bottom+col*L;
    LCG_jump(2*first_cell, 0);
    /* every cell in a column shares the same mean */
    c    = cos(col*dtheta);
    mean = 2.0*c*c*n_input/(L*L);
    for (row=tile.bottom; row<tile.top; row++) {
      count += random_draw(mean);
    }
  }
  *n_placed = count;

  /* allocate with some slack to avoid fine-grain memory management */
  (*n_size) = ((*n_placed)*(1+MEMORYSLACK))/MEMORYSLACK;
  cloud = (particle_t*) prk_malloc((*n_size) * sizeof(particle_t));
  if (cloud == NULL) return(cloud);

  /* Pass 2: repeat the same jumps and draws, this time filling the array */
  slot = 0;
  for (col=tile.left; col<tile.right; col++) {
    first_cell = tile.bottom+col*L;
    LCG_jump(2*first_cell, 0);
    c    = cos(col*dtheta);
    mean = 2.0*c*c*n_input/(L*L);
    for (row=tile.bottom; row<tile.top; row++) {
      for (drawn = random_draw(mean); drawn>0; drawn--, slot++) {
        cloud[slot].x = col + REL_X;
        cloud[slot].y = row + REL_Y;
        cloud[slot].k = k;
        cloud[slot].m = m;
      }
    }
  }
  finishParticlesInitialization((*n_placed), cloud);   
  return cloud;
}
示例#9
0
文件: pic.c 项目: ParRes/Kernels
int main(int argc, char ** argv) {

  int         args_used = 1;     // keeps track of # consumed arguments
  uint64_t    L;                 // dimension of grid in cells
  uint64_t    iterations;        // total number of simulation steps
  uint64_t    n;                 // total number of particles in the simulation
  char        *init_mode;        // particle initialization mode (char)
  uint64_t    particle_mode;     // particle initialization mode (int)
  double      rho;               // attenuation factor for geometric particle distribution
  int64_t     k, m;              // determine initial horizontal and vertical velocity of
                                 // particles-- (2*k)+1 cells per time step
  double      alpha, beta;       // slope and offset values for linear particle distribution
  bbox_t      grid_patch,        // whole grid
              init_patch;        // subset of grid used for localized initialization
  int         correctness = 1;   // determines whether simulation was correct
  double      *Qgrid;            // field of fixed charges
  particle_t  *particles, *p;    // the particles array
  uint64_t    iter, i;           // dummies
  double      fx, fy, ax, ay;    // forces and accelerations
#if UNUSED
  int         particles_per_cell;// number of particles per cell to be injected
  int         error=0;           // used for graceful exit after error
#endif
  double      avg_time, pic_time;// timing parameters
  int         nthread_input,     // thread parameters
              nthread;
  int         num_error=0;       // flag that signals that requested and obtained
                                 // numbers of threads are the same
  random_draw_t dice;

  printf("Parallel Research Kernels Version %s\n", PRKVERSION);
  printf("OpenMP Particle-in-Cell execution on 2D grid\n");

  /*******************************************************************************
  ** process and test input parameters
  ********************************************************************************/

  if (argc<7) {
    printf("Usage: %s <#threads> <#simulation steps> <grid size> <#particles> <k (particle charge semi-increment)> ", argv[0]);
    printf("<m (vertical particle velocity)>\n");
    printf("          <init mode> <init parameters>]\n");
    printf("   init mode \"GEOMETRIC\"  parameters: <attenuation factor>\n");
    printf("             \"SINUSOIDAL\" parameters: none\n");
    printf("             \"LINEAR\"     parameters: <negative slope> <constant offset>\n");
    printf("             \"PATCH\"      parameters: <xleft> <xright>  <ybottom> <ytop>\n");
    exit(SUCCESS);
  }

  /* Take number of threads to request from command line */
  nthread_input = atoi(*++argv);

  if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) {
    printf("ERROR: Invalid number of threads: %d\n", nthread_input);
    exit(EXIT_FAILURE);
  }

  omp_set_num_threads(nthread_input);

  iterations = atol(*++argv);  args_used++;
  if (iterations<1) {
    printf("ERROR: Number of time steps must be positive: %" PRIu64 "\n", iterations);
    exit(FAILURE);
  }
  L = atol(*++argv);  args_used++;
  if (L<1 || L%2) {
    printf("ERROR: Number of grid cells must be positive and even: %" PRIu64 "\n", L);
    exit(FAILURE);
  }

  grid_patch = (bbox_t){0, L+1, 0, L+1};
  n = atol(*++argv);  args_used++;
  if (n<1) {
    printf("ERROR: Number of particles must be positive: %" PRIu64 "\n", n);
    exit(FAILURE);
  }

  particle_mode  = UNDEFINED;
  k = atoi(*++argv);   args_used++;
  if (k<0) {
    printf("ERROR: Particle semi-charge must be non-negative: %" PRIu64 "\n", k);
    exit(FAILURE);
  }
  m = atoi(*++argv);   args_used++;
  init_mode = *++argv; args_used++;

  /* Initialize particles with geometric distribution */
  if (strcmp(init_mode, "GEOMETRIC") == 0) {
    if (argc<args_used+1) {
      printf("ERROR: Not enough arguments for GEOMETRIC\n");
      exit(FAILURE);
    }
    particle_mode = GEOMETRIC;
    rho = atof(*++argv);   args_used++;
  }

  /* Initialize with a sinusoidal particle distribution (single period) */
  if (strcmp(init_mode, "SINUSOIDAL") == 0) {
    particle_mode = SINUSOIDAL;
  }

  /* Initialize particles with linear distribution */
  /* The linear function is f(x) = -alpha * x + beta , x in [0,1]*/
  if (strcmp(init_mode, "LINEAR") == 0) {
    if (argc<args_used+2) {
      printf("ERROR: Not enough arguments for LINEAR initialization\n");
      exit(EXIT_FAILURE);
    }
    particle_mode = LINEAR;
    alpha = atof(*++argv); args_used++;
    beta  = atof(*++argv); args_used++;
    if (beta <0 || beta<alpha) {
      printf("ERROR: linear profile gives negative particle density\n");
      exit(EXIT_FAILURE);
    }
  }

  /* Initialize particles uniformly within a "patch" */
  if (strcmp(init_mode, "PATCH") == 0) {
    if (argc<args_used+4) {
      printf("ERROR: Not enough arguments for PATCH initialization\n");
      exit(FAILURE);
    }
    particle_mode = PATCH;
    init_patch.left   = atoi(*++argv); args_used++;
    init_patch.right  = atoi(*++argv); args_used++;
    init_patch.bottom = atoi(*++argv); args_used++;
    init_patch.top    = atoi(*++argv); args_used++;
    if (bad_patch(&init_patch, &grid_patch)) {
      printf("ERROR: inconsistent initial patch\n");
      exit(FAILURE);
    }
  }

  #pragma omp parallel
  {

  #pragma omp master
  {
  nthread = omp_get_num_threads();

  if (nthread != nthread_input) {
    num_error = 1;
    printf("ERROR: number of requested threads %d does not equal ",
           nthread_input);
    printf("number of spawned threads %d\n", nthread);
  }
  else {
    printf("Number of threads              = %d\n",nthread_input);
    printf("Grid size                      = %lld\n", L);
    printf("Number of particles requested  = %lld\n", n);
    printf("Number of time steps           = %lld\n", iterations);
    printf("Initialization mode            = %s\n", init_mode);

    switch(particle_mode) {
    case GEOMETRIC: printf("  Attenuation factor           = %lf\n", rho);    break;
    case SINUSOIDAL:                                                          break;
    case LINEAR:    printf("  Negative slope               = %lf\n", alpha);
                    printf("  Offset                       = %lf\n", beta);   break;
    case PATCH:     printf("  Bounding box                 = %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "\n",
                           init_patch.left, init_patch.right,
                           init_patch.bottom, init_patch.top);                break;
    default:        printf("ERROR: Unsupported particle initializating mode\n");
                     exit(FAILURE);
    }
    printf("Particle charge semi-increment = %"PRIu64"\n", k);
    printf("Vertical velocity              = %"PRIu64"\n", m);

    /* Initialize grid of charges and particles */
    Qgrid = initializeGrid(L);

    LCG_init(&dice);
    switch(particle_mode) {
    case GEOMETRIC:  particles = initializeGeometric(n, L, rho, k, m, &n, &dice);      break;
    case SINUSOIDAL: particles = initializeSinusoidal(n, L, k, m, &n, &dice);          break;
    case LINEAR:     particles = initializeLinear(n, L, alpha, beta, k, m, &n, &dice); break;
    case PATCH:      particles = initializePatch(n, L, init_patch, k, m, &n, &dice);   break;
    default:         printf("ERROR: Unsupported particle distribution\n");  exit(FAILURE);
    }

    printf("Number of particles placed     = %lld\n", n);
  }
  }
  bail_out(num_error);
  }

  for (iter=0; iter<=iterations; iter++) {

    /* start the timer after one warm-up time step */
    if (iter==1) {
      pic_time = wtime();
    }

    /* Calculate forces on particles and update positions */
    #pragma omp parallel for private(i, p, fx, fy, ax, ay)
    for (i=0; i<n; i++) {
      p = particles;
      fx = 0.0;
      fy = 0.0;
      computeTotalForce(p[i], L, Qgrid, &fx, &fy);
      ax = fx * MASS_INV;
      ay = fy * MASS_INV;

      /* Update particle positions, taking into account periodic boundaries */
      p[i].x = fmod(p[i].x + p[i].v_x*DT + 0.5*ax*DT*DT + L, L);
      p[i].y = fmod(p[i].y + p[i].v_y*DT + 0.5*ay*DT*DT + L, L);

      /* Update velocities */
      p[i].v_x += ax * DT;
      p[i].v_y += ay * DT;
    }
  }

  pic_time = wtime() - pic_time;

  /* Run the verification test */
  for (i=0; i<n; i++) {
    correctness *= verifyParticle(particles[i], iterations, Qgrid, L);
  }

  if (correctness) {
    printf("Solution validates\n");
#ifdef VERBOSE
    printf("Simulation time is %lf seconds\n", pic_time);
#endif
    avg_time = n*iterations/pic_time;
    printf("Rate (Mparticles_moved/s): %lf\n", 1.0e-6*avg_time);
  } else {
    printf("Solution does not validate\n");
  }

  return(EXIT_SUCCESS);
}
示例#10
0
int main(int argc, char ** argv)
{
  long Block_order;        /* number of columns owned by rank       */
  long Block_size;         /* size of a single block                */
  long Colblock_size;      /* size of column block                  */
  int Tile_order=32;       /* default Tile order                    */
  int tiling;              /* boolean: true if tiling is used       */
  int Num_procs;           /* number of ranks                       */
  long order;              /* order of overall matrix               */
  int send_to, recv_from;  /* ranks with which to communicate       */
#if !SYNCHRONOUS
  MPI_Request send_req;
  MPI_Request recv_req;
#endif
  long bytes;              /* combined size of matrices             */
  int my_ID;               /* rank                                  */
  int root=0;              /* rank of root                          */
  int iterations;          /* number of times to do the transpose   */
  int i, j, it, jt, istart;/* dummies                               */
  int iter, iter_init;     /* index of iteration                    */
  int phase;               /* phase inside staged communication     */
  int colstart;            /* starting column for owning rank       */
  int error;               /* error flag                            */
  double * RESTRICT A_p;   /* original matrix column block          */
  double * RESTRICT B_p;   /* transposed matrix column block        */
  double * RESTRICT Work_in_p;/* workspace for transpose function   */
  double * RESTRICT Work_out_p;/* workspace for transpose function  */
  double abserr,           /* absolute error                        */
         abserr_tot;       /* aggregate absolute error              */
  double epsilon = 1.e-8;  /* error tolerance                       */
  double transpose_time,   /* timing parameters                     */
         avgtime;
  int    spare_ranks;      /* number of ranks to keep in reserve                  */
  int    kill_ranks;       /* number of ranks that die with each failure          */
  int    *kill_set;        /* instance of set of ranks to be killed        */
  int    kill_period;      /* average number of iterations between failures       */
  int    *fail_iter;       /* list of iterations when a failure will be triggered */
  int    fail_iter_s=0;    /* latest  */
  double init_add, addit;  /* used to offset initial solutions       */
  int    checkpointing;    /* indicates if data is restored using Fenix or
                             analytically                            */
  int    num_fenix_init=1; /* number of times Fenix_Init is called   */
  int    num_fenix_init_loc;/* number of times Fenix_Init was called */
  int    fenix_status;
  random_draw_t dice;

/*********************************************************************
** Initialize the MPI environment
*********************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

/*********************************************************************
** process, test and broadcast input parameters
*********************************************************************/
  error = 0;
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPI matrix transpose with Fenix fault tolerance: B = A^T\n");

    if (argc != 7 && argc != 8){
      printf("Usage: %s <# iterations> <matrix order> <spare ranks> ",
                                                               *argv);
      printf("<kill set size> <kill period> <checkpointing> [Tile size]\n",
                                                               *argv);
      error = 1; goto ENDOFTESTS;
    }

    iterations  = atoi(argv[1]);
    if(iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1; goto ENDOFTESTS;
    }

    order = atol(argv[2]);
    spare_ranks  = atoi(argv[3]);
    if (order < Num_procs-spare_ranks) {
      printf("ERROR: matrix order %ld should at least # procs %d\n",
             order, Num_procs-spare_ranks);
      error = 1; goto ENDOFTESTS;
    }
    if (order%(Num_procs-spare_ranks)) {
      printf("ERROR: matrix order %ld should be divisible by # procs %d\n",
             order, Num_procs-spare_ranks);
      error = 1; goto ENDOFTESTS;
    }

    if (spare_ranks < 0 || spare_ranks >= Num_procs){
      printf("ERROR: Illegal number of spare ranks : %d \n", spare_ranks);
      error = 1;
      goto ENDOFTESTS;     
    }

    kill_ranks = atoi(argv[4]);
    if (kill_ranks < 0 || kill_ranks > spare_ranks) {
      printf("ERROR: Number of ranks in kill set invalid: %d\n", kill_ranks);
      error = 1;
      goto ENDOFTESTS;     
    }

    kill_period = atoi(argv[5]);
    if (kill_period < 1) {
      printf("ERROR: rank kill period must be positive: %d\n", kill_period);
      error = 1;
      goto ENDOFTESTS;     
    }

    checkpointing = atoi(argv[6]);
    if (checkpointing) {
      printf("ERROR: Fenix checkpointing not yet implemented\n");
      error = 1;
      goto ENDOFTESTS;     
    }

    if (argc == 8) Tile_order = atoi(argv[7]);

    ENDOFTESTS:;
  }
  bail_out(error);

  /*  Broadcast input data to all ranks */
  MPI_Bcast(&order,         1, MPI_LONG, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations,    1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast(&Tile_order,    1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast(&spare_ranks,   1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&kill_ranks,    1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&kill_period,   1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&checkpointing, 1, MPI_INT, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    printf("Number of ranks       = %d\n", Num_procs);
    printf("Matrix order          = %ld\n", order);
    printf("Number of iterations  = %d\n", iterations);
    if ((Tile_order > 0) && (Tile_order < order))
          printf("Tile size             = %d\n", Tile_order);
    else  printf("Untiled\n");
#if !SYNCHRONOUS
    printf("Non-");
#endif
    printf("Blocking messages\n");
    printf("Number of spare ranks = %d\n", spare_ranks);
    printf("Kill set size         = %d\n", kill_ranks);
    printf("Fault period          = %d\n", kill_period);
    if (checkpointing)
      printf("Data recovery         = Fenix checkpointing\n");
    else
      printf("Data recovery         = analytical\n");
  }

  /* initialize the random number generator for each rank; we do that before
     starting Fenix, so that all ranks, including spares, are initialized      */
  LCG_init(&dice);
  /* compute the iterations during which errors will be incurred               */
  for (iter=0; iter<=iterations; iter++) {
    fail_iter_s += random_draw(kill_period, &dice);
    if (fail_iter_s > iterations) break;
    num_fenix_init++;
  }
  if ((num_fenix_init-1)*kill_ranks>spare_ranks) {
    if (my_ID==0) printf("ERROR: number of injected errors %d exceeds spare ranks %d\n",
                         (num_fenix_init-1)*kill_ranks, spare_ranks);
    error = 1;
  }
  else if(my_ID==root) printf("Total injected failures  = %d times %d errors\n", 
                           num_fenix_init-1, kill_ranks);
  bail_out(error);
  if ((num_fenix_init-1)*kill_ranks>=Num_procs-spare_ranks) if (my_ID==root)
  printf("WARNING: All active ranks will be replaced by recovered ranks; timings not valid\n");

  fail_iter = (int *) prk_malloc(sizeof(int)*num_fenix_init);
  if (!fail_iter) {
    printf("ERROR: Rank %d could not allocate space for array fail_iter\n", my_ID);
    error = 1;
  }
  bail_out(error);

  /* reinitialize random number generator to obtain identical error series     */
  LCG_init(&dice);
  /* now record the actual failure iterations                                  */
  for (fail_iter_s=iter=0; iter<num_fenix_init; iter++) {
    fail_iter_s += random_draw(kill_period, &dice);
    fail_iter[iter] = fail_iter_s;
  }

  /* Here is where we initialize Fenix and mark the return point after failure */
  Fenix_Init(&fenix_status, MPI_COMM_WORLD, NULL, &argc, &argv, spare_ranks, 
             0, MPI_INFO_NULL, &error);

  if (error==FENIX_WARNING_SPARE_RANKS_DEPLETED) 
    printf("ERROR: Rank %d: Cannot reconstitute original communicator\n", my_ID);
  bail_out(error);

  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

  /* if rank is recovered, set iter_init to a value beyond the last iteration
     (iterations+1), so that the MPI_MIN reduction below replaces it with the
     smallest iter value among survivor ranks; handle the number of Fenix_Init
     calls similarly                                                           */
  switch (fenix_status){
    case FENIX_ROLE_INITIAL_RANK:   iter_init = num_fenix_init_loc = 0;    break;
    case FENIX_ROLE_RECOVERED_RANK: iter_init = num_fenix_init_loc = iterations+1;   break;
    case FENIX_ROLE_SURVIVOR_RANK:  iter_init = iter;  num_fenix_init_loc++;
  }

  MPI_Allreduce(&iter_init, &iter, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
  MPI_Allreduce(&num_fenix_init_loc, &num_fenix_init, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);

  /* a non-positive tile size, or one that is not smaller than the matrix
     order, means no tiling of the local transpose                             */
  tiling = (Tile_order > 0) && (Tile_order < order);
  bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks that are mapped one to a
** rank.  Each column block is made up of Num_procs smaller square
** blocks of order block_order.
*********************************************************************/

  Block_order    = order/Num_procs;
  colstart       = Block_order * my_ID;
  Colblock_size  = order * Block_order;
  Block_size     = Block_order * Block_order;

/*********************************************************************
** Create the column block of the test matrix, the row block of the
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/
  if (fenix_status != FENIX_ROLE_SURVIVOR_RANK) {
    A_p = (double *)prk_malloc(Colblock_size*sizeof(double));
    if (A_p == NULL){
      printf(" Error allocating space for original matrix on node %d\n",my_ID);
      error = 1;
    }
  }
  bail_out(error);

  if (fenix_status != FENIX_ROLE_SURVIVOR_RANK) {
    B_p = (double *)prk_malloc(Colblock_size*sizeof(double));
    if (B_p == NULL){
      printf(" Error allocating space for transpose matrix on node %d\n",my_ID);
      error = 1;
    }
  }
  bail_out(error);

  if (fenix_status != FENIX_ROLE_SURVIVOR_RANK && Num_procs>1) {
    Work_in_p   = (double *)prk_malloc(2*Block_size*sizeof(double));
    if (Work_in_p == NULL){
      printf(" Error allocating space for work on node %d\n",my_ID);
      error = 1;
    }
    Work_out_p = Work_in_p + Block_size;
  }
  bail_out(error);

  /* Fill the original column matrix                                                */
  /* initialize the input and output arrays. Note that if we use the analytical
     solution to initialize, one might be tempted to skip this step for survivor
     ranks, because they already have the correct (interim) values. That would
     be wrong for two reasons: it is possible for ranks to be in different time
     steps at the same time, and it is possible that error signal delivery to
     a rank is delayed.                                                        */
  if (checkpointing) {
    init_add = 0.0;
    addit    = 0.0;
  }
  else {
    init_add = (double) iter;
    addit = ((double)(iter-1) * (double) (iter))/2.0;
  }
  istart = 0;
  for (j=0;j<Block_order;j++)
    for (i=0;i<order; i++)  {
      A(i,j) = (double) (order*(j+colstart) + i) + init_add;
      B(i,j) = ((double) ((j+colstart) + order*i)*iter + addit);
  }

  for (; iter<=iterations; iter++){

    /* start timer after a warmup iteration                                        */
    if (iter == 1) {
      MPI_Barrier(MPI_COMM_WORLD);
      transpose_time = wtime();
    }

    /* inject failure if appropriate                                                */
    if (iter == fail_iter[num_fenix_init]) {
      pid_t pid = getpid();
      if (my_ID < kill_ranks) {
#if VERBOSE
        printf("Rank %d, pid %d commits suicide in iter %d\n", my_ID, pid, iter);
#endif
        kill(pid, SIGKILL);
      }
#if VERBOSE
      else printf("Rank %d, pid %d is survivor rank in iter %d\n", my_ID, pid, iter);
#endif
    }

    time_step(Block_order, Block_size, Colblock_size, Tile_order, tiling,
              Num_procs, order, my_ID, colstart, A_p, B_p, Work_in_p, Work_out_p);

  } /* end of iterations */

  MPI_Barrier(MPI_COMM_WORLD);
  transpose_time = wtime() - transpose_time;;

  abserr = 0.0;
  istart = 0;
  addit = ((double)(iterations+1) * (double) (iterations))/2.0;
  for (j=0;j<Block_order;j++) for (i=0;i<order; i++) {
      abserr += ABS(B(i,j) - (double)((order*i + j+colstart)*(iterations+1)+addit));
  }

  root = Num_procs-1;
  MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    if (abserr_tot < epsilon) {
      printf("Solution validates\n");
      avgtime = transpose_time/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
#if VERBOSE
      printf("Summed errors: %f \n", abserr);
#endif
    }
    else {
      printf("ERROR: Aggregate squared error %lf exceeds threshold %e\n", abserr, epsilon);
      error = 1;
    }
  }

  bail_out(error);

  Fenix_Finalize();
  MPI_Finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */