Example #1
int main(int argc, char ** argv)
{
  int Block_order;
  size_t Block_size;
  size_t Colblock_size;
  int Tile_order=32;
  int tiling;
  int Num_procs;     /* Number of ranks                                          */
  int order;         /* overall matrix order                                     */
  int send_to, recv_from; /* communicating ranks                                 */
  size_t bytes;      /* total amount of data to be moved                         */
  int my_ID;         /* rank                                                     */
  int root=0;        /* root rank of a communicator                              */
  int iterations;    /* number of times to run the pipeline algorithm            */
  int i, j, it, jt, ID;/* dummies                                                */
  int iter;          /* index of iteration                                       */
  int phase;         /* phase in the staged communication                        */
  size_t colstart;   /* sequence number of first column owned by calling rank    */
  int error=0;       /* error flag                                               */
  double *A_p;       /* original matrix column block                             */
  double *B_p;       /* transposed matrix column block                           */
  double *Work_in_p; /* workspace for the transpose function                     */
  double *Work_out_p;/* workspace for the transpose function                     */
  double abserr, abserr_tot; /* computed error                                   */
  double epsilon = 1.e-8; /* error tolerance                                     */
  double local_trans_time, /* timing parameters                                  */
         trans_time,
         avgtime;
  MPI_Status status; /* completion status of message                             */
  MPI_Win shm_win_A; /* Shared Memory window object                              */
  MPI_Win shm_win_B; /* Shared Memory window object                              */
  MPI_Win shm_win_Work_in; /* Shared Memory window object                        */
  MPI_Win shm_win_Work_out; /* Shared Memory window object                       */
  MPI_Info rma_winfo;/* info for window                                          */
  MPI_Comm shm_comm_prep;/* Shared Memory prep Communicator                      */
  MPI_Comm shm_comm; /* Shared Memory Communicator                               */
  int shm_procs;     /* # of ranks in shared domain                              */
  int shm_ID;        /* MPI rank within coherence domain                         */
  int group_size;    /* number of ranks per shared memory group                  */
  int Num_groups;    /* number of shared memory groups                           */
  int group_ID;      /* sequence number of shared memory group                   */
  int size_mul;      /* size multiplier; 0 for non-root ranks in coherence domain*/
  int istart;
  MPI_Request send_req, recv_req;

/*********************************************************************************
** Initialize the MPI environment
**********************************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

  root = 0;

/*********************************************************************
** process, test and broadcast input parameters
*********************************************************************/

  if (my_ID == root){
    if (argc != 4 && argc !=5){
      printf("Usage: %s  <#ranks per coherence domain> <# iterations> <matrix order> [tile size]\n", 
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }

    group_size = atoi(*++argv);
    if (group_size < 1) {
      printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size);
      error = 1;
      goto ENDOFTESTS;
    } 
    if (Num_procs%group_size) {
      printf("ERROR: toal # %d ranks not divisible by ranks per coherence domain %d\n",
	     Num_procs, group_size);
      error = 1;
      goto ENDOFTESTS;
    } 

    iterations = atoi(*++argv);
    if (iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;
    } 

    order = atoi(*++argv);
    if (order < Num_procs) {
      printf("ERROR: matrix order %d should at least # procs %d\n", 
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    if (order%Num_procs) {
      printf("ERROR: matrix order %d should be divisible by # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    if (argc == 5) Tile_order = atoi(*++argv);

    ENDOFTESTS:;
  }
  bail_out(error); 

  /*  Broadcast input data to all ranks */
  MPI_Bcast(&order,      1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&Tile_order, 1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPI+SHM Matrix transpose: B = A^T\n");
    printf("Number of ranks      = %d\n", Num_procs);
    printf("Rank group size      = %d\n", group_size);
    printf("Matrix order         = %d\n", order);
    printf("Number of iterations = %d\n", iterations);
    if ((Tile_order > 0) && (Tile_order < order))
       printf("Tile size            = %d\n", Tile_order);
    else  printf("Untiled\n");
#ifndef SYNCHRONOUS
    printf("Non-");
#endif
    printf("Blocking messages\n");
  }

  /* Setup for Shared memory regions */

  /* first divide WORLD in groups of size group_size */
  MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep);
  /* derive from that a SHM communicator */
  MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm);
  MPI_Comm_rank(shm_comm, &shm_ID);
  MPI_Comm_size(shm_comm, &shm_procs);
  /* do sanity check, making sure groups did not shrink in second comm split */
  if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666);

  /* a non-positive tile size means no tiling of the local transpose */
  tiling = (Tile_order > 0) && (Tile_order < order);
  bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks that are mapped one to a 
** rank.  Each column block is made up of Num_procs smaller square 
** blocks of order block_order.
*********************************************************************/
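/* Note: A(i,j), B(i,j), Work_in(...) and Work_out(...) used below are presumably
** indexing macros defined elsewhere in the PRK source (not shown here); they
** address the local column blocks, with istart/colstart supplying the offset
** into the global column index space.                                        */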

  Num_groups     = Num_procs/group_size;
  Block_order    = order/Num_groups;

  group_ID       = my_ID/group_size;
  colstart       = Block_order * group_ID;
  Colblock_size  = order * Block_order;
  Block_size     = Block_order * Block_order;

/*********************************************************************
** Create the column block of the test matrix, the column block of the 
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/

  /* RMA win info */
  MPI_Info_create(&rma_winfo);
  /* This key indicates that passive target RMA will not be used.
   * It is the one info key that MPICH actually uses for optimization. */
  MPI_Info_set(rma_winfo, "no_locks", "true");

  /* only the root of each SHM domain specifies window of nonzero size */
  size_mul = (shm_ID==0);  
  int offset = 32;
  MPI_Aint size= (Colblock_size+offset)*sizeof(double)*size_mul; int disp_unit;
  MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, 
                          (void *) &A_p, &shm_win_A);
  MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_A);
  MPI_Win_shared_query(shm_win_A, MPI_PROC_NULL, &size, &disp_unit, (void *)&A_p);

  if (A_p == NULL){
    printf(" Error allocating space for original matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);
  A_p += offset;

  /* recompute memory size (overwritten by prior query)              */
  size= (Colblock_size+offset)*sizeof(double)*size_mul; 
  MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, 
                          (void *) &B_p, &shm_win_B);
  MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_B);
  MPI_Win_shared_query(shm_win_B, MPI_PROC_NULL, &size, &disp_unit, (void *)&B_p);
  if (B_p == NULL){
    printf(" Error allocating space for transposed matrix by group %d\n",group_ID);
    error = 1;
  }
  bail_out(error);
  B_p += offset;

  if (Num_groups>1) {

    size = Block_size*sizeof(double)*size_mul;
    MPI_Win_allocate_shared(size, sizeof(double),rma_winfo, shm_comm, 
                           (void *) &Work_in_p, &shm_win_Work_in);
    MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_Work_in);
    MPI_Win_shared_query(shm_win_Work_in, MPI_PROC_NULL, &size, &disp_unit, 
                         (void *)&Work_in_p);
    if (Work_in_p == NULL){
      printf(" Error allocating space for in block by group %d\n",group_ID);
      error = 1;
    }
    bail_out(error);

    /* recompute memory size (overwritten by prior query)              */
    size = Block_size*sizeof(double)*size_mul;
    MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, 
                            shm_comm, (void *) &Work_out_p, &shm_win_Work_out);
    MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_Work_out);
    MPI_Win_shared_query(shm_win_Work_out, MPI_PROC_NULL, &size, &disp_unit, 
                         (void *)&Work_out_p);
    if (Work_out_p == NULL){
      printf(" Error allocating space for out block by group %d\n",group_ID);
      error = 1;
    }
    bail_out(error);
  }

  /* Fill the original column matrix                                             */
  istart = 0;  
  int chunk_size = Block_order/group_size;
  if (tiling) {
    for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j+=Tile_order) {
      for (i=0; i<order; i+=Tile_order)
        for (jt=j; jt<MIN((shm_ID+1)*chunk_size,j+Tile_order); jt++)
          for (it=i; it<MIN(order,i+Tile_order); it++) {
            A(it,jt) = (double) ((double)order*(jt+colstart) + it);
            B(it,jt) = -1.0;
          }
    }
  }
  else {
    for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j++) 
      for (i=0;i<order; i++) {
        A(i,j) = (double)((double)order*(j+colstart) + i);
        B(i,j) = -1.0;
      }
  }
  /* NEED A STORE FENCE HERE                                                     */
  MPI_Win_sync(shm_win_A);
  MPI_Win_sync(shm_win_B);
  MPI_Barrier(shm_comm);
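  /* Explanatory note: for an MPI-3 shared-memory window, MPI_Win_sync acts as
     a memory barrier that makes the stores to A and B visible to the other
     ranks in the coherence domain; the MPI_Barrier that follows then orders
     those ranks before anyone reads the shared data.                         */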

  for (iter=0; iter<=iterations; iter++) {

    /* start timer after a warmup iteration */
    if (iter == 1) { 
      MPI_Barrier(MPI_COMM_WORLD);
      local_trans_time = wtime();
    }

    /* do the local transpose                                                    */
    istart = colstart; 
    if (!tiling) {
      for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) {
        for (j=0; j<Block_order; j++)
          B(j,i) = A(i,j);
      }
    }
    else {
      for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order) {
        for (j=0; j<Block_order; j+=Tile_order) 
          for (it=i; it<MIN(Block_order,i+Tile_order); it++)
            for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
              B(jt,it) = A(it,jt);
            }
      }
    }

    for (phase=1; phase<Num_groups; phase++){
      recv_from = ((group_ID + phase             )%Num_groups);
      send_to   = ((group_ID - phase + Num_groups)%Num_groups);
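      /* Explanatory note: in phase p, group g sends to group (g-p) mod
         Num_groups and receives from group (g+p) mod Num_groups, so each
         group has a distinct partner per phase and every off-diagonal block
         is exchanged exactly once over the Num_groups-1 phases.              */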

      istart = send_to*Block_order; 
      if (!tiling) {
        for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) 
          for (j=0; j<Block_order; j++) {
            Work_out(j,i) = A(i,j);
          }
      }
      else {
        for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order)
          for (j=0; j<Block_order; j+=Tile_order) 
            for (it=i; it<MIN(Block_order,i+Tile_order); it++)
              for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
                Work_out(jt,it) = A(it,jt);
              }
      }

      /* NEED A LOAD/STORE FENCE HERE                                            */
      MPI_Win_sync(shm_win_Work_in);
      MPI_Win_sync(shm_win_Work_out);
      MPI_Barrier(shm_comm);
      if (shm_ID==0) {
#ifndef SYNCHRONOUS  
        /* if we place the Irecv outside this block, it would not be
           protected by a local barrier, which creates a race                    */
        MPI_Irecv(Work_in_p, Block_size, MPI_DOUBLE, 
                  recv_from*group_size, phase, MPI_COMM_WORLD, &recv_req);  
        MPI_Isend(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size,
                  phase, MPI_COMM_WORLD, &send_req);
        MPI_Wait(&recv_req, &status);
        MPI_Wait(&send_req, &status);
#else
        MPI_Sendrecv(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size,
                     phase, Work_in_p, Block_size, MPI_DOUBLE,
                     recv_from*group_size, phase, MPI_COMM_WORLD, &status);
#endif
      }
      /* NEED A LOAD FENCE HERE                                                  */ 
      MPI_Win_sync(shm_win_Work_in);
      MPI_Win_sync(shm_win_Work_out);
      MPI_Barrier(shm_comm);

      istart = recv_from*Block_order; 
      /* scatter received block to transposed matrix; no need to tile */
      for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++)
        for (i=0; i<Block_order; i++) 
          B(i,j) = Work_in(i,j);

    }  /* end of phase loop  */
  } /* end of iterations */

  local_trans_time = wtime() - local_trans_time;
  MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  abserr = 0.0;
  istart = 0;
  /*  for (j=shm_ID;j<Block_order;j+=group_size) for (i=0;i<order; i++) { */
  for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++)
    for (i=0;i<order; i++) { 
      abserr += ABS(B(i,j) - (double)((double)order*i + j+colstart));
    }

  MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    if (abserr_tot < epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
#ifdef VERBOSE
      printf("Summed errors: %f \n", abserr_tot);
#endif
    }
    else {
      printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr_tot, epsilon);
      error = 1;
    }
  }

  bail_out(error);

  MPI_Win_unlock_all(shm_win_A);
  MPI_Win_unlock_all(shm_win_B);

  MPI_Win_free(&shm_win_A);
  MPI_Win_free(&shm_win_B);
  if (Num_groups>1) {
      MPI_Win_unlock_all(shm_win_Work_in);
      MPI_Win_unlock_all(shm_win_Work_out);
      MPI_Win_free(&shm_win_Work_in);
      MPI_Win_free(&shm_win_Work_out);
  }

  MPI_Info_free(&rma_winfo);

  MPI_Finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */
Example #2
File: transpose.c  Project: LaHaine/ohpc
int main(int argc, char ** argv)
{
  int Block_order;         /* number of columns owned by rank       */
  int Block_size;          /* size of a single block                */
  int Colblock_size;       /* size of column block                  */
  int Tile_order=32;       /* default Tile order                    */
  int tiling;              /* boolean: true if tiling is used       */
  int Num_procs;           /* number of ranks                       */
  int order;               /* order of overall matrix               */
  int send_to, recv_from;  /* ranks with which to communicate       */
  MPI_Status status;       
#ifndef SYNCHRONOUS
  MPI_Request send_req;
  MPI_Request recv_req;
#endif
  long bytes;              /* combined size of matrices             */
  int my_ID;               /* rank                                  */
  int root=0;              /* ID of root rank                       */
  int iterations;          /* number of times to do the transpose   */
  int i, j, it, jt, istart;/* dummies                               */
  int iter;                /* index of iteration                    */
  int phase;               /* phase inside staged communication     */
  int colstart;            /* starting column for owning rank       */
  int nthread_input,       /* thread parameters                     */
      nthread; 
  int error;               /* error flag                            */
  double *A_p;             /* original matrix column block          */
  double *B_p;             /* transposed matrix column block        */
  double *Work_in_p;       /* workspace for the transpose function  */
  double *Work_out_p;      /* workspace for the transpose function  */
  double abserr,           /* absolute error                        */
         abserr_tot;       /* aggregate absolute error              */
  double epsilon = 1.e-8;  /* error tolerance                       */
  double local_trans_time, /* timing parameters                     */
         trans_time,
         avgtime;

/*********************************************************************
** Initialize the MPI environment
*********************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

/*********************************************************************
** process, test and broadcast input parameters
*********************************************************************/
  error = 0;
  if (my_ID == root) {
    if (argc != 4 && argc != 5){
      printf("Usage: %s <#threads><#iterations> <matrix order> [Tile size]\n",
                                                               *argv);
      error = 1; goto ENDOFTESTS;
    }

    /* Take number of threads to request from command line */
    nthread_input = atoi(*++argv); 
    if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) {
      printf("ERROR: Invalid number of threads: %d\n", nthread_input);
      error = 1; 
      goto ENDOFTESTS; 
    }

    iterations  = atoi(*++argv);
    if(iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1; goto ENDOFTESTS;
    }

    order = atoi(*++argv);
    if (order < Num_procs) {
      printf("ERROR: matrix order %d should at least # procs %d\n", 
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }
    if (order%Num_procs) {
      printf("ERROR: matrix order %d should be divisible by # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    if (argc == 5) Tile_order = atoi(*++argv);

    ENDOFTESTS:;
  }
  bail_out(error);
  /*  Broadcast input data to all ranks */
  MPI_Bcast(&order,         1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations,    1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&Tile_order,    1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&nthread_input, 1, MPI_INT, root, MPI_COMM_WORLD);

  omp_set_num_threads(nthread_input);

  /* a non-positive tile size means no tiling of the local transpose */
  tiling = (Tile_order > 0) && (Tile_order < order);

  if (my_ID == root) {
    printf("MPI+OpenMP matrix transpose: B = A^T\n");
    printf("Number of ranks      = %d\n", Num_procs);
    printf("Number of threads    = %d\n", omp_get_max_threads());
    printf("Matrix order         = %d\n", order);
    printf("Number of iterations = %d\n", iterations);
    if (tiling) {
      printf("Tile size            = %d\n", Tile_order);
#ifdef COLLAPSE
      printf("Using loop collapse\n");
#endif
    }
    else  printf("Untiled\n");
#ifndef SYNCHRONOUS
    printf("Non-");
#endif
    printf("Blocking messages\n");
  }

  bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks that are mapped one to a 
** rank.  Each column block is made up of Num_procs smaller square 
** blocks of order block_order.
*********************************************************************/

  Block_order    = order/Num_procs;
  colstart       = Block_order * my_ID;
  Colblock_size  = order * Block_order;
  Block_size     = Block_order * Block_order;

/*********************************************************************
** Create the column block of the test matrix, the row block of the 
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/
  A_p = (double *)malloc(Colblock_size*sizeof(double));
  if (A_p == NULL){
    printf(" Error allocating space for original matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  B_p = (double *)malloc(Colblock_size*sizeof(double));
  if (B_p == NULL){
    printf(" Error allocating space for transpose matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  if (Num_procs>1) {
    Work_in_p   = (double *)malloc(2*Block_size*sizeof(double));
    if (Work_in_p == NULL){
      printf(" Error allocating space for work on node %d\n",my_ID);
      error = 1;
    }
    bail_out(error);
    Work_out_p = Work_in_p + Block_size;
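    /* Note: a single allocation of 2*Block_size doubles is split in two:
       the first half receives incoming blocks (Work_in_p), the second half
       stages outgoing blocks (Work_out_p).                                   */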
  }
  
  /* Fill the original column matrix                                                */
  istart = 0;  

  if (tiling) {
#ifdef COLLAPSE
    #pragma omp parallel for private (i,it,jt) collapse(2)
#else
    #pragma omp parallel for private (i,it,jt)
#endif
    for (j=0; j<Block_order; j+=Tile_order) 
      for (i=0; i<order; i+=Tile_order) 
        for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) 
          for (it=i; it<MIN(order,i+Tile_order); it++) {
            A(it,jt) = (double) (order*(jt+colstart) + it);
            B(it,jt) = -1.0;
          }
  }
  else {
    #pragma omp parallel for private (i)
    for (j=0;j<Block_order;j++) 
      for (i=0;i<order; i++) {
        A(i,j) = (double) (order*(j+colstart) + i);
        B(i,j) = -1.0;
    }
  }

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration                                        */
    if (iter == 1) { 
      MPI_Barrier(MPI_COMM_WORLD);
      local_trans_time = wtime();
    }

    /* do the local transpose                                                       */
    istart = colstart; 
    if (!tiling) {
    #pragma omp parallel for private (j)
      for (i=0; i<Block_order; i++) 
        for (j=0; j<Block_order; j++) {
          B(j,i) = A(i,j);
        }
    }
    else {
#ifdef COLLAPSE
      #pragma omp parallel for private (j,it,jt) collapse(2)
#else
      #pragma omp parallel for private (j,it,jt)
#endif
      for (i=0; i<Block_order; i+=Tile_order) 
        for (j=0; j<Block_order; j+=Tile_order) 
          for (it=i; it<MIN(Block_order,i+Tile_order); it++)
            for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
              B(jt,it) = A(it,jt);
            }
    }

    for (phase=1; phase<Num_procs; phase++){
      recv_from = (my_ID + phase            )%Num_procs;
      send_to   = (my_ID - phase + Num_procs)%Num_procs;
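      /* Explanatory note: in phase p, rank r sends to (r-p) mod Num_procs and
         receives from (r+p) mod Num_procs, so each rank has a distinct
         partner per phase and every off-diagonal block moves exactly once.   */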

#ifndef SYNCHRONOUS
      MPI_Irecv(Work_in_p, Block_size, MPI_DOUBLE, 
                recv_from, phase, MPI_COMM_WORLD, &recv_req);  
#endif

      istart = send_to*Block_order; 
      if (!tiling) {
        #pragma omp parallel for private (j)
        for (i=0; i<Block_order; i++) 
          for (j=0; j<Block_order; j++) {
            Work_out(j,i) = A(i,j);
          }
      }
      else {
#ifdef COLLAPSE
        #pragma omp parallel for private (j,it,jt) collapse(2)
#else
        #pragma omp parallel for private (j,it,jt)
#endif
        for (i=0; i<Block_order; i+=Tile_order) 
          for (j=0; j<Block_order; j+=Tile_order) 
            for (it=i; it<MIN(Block_order,i+Tile_order); it++)
              for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
                Work_out(jt,it) = A(it,jt);
              }
      }

#ifndef SYNCHRONOUS  
      MPI_Isend(Work_out_p, Block_size, MPI_DOUBLE, send_to,
                phase, MPI_COMM_WORLD, &send_req);
      MPI_Wait(&recv_req, &status);
      MPI_Wait(&send_req, &status);
#else
      MPI_Sendrecv(Work_out_p, Block_size, MPI_DOUBLE, send_to, phase,
                   Work_in_p, Block_size, MPI_DOUBLE,
                   recv_from, phase, MPI_COMM_WORLD, &status);
#endif

      istart = recv_from*Block_order; 
      /* scatter received block to transposed matrix; no need to tile */
      #pragma omp parallel for private (i)
      for (j=0; j<Block_order; j++)
        for (i=0; i<Block_order; i++) {
          B(i,j) = Work_in(i,j);
        }

    }  /* end of phase loop  */
  } /* end of iterations */

  local_trans_time = wtime() - local_trans_time;
  MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  abserr = 0.0;
  istart = 0;
  #pragma omp parallel for private (i) reduction(+:abserr)
  for (j=0;j<Block_order;j++) for (i=0;i<order; i++) {
      abserr += ABS(B(i,j) - (double)(order*i + j+colstart));
  }

  MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    if (abserr_tot < epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
#ifdef VERBOSE
      printf("Summed errors: %f \n", abserr);
#endif
    }
    else {
      printf("ERROR: Aggregate squared error %lf exceeds threshold %e\n", abserr_tot, epsilon);
      error = 1;
    }
  }

  bail_out(error);

  MPI_Finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */
Example #3
int main(int argc, char ** argv)
{
  long Block_order;        /* number of columns owned by rank       */
  long Block_size;         /* size of a single block                */
  long Colblock_size;      /* size of column block                  */
  int Tile_order=32;       /* default Tile order                    */
  int tiling;              /* boolean: true if tiling is used       */
  int Num_procs;           /* number of ranks                       */
  long order;              /* order of overall matrix               */
  int send_to, recv_from;  /* ranks with which to communicate       */
  long bytes;              /* combined size of matrices             */
  int my_ID;               /* rank                                  */
  int root=0;              /* rank of root                          */
  int iterations;          /* number of times to do the transpose   */
  int i, j, it, jt, istart;/* dummies                               */
  int iter;                /* index of iteration                    */
  int phase;               /* phase inside staged communication     */
  int colstart;            /* starting column for owning rank       */
  int error;               /* error flag                            */
  double RESTRICT *A_p;    /* original matrix column block          */
  double RESTRICT *B_p;    /* transposed matrix column block        */
  double RESTRICT *Work_in_p;/* workspace for transpose function    */
  double RESTRICT *Work_out_p;/* workspace for transpose function   */
  double abserr,           /* absolute error                        */
         abserr_tot;       /* aggregate absolute error              */
  double epsilon = 1.e-8;  /* error tolerance                       */
  double local_trans_time, /* timing parameters                     */
         trans_time,
         avgtime;
  MPI_Win  rma_win = MPI_WIN_NULL;
  MPI_Info rma_winfo = MPI_INFO_NULL;
  int passive_target = 0;  /* use passive target RMA sync           */
#if MPI_VERSION >= 3
  int  flush_local  = 1;   /* flush local (or remote) after put     */
  int  flush_bundle = 1;   /* flush every <bundle> put calls        */
#endif

/*********************************************************************
** Initialize the MPI environment
*********************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

/*********************************************************************
** process, test and broadcast input parameters
*********************************************************************/
  error = 0;
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPIRMA matrix transpose: B = A^T\n");

    if (argc <= 3){
      printf("Usage: %s <# iterations> <matrix order> [Tile size]"
             "[sync (0=fence, 1=flush)] [flush local?] [flush bundle]\n",
             *argv);
      error = 1; goto ENDOFTESTS;
    }

    iterations  = atoi(*++argv);
    if(iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1; goto ENDOFTESTS;
    }

    order = atol(*++argv);
    if (order < Num_procs) {
      printf("ERROR: matrix order %ld should at least # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }
    if (order%Num_procs) {
      printf("ERROR: matrix order %ld should be divisible by # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    if (argc >= 4) Tile_order     = atoi(*++argv);
    if (argc >= 5) passive_target = atoi(*++argv);
#if MPI_VERSION >= 3
    if (argc >= 6) flush_local    = atoi(*++argv);
    if (argc >= 7) flush_bundle   = atoi(*++argv);
#endif

    ENDOFTESTS:;
  }
  bail_out(error);

  if (my_ID == root) {
    printf("Number of ranks      = %d\n", Num_procs);
    printf("Matrix order         = %ld\n", order);
    printf("Number of iterations = %d\n", iterations);
    if ((Tile_order > 0) && (Tile_order < order))
          printf("Tile size            = %d\n", Tile_order);
    else  printf("Untiled\n");
    if (passive_target) {
#if MPI_VERSION < 3
        printf("Synchronization      = MPI_Win_(un)lock\n");
#else
        printf("Synchronization      = MPI_Win_flush%s (bundle=%d)\n", flush_local ? "_local" : "", flush_bundle);
#endif
    } else {
        printf("Synchronization      = MPI_Win_fence\n");
    }
  }

  /*  Broadcast input data to all ranks */
  MPI_Bcast (&order,          1, MPI_LONG, root, MPI_COMM_WORLD);
  MPI_Bcast (&iterations,     1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast (&Tile_order,     1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast (&passive_target, 1, MPI_INT,  root, MPI_COMM_WORLD);
#if MPI_VERSION >= 3
  MPI_Bcast (&flush_local,    1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast (&flush_bundle,   1, MPI_INT,  root, MPI_COMM_WORLD);
#endif

  /* a non-positive tile size means no tiling of the local transpose */
  tiling = (Tile_order > 0) && (Tile_order < order);
  bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks that are mapped one to a
** rank.  Each column block is made up of Num_procs smaller square
** blocks of order block_order.
*********************************************************************/

  Block_order    = order/Num_procs;
  colstart       = Block_order * my_ID;
  Colblock_size  = order * Block_order;
  Block_size     = Block_order * Block_order;

  /* debug message size effects */
  if (my_ID == root) {
    printf("Block_size           = %ld\n", Block_size);
  }

/*********************************************************************
** Create the column block of the test matrix, the row block of the
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/
  A_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (A_p == NULL){
    printf(" Error allocating space for original matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  MPI_Info_create (&rma_winfo);
  MPI_Info_set (rma_winfo, "no_locks", "true");
  B_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (B_p == NULL){
    printf(" Error allocating space for transpose matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  if (Num_procs>1) {
    Work_out_p = (double *) prk_malloc(Block_size*(Num_procs-1)*sizeof(double));
    if (Work_out_p == NULL){
      printf(" Error allocating space for work_out on node %d\n",my_ID);
      error = 1;
    }
    bail_out(error);

    PRK_Win_allocate(Block_size*(Num_procs-1)*sizeof(double), sizeof(double),
                     rma_winfo, MPI_COMM_WORLD, &Work_in_p, &rma_win);
    if (Work_in_p == NULL){
      printf(" Error allocating space for work on node %d\n",my_ID);
      error = 1;
    }
    bail_out(error);
  }

#if MPI_VERSION >= 3
  if (passive_target && Num_procs>1) {
    MPI_Win_lock_all(MPI_MODE_NOCHECK,rma_win);
  }
#endif

  /* Fill the original column matrix                                                */
  istart = 0;
  for (j=0;j<Block_order;j++) {
    for (i=0;i<order; i++) {
      A(i,j) = (double) (order*(j+colstart) + i);
      B(i,j) = 0.0;
    }
  }

  MPI_Barrier(MPI_COMM_WORLD);

  for (iter = 0; iter<=iterations; iter++) {

    /* start timer after a warmup iteration                                        */
    if (iter == 1) {
      MPI_Barrier(MPI_COMM_WORLD);
      local_trans_time = wtime();
    }

    /* do the local transpose                                                     */
    istart = colstart;
    if (!tiling) {
      for (i=0; i<Block_order; i++) {
        for (j=0; j<Block_order; j++) {
          B(j,i) += A(i,j);
          A(i,j) += 1.0;
        }
      }
    } else {
      for (i=0; i<Block_order; i+=Tile_order) {
        for (j=0; j<Block_order; j+=Tile_order) {
          for (it=i; it<MIN(Block_order,i+Tile_order); it++) {
            for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
              B(jt,it) += A(it,jt);
              A(it,jt) += 1.0;
            }
          }
        }
      }
    }

    if (!passive_target && Num_procs>1) {
      MPI_Win_fence(MPI_MODE_NOSTORE | MPI_MODE_NOPRECEDE, rma_win);
    }

    for (phase=1; phase<Num_procs; phase++){
      send_to = (my_ID - phase + Num_procs)%Num_procs;

      istart = send_to*Block_order;
      if (!tiling) {
        for (i=0; i<Block_order; i++) {
          for (j=0; j<Block_order; j++) {
            Work_out(phase-1,j,i) = A(i,j);
            A(i,j) += 1.0;
          }
        }
      } else {
        for (i=0; i<Block_order; i+=Tile_order) {
          for (j=0; j<Block_order; j+=Tile_order) {
            for (it=i; it<MIN(Block_order,i+Tile_order); it++) {
              for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
                Work_out(phase-1,jt,it) = A(it,jt);
                A(it,jt) += 1.0;
              }
            }
          }
        }
      }

#if MPI_VERSION < 3
      if (passive_target) {
          MPI_Win_lock(MPI_LOCK_SHARED, send_to, MPI_MODE_NOCHECK, rma_win);
      }
#endif
      MPI_Put(Work_out_p+Block_size*(phase-1), Block_size, MPI_DOUBLE, send_to,
              Block_size*(phase-1), Block_size, MPI_DOUBLE, rma_win);
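      /* Explanatory note: the put lands at displacement Block_size*(phase-1)
         in the target window; the rank that sends to us in our phase p is the
         same rank we read from in phase p, so slot p-1 on the receiving side
         holds the block scattered below as Work_in(phase-1,i,j).             */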

      if (passive_target) {
#if MPI_VERSION < 3
        MPI_Win_unlock(send_to, rma_win);
#else
        if (flush_bundle==1) {
          if (flush_local==1) {
              MPI_Win_flush_local(send_to, rma_win);
          } else {
              MPI_Win_flush(send_to, rma_win);
          }
        } else if ( (phase%flush_bundle) == 0) {
          /* Too lazy to record all targets, so let MPI do it internally (hopefully) */
          if (flush_local==1) {
              MPI_Win_flush_local_all(rma_win);
          } else {
              MPI_Win_flush_all(rma_win);
          }
        }
#endif
      }
    }  /* end of phase loop for puts  */
    if (Num_procs>1) {
      if (passive_target) {
#if MPI_VERSION >= 3
          MPI_Win_flush_all(rma_win);
#endif
          MPI_Barrier(MPI_COMM_WORLD);
      } else {
          MPI_Win_fence(MPI_MODE_NOSTORE, rma_win);
      }
    }

    for (phase=1; phase<Num_procs; phase++) {
      recv_from = (my_ID + phase)%Num_procs;
      istart = recv_from*Block_order;
      /* scatter received block to transposed matrix; no need to tile */
      for (j=0; j<Block_order; j++) {
        for (i=0; i<Block_order; i++) {
          B(i,j) += Work_in(phase-1,i,j);
        }
      }
    } /* end of phase loop for scatters */

    /* for the flush case we need to make sure we have consumed Work_in
       before overwriting it in the next iteration                    */
    if (Num_procs>1 && passive_target) {
      MPI_Barrier(MPI_COMM_WORLD);
    }

  } /* end of iterations */

  local_trans_time = wtime() - local_trans_time;
  MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  abserr = 0.0;
  istart = 0;
  double addit = ((double)(iterations+1) * (double) (iterations))/2.0;
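  /* Explanatory note: each iteration does B += A and then A += 1, so after
     iterations+1 rounds B(i,j) = (iterations+1)*A0 + (0+1+...+iterations),
     where A0 is the initial value of the transposed element and the series
     sum is addit.                                                            */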
  for (j=0;j<Block_order;j++) {
    for (i=0;i<order; i++) {
      abserr += ABS(B(i,j) - ((double)(order*i + j+colstart)*(iterations+1)+addit));
    }
  }

  MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    if (abserr_tot < epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
    }
    else {
      printf("ERROR: Aggregate absolute error %lf exceeds threshold %e\n", abserr_tot, epsilon);
      error = 1;
    }
  }

  bail_out(error);

  if (rma_win!=MPI_WIN_NULL) {
#if MPI_VERSION >=3
    if (passive_target) {
      MPI_Win_unlock_all(rma_win);
    }
#endif
    PRK_Win_free(&rma_win);
  }

  MPI_Finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */
Example #4
int main(int argc, char ** argv)
{
  long Block_order;        /* number of columns owned by rank       */
  int Block_size;          /* size of a single block                */
  int Colblock_size;       /* size of column block                  */
  int Tile_order=32;       /* default Tile order                    */
  int tiling;              /* boolean: true if tiling is used       */
  int Num_procs;           /* number of ranks                       */
  int order;               /* order of overall matrix               */
  int bufferCount;         /* number of input buffers               */
  int targetBuffer;        /* buffer with which to communicate      */
  int send_to, recv_from;  /* ranks with which to communicate       */
  long bytes;              /* combined size of matrices             */
  int my_ID;               /* rank                                  */
  int root=0;              /* rank of root                          */
  int iterations;          /* number of times to do the transpose   */
  long i, j, it, jt, istart;/* dummies                              */
  int iter;                /* index of iteration                    */
  int phase;               /* phase inside staged communication     */
  int colstart;            /* starting column for owning rank       */
  int error;               /* error flag                            */
  double *A_p;             /* original matrix column block          */
  double *B_p;             /* transposed matrix column block        */
  double **Work_in_p;      /* workspace for the transpose function  */
  double *Work_out_p;      /* workspace for the transpose function  */
  double epsilon = 1.e-8;  /* error tolerance                       */
  double avgtime;          /* timing parameters                     */
  long   *pSync_bcast;     /* work space for collectives            */
  long   *pSync_reduce;    /* work space for collectives            */
  double *pWrk;            /* work space for SHMEM collectives      */
  double *local_trans_time, 
         *trans_time;      /* timing parameters                     */
  double *abserr, 
         *abserr_tot;      /* local and aggregate error             */
  int    *send_flag,
         *recv_flag;       /* synchronization flags                 */
  int    *arguments;       /* command line arguments                */

/*********************************************************************
** Initialize the SHMEM environment
*********************************************************************/

  prk_shmem_init();
  my_ID=prk_shmem_my_pe();
  Num_procs=prk_shmem_n_pes();

  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("SHMEM matrix transpose: B = A^T\n");
  }

// initialize sync variables for error checks
  pSync_bcast      = (long *)   prk_shmem_align(prk_get_alignment(),PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long));
  pSync_reduce     = (long *)   prk_shmem_align(prk_get_alignment(),PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long));
  pWrk             = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double) * PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE);
  local_trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double));
  trans_time       = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double));
  arguments        = (int *)    prk_shmem_align(prk_get_alignment(),4*sizeof(int));
  abserr           = (double *) prk_shmem_align(prk_get_alignment(),2*sizeof(double));
  abserr_tot       = abserr + 1;
  if (!pSync_bcast || !pSync_reduce || !pWrk || !local_trans_time ||
      !trans_time || !arguments || !abserr) {
    printf("Rank %d could not allocate scalar work space on symm heap\n", my_ID);
    error = 1;
    goto ENDOFTESTS;
  }

  for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++)
    pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE;

  for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++)
    pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE;

/*********************************************************************
** process, test and broadcast input parameters
*********************************************************************/
  error = 0;
  if (my_ID == root) {
    if (argc != 4 && argc != 5){
      printf("Usage: %s <# iterations> <matrix order> <# buffers> [Tile size]\n",
                                                               *argv);
      error = 1; goto ENDOFTESTS;
    }

    iterations  = atoi(*++argv);
    arguments[0]=iterations;
    if(iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1; goto ENDOFTESTS;
    }

    order = atoi(*++argv);
    arguments[1]=order;
    if (order < Num_procs) {
      printf("ERROR: matrix order %d should at least # procs %d\n", 
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }
    if (order%Num_procs) {
      printf("ERROR: matrix order %d should be divisible by # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    bufferCount = atoi(*++argv);
    arguments[2]=bufferCount;
    if (Num_procs > 1) {
      if ((bufferCount < 1) || (bufferCount >= Num_procs)) {
        printf("ERROR: bufferCount must be >= 1 and < # procs : %d\n", bufferCount);
        error = 1; goto ENDOFTESTS;
      }
    }

    if (argc == 5) Tile_order = atoi(*++argv);
    arguments[3]=Tile_order;

    ENDOFTESTS:;
  }
  bail_out(error);

  if (my_ID == root) {
    printf("Number of ranks      = %d\n", Num_procs);
    printf("Matrix order         = %d\n", order);
    printf("Number of iterations = %d\n", iterations);
    printf("Number of buffers    = %d\n", bufferCount);
    if ((Tile_order > 0) && (Tile_order < order))
          printf("Tile size            = %d\n", Tile_order);
    else  printf("Untiled\n");
  }
  
  shmem_barrier_all();

  /*  Broadcast input data to all ranks */
  shmem_broadcast32(&arguments[0], &arguments[0], 4, root, 0, 0, Num_procs, pSync_bcast);

  iterations=arguments[0];
  order=arguments[1];
  bufferCount=arguments[2];
  Tile_order=arguments[3];

  shmem_barrier_all();
  prk_shmem_free(arguments);

  /* a non-positive tile size means no tiling of the local transpose */
  tiling = (Tile_order > 0) && (Tile_order < order);
  bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks that are mapped one to a 
** rank.  Each column block is made up of Num_procs smaller square 
** blocks of order block_order.
*********************************************************************/

  Block_order    = order/Num_procs;
  colstart       = Block_order * my_ID;
  Colblock_size  = order * Block_order;
  Block_size     = Block_order * Block_order;

/*********************************************************************
** Create the column block of the test matrix, the row block of the 
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/
  A_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (A_p == NULL){
    printf(" Error allocating space for original matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  B_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (B_p == NULL){
    printf(" Error allocating space for transpose matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  if (Num_procs>1) {
    Work_in_p  = (double **)prk_malloc(bufferCount*sizeof(double *));
    Work_out_p = (double *) prk_malloc(Block_size*sizeof(double));
    recv_flag  = (int*)     prk_shmem_align(prk_get_alignment(),bufferCount*sizeof(int));
    if ((Work_in_p == NULL)||(Work_out_p==NULL) || (recv_flag == NULL)){
      printf(" Error allocating space for work or flags on node %d\n",my_ID);
      error = 1;
    }

    if (bufferCount < (Num_procs - 1)) {
      send_flag = (int*) prk_shmem_align(prk_get_alignment(), (Num_procs-1) * sizeof(int));

      if (send_flag == NULL) {
	printf("Error allocating space for flags on node %d\n", my_ID);
	error = 1;
      }
    }

    bail_out(error);

    for(i=0;i<bufferCount;i++) {
      Work_in_p[i]=(double *) prk_shmem_align(prk_get_alignment(),Block_size*sizeof(double));
      if (Work_in_p[i] == NULL) {
        printf(" Error allocating space for work on node %d\n",my_ID);
        error = 1;
      }
      bail_out(error);
    }

    if (bufferCount < (Num_procs - 1)) {
      for(i=0;i<(Num_procs-1);i++)
        send_flag[i]=0;
    }

    for(i=0;i<bufferCount;i++)
      recv_flag[i]=0;
  }
  
  /* Fill the original column matrices                                              */
  istart = 0;  
  for (j=0;j<Block_order;j++) 
    for (i=0;i<order; i++)  {
      A(i,j) = (double) (order*(j+colstart) + i);
      B(i,j) = 0.0;
  }

  shmem_barrier_all();

  if (bufferCount < (Num_procs - 1)) {
    if (Num_procs > 1) {
      for ( i = 0; i < bufferCount; i++) {
        recv_from = (my_ID + i + 1)%Num_procs;
        shmem_int_inc(&send_flag[i], recv_from);
      }
    }
  }

  shmem_barrier_all();

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration                                        */
    if (iter == 1) { 
      shmem_barrier_all();
      local_trans_time[0] = wtime();
    }

    /* do the local transpose                                                     */
    istart = colstart; 
    if (!tiling) {
      for (i=0; i<Block_order; i++) 
        for (j=0; j<Block_order; j++) {
          B(j,i) += A(i,j);
          A(i,j) += 1.0;
        }
    }
    else {
      for (i=0; i<Block_order; i+=Tile_order) 
        for (j=0; j<Block_order; j+=Tile_order) 
          for (it=i; it<MIN(Block_order,i+Tile_order); it++)
            for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
              B(jt,it) += A(it,jt); 
              A(it,jt) += 1.0;
            }
    }

    for (phase=1; phase<Num_procs; phase++){
      recv_from = (my_ID + phase            )%Num_procs;
      send_to   = (my_ID - phase + Num_procs)%Num_procs;

      targetBuffer = (iter * (Num_procs - 1) + (phase - 1)) % bufferCount;
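      /* Explanatory note: receive buffers are reused round-robin; the index
         cycles through the bufferCount buffers based on the global count of
         completed phases, and the send/recv flags below throttle that reuse. */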

      istart = send_to*Block_order; 
      if (!tiling) {
        for (i=0; i<Block_order; i++) 
          for (j=0; j<Block_order; j++) {
            Work_out(j,i) = A(i,j);
            A(i,j) += 1.0;
          }
      }
      else {
        for (i=0; i<Block_order; i+=Tile_order) 
          for (j=0; j<Block_order; j+=Tile_order) 
            for (it=i; it<MIN(Block_order,i+Tile_order); it++)
              for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
                Work_out(jt,it) = A(it,jt); 
                A(it,jt) += 1.0;
              }
      }

      if (bufferCount < (Num_procs - 1))
        shmem_int_wait_until(&send_flag[phase-1], SHMEM_CMP_EQ, iter+1);

      shmem_double_put(&Work_in_p[targetBuffer][0], &Work_out_p[0], Block_size, send_to);
      shmem_fence();
      shmem_int_inc(&recv_flag[targetBuffer], send_to);
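      /* Explanatory note: shmem_fence orders delivery of operations issued to
         the same target PE, so this flag increment cannot overtake the
         preceding shmem_double_put; the target's wait on recv_flag therefore
         also guarantees the data block has arrived.                          */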

      i = (iter * (Num_procs - 1) + phase) / bufferCount;

      if ((iter * (Num_procs - 1) + phase) % bufferCount)
        i++;

      shmem_int_wait_until(&recv_flag[targetBuffer], SHMEM_CMP_EQ, i);

      istart = recv_from*Block_order; 
      /* scatter received block to transposed matrix; no need to tile */
      for (j=0; j<Block_order; j++)
        for (i=0; i<Block_order; i++) 
          B(i,j) += Work_in(targetBuffer, i,j);

      if (bufferCount < (Num_procs - 1)) {
        if ((phase + bufferCount) < Num_procs)
          recv_from = (my_ID + phase + bufferCount) % Num_procs;
        else
          recv_from = (my_ID + phase + bufferCount + 1 - Num_procs) % Num_procs;

        shmem_int_inc(&send_flag[(phase+bufferCount-1)%(Num_procs-1)], recv_from);
      }
    }  /* end of phase loop  */
  } /* end of iterations */

  local_trans_time[0] = wtime() - local_trans_time[0];

  shmem_barrier_all();
  shmem_double_max_to_all(trans_time, local_trans_time, 1, 0, 0, Num_procs, pWrk, pSync_reduce);

  abserr[0] = 0.0;
  istart = 0;
  double addit = ((double)(iterations+1) * (double) (iterations))/2.0;
  for (j=0;j<Block_order;j++) for (i=0;i<order; i++) {
      abserr[0] += ABS(B(i,j) - (double)((order*i + j+colstart)*(iterations+1)+addit));
  }

  shmem_barrier_all();
  shmem_double_sum_to_all(abserr_tot, abserr, 1, 0, 0, Num_procs, pWrk, pSync_reduce);

  if (my_ID == root) {
    if (abserr_tot[0] <= epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time[0]/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
#ifdef VERBOSE
      printf("Summed errors: %f \n", abserr[0]);
#endif
    }
    else {
      printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr[0], epsilon);
      error = 1;
    }
  }

  bail_out(error);

  if (Num_procs>1) 
  {
    if (bufferCount < (Num_procs - 1))
      prk_shmem_free(send_flag);

    prk_shmem_free(recv_flag);
    prk_free(Work_out_p);

    for(i=0;i<bufferCount;i++)
      prk_shmem_free(Work_in_p[i]);

    prk_free(Work_in_p);
  }

  prk_shmem_free(pSync_bcast);
  prk_shmem_free(pSync_reduce);
  prk_shmem_free(pWrk);

  prk_shmem_finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */