int main(int argc, char ** argv) { int Block_order; size_t Block_size; size_t Colblock_size; int Tile_order=32; int tiling; int Num_procs; /* Number of ranks */ int order; /* overall matrix order */ int send_to, recv_from; /* communicating ranks */ size_t bytes; /* total amount of data to be moved */ int my_ID; /* rank */ int root=0; /* root rank of a communicator */ int iterations; /* number of times to run the pipeline algorithm */ int i, j, it, jt, ID;/* dummies */ int iter; /* index of iteration */ int phase; /* phase in the staged communication */ size_t colstart; /* sequence number of first column owned by calling rank */ int error=0; /* error flag */ double *A_p; /* original matrix column block */ double *B_p; /* transposed matrix column block */ double *Work_in_p; /* workspace for the transpose function */ double *Work_out_p;/* workspace for the transpose function */ double abserr, abserr_tot; /* computed error */ double epsilon = 1.e-8; /* error tolerance */ double local_trans_time, /* timing parameters */ trans_time, avgtime; MPI_Status status; /* completion status of message */ MPI_Win shm_win_A; /* Shared Memory window object */ MPI_Win shm_win_B; /* Shared Memory window object */ MPI_Win shm_win_Work_in; /* Shared Memory window object */ MPI_Win shm_win_Work_out; /* Shared Memory window object */ MPI_Info rma_winfo;/* info for window */ MPI_Comm shm_comm_prep;/* Shared Memory prep Communicator */ MPI_Comm shm_comm; /* Shared Memory Communicator */ int shm_procs; /* # of ranks in shared domain */ int shm_ID; /* MPI rank within coherence domain */ int group_size; /* number of ranks per shared memory group */ int Num_groups; /* number of shared memory group */ int group_ID; /* sequence number of shared memory group */ int size_mul; /* size multiplier; 0 for non-root ranks in coherence domain*/ int istart; MPI_Request send_req, recv_req; /********************************************************************************* ** Initialize the MPI environment **********************************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); root = 0; /********************************************************************* ** process, test and broadcast input parameter *********************************************************************/ if (my_ID == root){ if (argc != 4 && argc !=5){ printf("Usage: %s <#ranks per coherence domain> <# iterations> <matrix order> [tile size]\n", *argv); error = 1; goto ENDOFTESTS; } group_size = atoi(*++argv); if (group_size < 1) { printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size); error = 1; goto ENDOFTESTS; } if (Num_procs%group_size) { printf("ERROR: toal # %d ranks not divisible by ranks per coherence domain %d\n", Num_procs, group_size); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if (iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } order = atoi(*++argv); if (order < Num_procs) { printf("ERROR: matrix order %d should at least # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (order%Num_procs) { printf("ERROR: matrix order %d should be divisible by # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (argc == 5) Tile_order = atoi(*++argv); ENDOFTESTS:; } bail_out(error); /* Broadcast input data to all ranks */ MPI_Bcast(&order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&Tile_order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD); if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("MPI+SHM Matrix transpose: B = A^T\n"); printf("Number of ranks = %d\n", Num_procs); printf("Rank group size = %d\n", group_size); printf("Matrix order = %d\n", order); printf("Number of iterations = %d\n", iterations); if ((Tile_order > 0) && (Tile_order < order)) printf("Tile size = %d\n", Tile_order); else printf("Untiled\n"); #ifndef SYNCHRONOUS printf("Non-"); #endif printf("Blocking messages\n"); } /* Setup for Shared memory regions */ /* first divide WORLD in groups of size group_size */ MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep); /* derive from that a SHM communicator */ MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm); MPI_Comm_rank(shm_comm, &shm_ID); MPI_Comm_size(shm_comm, &shm_procs); /* do sanity check, making sure groups did not shrink in second comm split */ if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666); /* a non-positive tile size means no tiling of the local transpose */ tiling = (Tile_order > 0) && (Tile_order < order); bytes = 2 * sizeof(double) * order * order; /********************************************************************* ** The matrix is broken up into column blocks that are mapped one to a ** rank. Each column block is made up of Num_procs smaller square ** blocks of order block_order. *********************************************************************/ Num_groups = Num_procs/group_size; Block_order = order/Num_groups; group_ID = my_ID/group_size; colstart = Block_order * group_ID; Colblock_size = order * Block_order; Block_size = Block_order * Block_order; /********************************************************************* ** Create the column block of the test matrix, the column block of the ** transposed matrix, and workspace (workspace only if #procs>1) *********************************************************************/ /* RMA win info */ MPI_Info_create(&rma_winfo); /* This key indicates that passive target RMA will not be used. * It is the one info key that MPICH actually uses for optimization. */ MPI_Info_set(rma_winfo, "no_locks", "true"); /* only the root of each SHM domain specifies window of nonzero size */ size_mul = (shm_ID==0); int offset = 32; MPI_Aint size= (Colblock_size+offset)*sizeof(double)*size_mul; int disp_unit; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &A_p, &shm_win_A); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_A); MPI_Win_shared_query(shm_win_A, MPI_PROC_NULL, &size, &disp_unit, (void *)&A_p); if (A_p == NULL){ printf(" Error allocating space for original matrix on node %d\n",my_ID); error = 1; } bail_out(error); A_p += offset; /* recompute memory size (overwritten by prior query */ size= (Colblock_size+offset)*sizeof(double)*size_mul; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &B_p, &shm_win_B); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_B); MPI_Win_shared_query(shm_win_B, MPI_PROC_NULL, &size, &disp_unit, (void *)&B_p); if (B_p == NULL){ printf(" Error allocating space for transposed matrix by group %d\n",group_ID); error = 1; } bail_out(error); B_p += offset; if (Num_groups>1) { size = Block_size*sizeof(double)*size_mul; MPI_Win_allocate_shared(size, sizeof(double),rma_winfo, shm_comm, (void *) &Work_in_p, &shm_win_Work_in); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_Work_in); MPI_Win_shared_query(shm_win_Work_in, MPI_PROC_NULL, &size, &disp_unit, (void *)&Work_in_p); if (Work_in_p == NULL){ printf(" Error allocating space for in block by group %d\n",group_ID); error = 1; } bail_out(error); /* recompute memory size (overwritten by prior query */ size = Block_size*sizeof(double)*size_mul; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &Work_out_p, &shm_win_Work_out); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_Work_out); MPI_Win_shared_query(shm_win_Work_out, MPI_PROC_NULL, &size, &disp_unit, (void *)&Work_out_p); if (Work_out_p == NULL){ printf(" Error allocating space for out block by group %d\n",group_ID); error = 1; } bail_out(error); } /* Fill the original column matrix */ istart = 0; int chunk_size = Block_order/group_size; if (tiling) { for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j+=Tile_order) { for (i=0;i<order; i+=Tile_order) for (jt=j; jt<MIN((shm_ID+1)*chunk_size,j+Tile_order); jt++) for (it=i; it<MIN(order,i+Tile_order); it++) { A(it,jt) = (double) ((double)order*(jt+colstart) + it); B(it,jt) = -1.0; } } } else { for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j++) for (i=0;i<order; i++) { A(i,j) = (double)((double)order*(j+colstart) + i); B(i,j) = -1.0; } } /* NEED A STORE FENCE HERE */ MPI_Win_sync(shm_win_A); MPI_Win_sync(shm_win_B); MPI_Barrier(shm_comm); for (iter=0; iter<=iterations; iter++) { /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_trans_time = wtime(); } /* do the local transpose */ istart = colstart; if (!tiling) { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) { for (j=0; j<Block_order; j++) B(j,i) = A(i,j); } } else { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order) { for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { B(jt,it) = A(it,jt); } } } for (phase=1; phase<Num_groups; phase++){ recv_from = ((group_ID + phase )%Num_groups); send_to = ((group_ID - phase + Num_groups)%Num_groups); istart = send_to*Block_order; if (!tiling) { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) for (j=0; j<Block_order; j++){ Work_out(j,i) = A(i,j); } } else { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { Work_out(jt,it) = A(it,jt); } } /* NEED A LOAD/STORE FENCE HERE */ MPI_Win_sync(shm_win_Work_in); MPI_Win_sync(shm_win_Work_out); MPI_Barrier(shm_comm); if (shm_ID==0) { #ifndef SYNCHRONOUS /* if we place the Irecv outside this block, it would not be protected by a local barrier, which creates a race */ MPI_Irecv(Work_in_p, Block_size, MPI_DOUBLE, recv_from*group_size, phase, MPI_COMM_WORLD, &recv_req); MPI_Isend(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size, phase, MPI_COMM_WORLD, &send_req); MPI_Wait(&recv_req, &status); MPI_Wait(&send_req, &status); #else MPI_Sendrecv(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size, phase, Work_in_p, Block_size, MPI_DOUBLE, recv_from*group_size, phase, MPI_COMM_WORLD, &status); #endif } /* NEED A LOAD FENCE HERE */ MPI_Win_sync(shm_win_Work_in); MPI_Win_sync(shm_win_Work_out); MPI_Barrier(shm_comm); istart = recv_from*Block_order; /* scatter received block to transposed matrix; no need to tile */ for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++) for (i=0; i<Block_order; i++) B(i,j) = Work_in(i,j); } /* end of phase loop */ } /* end of iterations */ local_trans_time = wtime() - local_trans_time; MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); abserr = 0.0; istart = 0; /* for (j=shm_ID;j<Block_order;j+=group_size) for (i=0;i<order; i++) { */ for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++) for (i=0;i<order; i++) { abserr += ABS(B(i,j) - (double)((double)order*i + j+colstart)); } MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD); if (my_ID == root) { if (abserr_tot < epsilon) { printf("Solution validates\n"); avgtime = trans_time/(double)iterations; printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime); #ifdef VERBOSE printf("Summed errors: %f \n", abserr_tot); #endif } else { printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr_tot, epsilon); error = 1; } } bail_out(error); MPI_Win_unlock_all(shm_win_A); MPI_Win_unlock_all(shm_win_B); MPI_Win_free(&shm_win_A); MPI_Win_free(&shm_win_B); if (Num_groups>1) { MPI_Win_unlock_all(shm_win_Work_in); MPI_Win_unlock_all(shm_win_Work_out); MPI_Win_free(&shm_win_Work_in); MPI_Win_free(&shm_win_Work_out); } MPI_Info_free(&rma_winfo); MPI_Finalize(); exit(EXIT_SUCCESS); } /* end of main */
int main(int argc, char ** argv) { int Block_order; /* number of columns owned by rank */ int Block_size; /* size of a single block */ int Colblock_size; /* size of column block */ int Tile_order=32; /* default Tile order */ int tiling; /* boolean: true if tiling is used */ int Num_procs; /* number of ranks */ int order; /* order of overall matrix */ int send_to, recv_from; /* ranks with which to communicate */ MPI_Status status; #ifndef SYNCHRONOUS MPI_Request send_req; MPI_Request recv_req; #endif long bytes; /* combined size of matrices */ int my_ID; /* rank */ int root=0; /* ID of root rank */ int iterations; /* number of times to do the transpose */ int i, j, it, jt, istart;/* dummies */ int iter; /* index of iteration */ int phase; /* phase inside staged communication */ int colstart; /* starting column for owning rank */ int nthread_input, /* thread parameters */ nthread; int error; /* error flag */ double *A_p; /* original matrix column block */ double *B_p; /* transposed matrix column block */ double *Work_in_p; /* workspace for the transpose function */ double *Work_out_p; /* workspace for the transpose function */ double abserr, /* absolute error */ abserr_tot; /* aggregate absolute error */ double epsilon = 1.e-8; /* error tolerance */ double local_trans_time, /* timing parameters */ trans_time, avgtime; /********************************************************************* ** Initialize the MPI environment *********************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); /********************************************************************* ** process, test and broadcast input parameters *********************************************************************/ error = 0; if (my_ID == root) { if (argc != 4 && argc != 5){ printf("Usage: %s <#threads><#iterations> <matrix order> [Tile size]\n", *argv); error = 1; goto ENDOFTESTS; } /* Take number of threads to request from command line */ nthread_input = atoi(*++argv); if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) { printf("ERROR: Invalid number of threads: %d\n", nthread_input); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if(iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } order = atoi(*++argv); if (order < Num_procs) { printf("ERROR: matrix order %d should at least # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (order%Num_procs) { printf("ERROR: matrix order %d should be divisible by # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (argc == 5) Tile_order = atoi(*++argv); ENDOFTESTS:; } bail_out(error); /* Broadcast input data to all ranks */ MPI_Bcast(&order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&Tile_order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&nthread_input, 1, MPI_INT, root, MPI_COMM_WORLD); omp_set_num_threads(nthread_input); /* a non-positive tile size means no tiling of the local transpose */ tiling = (Tile_order > 0) && (Tile_order < order); if (my_ID == root) { printf("MPI+OpenMP matrix transpose: B = A^T\n"); printf("Number of ranks = %d\n", Num_procs); printf("Number of threads = %d\n", omp_get_max_threads()); printf("Matrix order = %d\n", order); printf("Number of iterations = %d\n", iterations); if (tiling) { printf("Tile size = %d\n", Tile_order); #ifdef COLLAPSE printf("Using loop collapse\n"); } #endif else printf("Untiled\n"); #ifndef SYNCHRONOUS printf("Non-"); #endif printf("Blocking messages\n"); } bytes = 2.0 * sizeof(double) * order * order; /********************************************************************* ** The matrix is broken up into column blocks that are mapped one to a ** rank. Each column block is made up of Num_procs smaller square ** blocks of order block_order. *********************************************************************/ Block_order = order/Num_procs; colstart = Block_order * my_ID; Colblock_size = order * Block_order; Block_size = Block_order * Block_order; /********************************************************************* ** Create the column block of the test matrix, the row block of the ** transposed matrix, and workspace (workspace only if #procs>1) *********************************************************************/ A_p = (double *)malloc(Colblock_size*sizeof(double)); if (A_p == NULL){ printf(" Error allocating space for original matrix on node %d\n",my_ID); error = 1; } bail_out(error); B_p = (double *)malloc(Colblock_size*sizeof(double)); if (B_p == NULL){ printf(" Error allocating space for transpose matrix on node %d\n",my_ID); error = 1; } bail_out(error); if (Num_procs>1) { Work_in_p = (double *)malloc(2*Block_size*sizeof(double)); if (Work_in_p == NULL){ printf(" Error allocating space for work on node %d\n",my_ID); error = 1; } bail_out(error); Work_out_p = Work_in_p + Block_size; } /* Fill the original column matrix */ istart = 0; if (tiling) { #ifdef COLLAPSE #pragma omp parallel for private (i,it,jt) collapse(2) #else #pragma omp parallel for private (i,it,jt) #endif for (j=0; j<Block_order; j+=Tile_order) for (i=0; i<order; i+=Tile_order) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) for (it=i; it<MIN(order,i+Tile_order); it++) { A(it,jt) = (double) (order*(jt+colstart) + it); B(it,jt) = -1.0; } } else { #pragma omp parallel for private (i) for (j=0;j<Block_order;j++) for (i=0;i<order; i++) { A(i,j) = (double) (order*(j+colstart) + i); B(i,j) = -1.0; } } for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_trans_time = wtime(); } /* do the local transpose */ istart = colstart; if (!tiling) { #pragma omp parallel for private (j) for (i=0; i<Block_order; i++) for (j=0; j<Block_order; j++) { B(j,i) = A(i,j); } } else { #ifdef COLLAPSE #pragma omp parallel for private (j,it,jt) collapse(2) #else #pragma omp parallel for private (j,it,jt) #endif for (i=0; i<Block_order; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { B(jt,it) = A(it,jt); } } for (phase=1; phase<Num_procs; phase++){ recv_from = (my_ID + phase )%Num_procs; send_to = (my_ID - phase + Num_procs)%Num_procs; #ifndef SYNCHRONOUS MPI_Irecv(Work_in_p, Block_size, MPI_DOUBLE, recv_from, phase, MPI_COMM_WORLD, &recv_req); #endif istart = send_to*Block_order; if (!tiling) { #pragma omp parallel for private (j) for (i=0; i<Block_order; i++) for (j=0; j<Block_order; j++){ Work_out(j,i) = A(i,j); } } else { #ifdef COLLAPSE #pragma omp parallel for private (j,it,jt) collapse(2) #else #pragma omp parallel for private (j,it,jt) #endif for (i=0; i<Block_order; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { Work_out(jt,it) = A(it,jt); } } #ifndef SYNCHRONOUS MPI_Isend(Work_out_p, Block_size, MPI_DOUBLE, send_to, phase, MPI_COMM_WORLD, &send_req); MPI_Wait(&recv_req, &status); MPI_Wait(&send_req, &status); #else MPI_Sendrecv(Work_out_p, Block_size, MPI_DOUBLE, send_to, phase, Work_in_p, Block_size, MPI_DOUBLE, recv_from, phase, MPI_COMM_WORLD, &status); #endif istart = recv_from*Block_order; /* scatter received block to transposed matrix; no need to tile */ #pragma omp parallel for private (i) for (j=0; j<Block_order; j++) for (i=0; i<Block_order; i++) { B(i,j) = Work_in(i,j); } } /* end of phase loop */ } /* end of iterations */ local_trans_time = wtime() - local_trans_time; MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); abserr = 0.0; istart = 0; #pragma omp parallel for private (i) for (j=0;j<Block_order;j++) for (i=0;i<order; i++) { abserr += ABS(B(i,j) - (double)(order*i + j+colstart)); } MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD); if (my_ID == root) { if (abserr_tot < epsilon) { printf("Solution validates\n"); avgtime = trans_time/(double)iterations; printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime); #ifdef VERBOSE printf("Summed errors: %f \n", abserr); #endif } else { printf("ERROR: Aggregate squared error %lf exceeds threshold %e\n", abserr_tot, epsilon); error = 1; } } bail_out(error); MPI_Finalize(); exit(EXIT_SUCCESS); } /* end of main */
int main(int argc, char ** argv) { long Block_order; /* number of columns owned by rank */ long Block_size; /* size of a single block */ long Colblock_size; /* size of column block */ int Tile_order=32; /* default Tile order */ int tiling; /* boolean: true if tiling is used */ int Num_procs; /* number of ranks */ long order; /* order of overall matrix */ int send_to, recv_from; /* ranks with which to communicate */ long bytes; /* combined size of matrices */ int my_ID; /* rank */ int root=0; /* rank of root */ int iterations; /* number of times to do the transpose */ int i, j, it, jt, istart;/* dummies */ int iter; /* index of iteration */ int phase; /* phase inside staged communication */ int colstart; /* starting column for owning rank */ int error; /* error flag */ double RESTRICT *A_p; /* original matrix column block */ double RESTRICT *B_p; /* transposed matrix column block */ double RESTRICT *Work_in_p;/* workspace for transpose function */ double RESTRICT *Work_out_p;/* workspace for transpose function */ double abserr, /* absolute error */ abserr_tot; /* aggregate absolute error */ double epsilon = 1.e-8; /* error tolerance */ double local_trans_time, /* timing parameters */ trans_time, avgtime; MPI_Win rma_win = MPI_WIN_NULL; MPI_Info rma_winfo = MPI_INFO_NULL; int passive_target = 0; /* use passive target RMA sync */ #if MPI_VERSION >= 3 int flush_local = 1; /* flush local (or remote) after put */ int flush_bundle = 1; /* flush every <bundle> put calls */ #endif /********************************************************************* ** Initialize the MPI environment *********************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); /********************************************************************* ** process, test and broadcast input parameters *********************************************************************/ error = 0; if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("MPIRMA matrix transpose: B = A^T\n"); if (argc <= 3){ printf("Usage: %s <# iterations> <matrix order> [Tile size]" "[sync (0=fence, 1=flush)] [flush local?] [flush bundle]\n", *argv); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if(iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } order = atol(*++argv); if (order < Num_procs) { printf("ERROR: matrix order %ld should at least # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (order%Num_procs) { printf("ERROR: matrix order %ld should be divisible by # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (argc >= 4) Tile_order = atoi(*++argv); if (argc >= 5) passive_target = atoi(*++argv); #if MPI_VERSION >= 3 if (argc >= 6) flush_local = atoi(*++argv); if (argc >= 7) flush_bundle = atoi(*++argv); #endif ENDOFTESTS:; } bail_out(error); if (my_ID == root) { printf("Number of ranks = %d\n", Num_procs); printf("Matrix order = %ld\n", order); printf("Number of iterations = %d\n", iterations); if ((Tile_order > 0) && (Tile_order < order)) printf("Tile size = %d\n", Tile_order); else printf("Untiled\n"); if (passive_target) { #if MPI_VERSION < 3 printf("Synchronization = MPI_Win_(un)lock\n"); #else printf("Synchronization = MPI_Win_flush%s (bundle=%d)\n", flush_local ? "_local" : "", flush_bundle); #endif } else { printf("Synchronization = MPI_Win_fence\n"); } } /* Broadcast input data to all ranks */ MPI_Bcast (&order, 1, MPI_LONG, root, MPI_COMM_WORLD); MPI_Bcast (&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast (&Tile_order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast (&passive_target, 1, MPI_INT, root, MPI_COMM_WORLD); #if MPI_VERSION >= 3 MPI_Bcast (&flush_local, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast (&flush_bundle, 1, MPI_INT, root, MPI_COMM_WORLD); #endif /* a non-positive tile size means no tiling of the local transpose */ tiling = (Tile_order > 0) && (Tile_order < order); bytes = 2 * sizeof(double) * order * order; /********************************************************************* ** The matrix is broken up into column blocks that are mapped one to a ** rank. Each column block is made up of Num_procs smaller square ** blocks of order block_order. *********************************************************************/ Block_order = order/Num_procs; colstart = Block_order * my_ID; Colblock_size = order * Block_order; Block_size = Block_order * Block_order; /* debug message size effects */ if (my_ID == root) { printf("Block_size = %ld\n", Block_size); } /********************************************************************* ** Create the column block of the test matrix, the row block of the ** transposed matrix, and workspace (workspace only if #procs>1) *********************************************************************/ A_p = (double *)prk_malloc(Colblock_size*sizeof(double)); if (A_p == NULL){ printf(" Error allocating space for original matrix on node %d\n",my_ID); error = 1; } bail_out(error); MPI_Info_create (&rma_winfo); MPI_Info_set (rma_winfo, "no locks", "true"); B_p = (double *)prk_malloc(Colblock_size*sizeof(double)); if (B_p == NULL){ printf(" Error allocating space for transpose matrix on node %d\n",my_ID); error = 1; } bail_out(error); if (Num_procs>1) { Work_out_p = (double *) prk_malloc(Block_size*(Num_procs-1)*sizeof(double)); if (Work_out_p == NULL){ printf(" Error allocating space for work_out on node %d\n",my_ID); error = 1; } bail_out(error); PRK_Win_allocate(Block_size*(Num_procs-1)*sizeof(double), sizeof(double), rma_winfo, MPI_COMM_WORLD, &Work_in_p, &rma_win); if (Work_in_p == NULL){ printf(" Error allocating space for work on node %d\n",my_ID); error = 1; } bail_out(error); } #if MPI_VERSION >= 3 if (passive_target && Num_procs>1) { MPI_Win_lock_all(MPI_MODE_NOCHECK,rma_win); } #endif /* Fill the original column matrix */ istart = 0; for (j=0;j<Block_order;j++) { for (i=0;i<order; i++) { A(i,j) = (double) (order*(j+colstart) + i); B(i,j) = 0.0; } } MPI_Barrier(MPI_COMM_WORLD); for (iter = 0; iter<=iterations; iter++) { /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_trans_time = wtime(); } /* do the local transpose */ istart = colstart; if (!tiling) { for (i=0; i<Block_order; i++) { for (j=0; j<Block_order; j++) { B(j,i) += A(i,j); A(i,j) += 1.0; } } } else { for (i=0; i<Block_order; i+=Tile_order) { for (j=0; j<Block_order; j+=Tile_order) { for (it=i; it<MIN(Block_order,i+Tile_order); it++) { for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { B(jt,it) += A(it,jt); A(it,jt) += 1.0; } } } } } if (!passive_target && Num_procs>1) { MPI_Win_fence(MPI_MODE_NOSTORE | MPI_MODE_NOPRECEDE, rma_win); } for (phase=1; phase<Num_procs; phase++){ send_to = (my_ID - phase + Num_procs)%Num_procs; istart = send_to*Block_order; if (!tiling) { for (i=0; i<Block_order; i++) { for (j=0; j<Block_order; j++) { Work_out(phase-1,j,i) = A(i,j); A(i,j) += 1.0; } } } else { for (i=0; i<Block_order; i+=Tile_order) { for (j=0; j<Block_order; j+=Tile_order) { for (it=i; it<MIN(Block_order,i+Tile_order); it++) { for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { Work_out(phase-1,jt,it) = A(it,jt); A(it,jt) += 1.0; } } } } } #if MPI_VERSION < 3 if (passive_target) { MPI_Win_lock(MPI_LOCK_SHARED, send_to, MPI_MODE_NOCHECK, rma_win); } #endif MPI_Put(Work_out_p+Block_size*(phase-1), Block_size, MPI_DOUBLE, send_to, Block_size*(phase-1), Block_size, MPI_DOUBLE, rma_win); if (passive_target) { #if MPI_VERSION < 3 MPI_Win_unlock(send_to, rma_win); #else if (flush_bundle==1) { if (flush_local==1) { MPI_Win_flush_local(send_to, rma_win); } else { MPI_Win_flush(send_to, rma_win); } } else if ( (phase%flush_bundle) == 0) { /* Too lazy to record all targets, so let MPI do it internally (hopefully) */ if (flush_local==1) { MPI_Win_flush_local_all(rma_win); } else { MPI_Win_flush_all(rma_win); } } #endif } } /* end of phase loop for puts */ if (Num_procs>1) { if (passive_target) { #if MPI_VERSION >= 3 MPI_Win_flush_all(rma_win); #endif MPI_Barrier(MPI_COMM_WORLD); } else { MPI_Win_fence(MPI_MODE_NOSTORE, rma_win); } } for (phase=1; phase<Num_procs; phase++) { recv_from = (my_ID + phase)%Num_procs; istart = recv_from*Block_order; /* scatter received block to transposed matrix; no need to tile */ for (j=0; j<Block_order; j++) { for (i=0; i<Block_order; i++) { B(i,j) += Work_in(phase-1,i,j); } } } /* end of phase loop for scatters */ /* for the flush case we need to make sure we have consumed Work_in before overwriting it in the next iteration */ if (Num_procs>1 && passive_target) { MPI_Barrier(MPI_COMM_WORLD); } } /* end of iterations */ local_trans_time = wtime() - local_trans_time; MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); abserr = 0.0; istart = 0; double addit = ((double)(iterations+1) * (double) (iterations))/2.0; for (j=0;j<Block_order;j++) { for (i=0;i<order; i++) { abserr += ABS(B(i,j) - ((double)(order*i + j+colstart)*(iterations+1)+addit)); } } MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD); if (my_ID == root) { if (abserr_tot < epsilon) { printf("Solution validates\n"); avgtime = trans_time/(double)iterations; printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime); } else { printf("ERROR: Aggregate absolute error %lf exceeds threshold %e\n", abserr_tot, epsilon); error = 1; } } bail_out(error); if (rma_win!=MPI_WIN_NULL) { #if MPI_VERSION >=3 if (passive_target) { MPI_Win_unlock_all(rma_win); } #endif PRK_Win_free(&rma_win); } MPI_Finalize(); exit(EXIT_SUCCESS); } /* end of main */
int main(int argc, char ** argv) { long Block_order; /* number of columns owned by rank */ int Block_size; /* size of a single block */ int Colblock_size; /* size of column block */ int Tile_order=32; /* default Tile order */ int tiling; /* boolean: true if tiling is used */ int Num_procs; /* number of ranks */ int order; /* order of overall matrix */ int bufferCount; /* number of input buffers */ int targetBuffer; /* buffer with which to communicate */ int send_to, recv_from; /* ranks with which to communicate */ long bytes; /* combined size of matrices */ int my_ID; /* rank */ int root=0; /* rank of root */ int iterations; /* number of times to do the transpose */ long i, j, it, jt, istart;/* dummies */ int iter; /* index of iteration */ int phase; /* phase inside staged communication */ int colstart; /* starting column for owning rank */ int error; /* error flag */ double *A_p; /* original matrix column block */ double *B_p; /* transposed matrix column block */ double **Work_in_p; /* workspace for the transpose function */ double *Work_out_p; /* workspace for the transpose function */ double epsilon = 1.e-8; /* error tolerance */ double avgtime; /* timing parameters */ long *pSync_bcast; /* work space for collectives */ long *pSync_reduce; /* work space for collectives */ double *pWrk; /* work space for SHMEM collectives */ double *local_trans_time, *trans_time; /* timing parameters */ double *abserr, *abserr_tot; /* local and aggregate error */ int *send_flag, *recv_flag; /* synchronization flags */ int *arguments; /* command line arguments */ /********************************************************************* ** Initialize the SHMEM environment *********************************************************************/ prk_shmem_init(); my_ID=prk_shmem_my_pe(); Num_procs=prk_shmem_n_pes(); if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("SHMEM matrix transpose: B = A^T\n"); } // initialize sync variables for error checks pSync_bcast = (long *) prk_shmem_align(prk_get_alignment(),PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long)); pSync_reduce = (long *) prk_shmem_align(prk_get_alignment(),PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long)); pWrk = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double) * PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE); local_trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double)); trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double)); arguments = (int *) prk_shmem_align(prk_get_alignment(),4*sizeof(int)); abserr = (double *) prk_shmem_align(prk_get_alignment(),2*sizeof(double)); abserr_tot = abserr + 1; if (!pSync_bcast || !pSync_reduce || !pWrk || !local_trans_time || !trans_time || !arguments || !abserr) { printf("Rank %d could not allocate scalar work space on symm heap\n", my_ID); error = 1; goto ENDOFTESTS; } for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++) pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE; for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++) pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE; /********************************************************************* ** process, test and broadcast input parameters *********************************************************************/ error = 0; if (my_ID == root) { if (argc != 4 && argc != 5){ printf("Usage: %s <# iterations> <matrix order> <# buffers> [Tile size]\n", *argv); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); arguments[0]=iterations; if(iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } order = atoi(*++argv); arguments[1]=order; if (order < Num_procs) { printf("ERROR: matrix order %d should at least # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (order%Num_procs) { printf("ERROR: matrix order %d should be divisible by # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } bufferCount = atoi(*++argv); arguments[2]=bufferCount; if (Num_procs > 1) { if ((bufferCount < 1) || (bufferCount >= Num_procs)) { printf("ERROR: bufferCount must be >= 1 and < # procs : %d\n", bufferCount); error = 1; goto ENDOFTESTS; } } if (argc == 5) Tile_order = atoi(*++argv); arguments[3]=Tile_order; ENDOFTESTS:; } bail_out(error); if (my_ID == root) { printf("Number of ranks = %d\n", Num_procs); printf("Matrix order = %d\n", order); printf("Number of iterations = %d\n", iterations); printf("Number of buffers = %d\n", bufferCount); if ((Tile_order > 0) && (Tile_order < order)) printf("Tile size = %d\n", Tile_order); else printf("Untiled\n"); } shmem_barrier_all(); /* Broadcast input data to all ranks */ shmem_broadcast32(&arguments[0], &arguments[0], 4, root, 0, 0, Num_procs, pSync_bcast); iterations=arguments[0]; order=arguments[1]; bufferCount=arguments[2]; Tile_order=arguments[3]; shmem_barrier_all(); prk_shmem_free(arguments); /* a non-positive tile size means no tiling of the local transpose */ tiling = (Tile_order > 0) && (Tile_order < order); bytes = 2 * sizeof(double) * order * order; /********************************************************************* ** The matrix is broken up into column blocks that are mapped one to a ** rank. Each column block is made up of Num_procs smaller square ** blocks of order block_order. *********************************************************************/ Block_order = order/Num_procs; colstart = Block_order * my_ID; Colblock_size = order * Block_order; Block_size = Block_order * Block_order; /********************************************************************* ** Create the column block of the test matrix, the row block of the ** transposed matrix, and workspace (workspace only if #procs>1) *********************************************************************/ A_p = (double *)prk_malloc(Colblock_size*sizeof(double)); if (A_p == NULL){ printf(" Error allocating space for original matrix on node %d\n",my_ID); error = 1; } bail_out(error); B_p = (double *)prk_malloc(Colblock_size*sizeof(double)); if (B_p == NULL){ printf(" Error allocating space for transpose matrix on node %d\n",my_ID); error = 1; } bail_out(error); if (Num_procs>1) { Work_in_p = (double**)prk_malloc(bufferCount*sizeof(double)); Work_out_p = (double *) prk_malloc(Block_size*sizeof(double)); recv_flag = (int*) prk_shmem_align(prk_get_alignment(),bufferCount*sizeof(int)); if ((Work_in_p == NULL)||(Work_out_p==NULL) || (recv_flag == NULL)){ printf(" Error allocating space for work or flags on node %d\n",my_ID); error = 1; } if (bufferCount < (Num_procs - 1)) { send_flag = (int*) prk_shmem_align(prk_get_alignment(), (Num_procs-1) * sizeof(int)); if (send_flag == NULL) { printf("Error allocating space for flags on node %d\n", my_ID); error = 1; } } bail_out(error); for(i=0;i<bufferCount;i++) { Work_in_p[i]=(double *) prk_shmem_align(prk_get_alignment(),Block_size*sizeof(double)); if (Work_in_p[i] == NULL) { printf(" Error allocating space for work on node %d\n",my_ID); error = 1; } bail_out(error); } if (bufferCount < (Num_procs - 1)) { for(i=0;i<(Num_procs-1);i++) send_flag[i]=0; } for(i=0;i<bufferCount;i++) recv_flag[i]=0; } /* Fill the original column matrices */ istart = 0; for (j=0;j<Block_order;j++) for (i=0;i<order; i++) { A(i,j) = (double) (order*(j+colstart) + i); B(i,j) = 0.0; } shmem_barrier_all(); if (bufferCount < (Num_procs - 1)) { if (Num_procs > 1) { for ( i = 0; i < bufferCount; i++) { recv_from = (my_ID + i + 1)%Num_procs; shmem_int_inc(&send_flag[i], recv_from); } } } shmem_barrier_all(); for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { shmem_barrier_all(); local_trans_time[0] = wtime(); } /* do the local transpose */ istart = colstart; if (!tiling) { for (i=0; i<Block_order; i++) for (j=0; j<Block_order; j++) { B(j,i) += A(i,j); A(i,j) += 1.0; } } else { for (i=0; i<Block_order; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { B(jt,it) += A(it,jt); A(it,jt) += 1.0; } } for (phase=1; phase<Num_procs; phase++){ recv_from = (my_ID + phase )%Num_procs; send_to = (my_ID - phase + Num_procs)%Num_procs; targetBuffer = (iter * (Num_procs - 1) + (phase - 1)) % bufferCount; istart = send_to*Block_order; if (!tiling) { for (i=0; i<Block_order; i++) for (j=0; j<Block_order; j++){ Work_out(j,i) = A(i,j); A(i,j) += 1.0; } } else { for (i=0; i<Block_order; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { Work_out(jt,it) = A(it,jt); A(it,jt) += 1.0; } } if (bufferCount < (Num_procs - 1)) shmem_int_wait_until(&send_flag[phase-1], SHMEM_CMP_EQ, iter+1); shmem_double_put(&Work_in_p[targetBuffer][0], &Work_out_p[0], Block_size, send_to); shmem_fence(); shmem_int_inc(&recv_flag[targetBuffer], send_to); i = (iter * (Num_procs - 1) + phase) / bufferCount; if ((iter * (Num_procs - 1) + phase) % bufferCount) i++; shmem_int_wait_until(&recv_flag[targetBuffer], SHMEM_CMP_EQ, i); istart = recv_from*Block_order; /* scatter received block to transposed matrix; no need to tile */ for (j=0; j<Block_order; j++) for (i=0; i<Block_order; i++) B(i,j) += Work_in(targetBuffer, i,j); if (bufferCount < (Num_procs - 1)) { if ((phase + bufferCount) < Num_procs) recv_from = (my_ID + phase + bufferCount) % Num_procs; else recv_from = (my_ID + phase + bufferCount + 1 - Num_procs) % Num_procs; shmem_int_inc(&send_flag[(phase+bufferCount-1)%(Num_procs-1)], recv_from); } } /* end of phase loop */ } /* end of iterations */ local_trans_time[0] = wtime() - local_trans_time[0]; shmem_barrier_all(); shmem_double_max_to_all(trans_time, local_trans_time, 1, 0, 0, Num_procs, pWrk, pSync_reduce); abserr[0] = 0.0; istart = 0; double addit = ((double)(iterations+1) * (double) (iterations))/2.0; for (j=0;j<Block_order;j++) for (i=0;i<order; i++) { abserr[0] += ABS(B(i,j) - (double)((order*i + j+colstart)*(iterations+1)+addit)); } shmem_barrier_all(); shmem_double_sum_to_all(abserr_tot, abserr, 1, 0, 0, Num_procs, pWrk, pSync_reduce); if (my_ID == root) { if (abserr_tot[0] <= epsilon) { printf("Solution validates\n"); avgtime = trans_time[0]/(double)iterations; printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime); #ifdef VERBOSE printf("Summed errors: %f \n", abserr[0]); #endif } else { printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr[0], epsilon); error = 1; } } bail_out(error); if (Num_procs>1) { if (bufferCount < (Num_procs - 1)) prk_shmem_free(send_flag); prk_shmem_free(recv_flag); prk_free(Work_out_p); for(i=0;i<bufferCount;i++) prk_shmem_free(Work_in_p[i]); prk_free(Work_in_p); } prk_shmem_free(pSync_bcast); prk_shmem_free(pSync_reduce); prk_shmem_free(pWrk); prk_shmem_finalize(); exit(EXIT_SUCCESS); } /* end of main */