int main(int argc, char** argv) {
    int rank;
    assertz("MPI_Init", MPI_Init(&argc, &argv));
    assertz("MPI_Comm_rank", MPI_Comm_rank(MPI_COMM_WORLD, &rank));

    const int data_size = 2;
    int data_replace[data_size];
    const int tag_1 = 12345;
    const int tag_2 = 67890;
    const int jack = 0;
    const int jill = 1;
    MPI_Status stat;

    if (rank == jack) {
        data_replace[0] = 11;
        data_replace[1] = 12;
        MPI_Sendrecv_replace(data_replace, data_size, MPI_INT, jill, tag_1, jill, tag_2,
                             MPI_COMM_WORLD, &stat);
    }
    if (rank == jill) {
        data_replace[0] = 21;
        data_replace[1] = 22;
        MPI_Sendrecv_replace(data_replace, data_size, MPI_INT, jack, tag_2, jack, tag_1,
                             MPI_COMM_WORLD, &stat);
    }
    if (rank < 2) {
        printf("rank = %d : data_replace[] = {%d, %d} \n", rank, data_replace[0], data_replace[1]);
    }

    assertz("MPI_Finalize", MPI_Finalize());
    exit(EXIT_SUCCESS);
}
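/* The snippet above assumes an error-checking helper named assertz whose definition
 * is not shown in the original. Judging from how it is called (a label plus an MPI
 * return code), a minimal sketch of such a helper might look like this; the exact
 * behaviour in the original source may differ. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

static void assertz(const char *what, int mpi_err)
{
    /* Abort the whole job if the wrapped MPI call did not succeed. */
    if (mpi_err != MPI_SUCCESS) {
        fprintf(stderr, "%s failed with error code %d\n", what, mpi_err);
        MPI_Abort(MPI_COMM_WORLD, mpi_err);
    }
}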
int main(int argc, char *argv[]) {
    int *matrix_a;
    int *matrix_b;
    int *matrix_c;
    const char *matrix_a_filename = argv[1];
    const char *matrix_b_filename = argv[2];
    const char *matrix_c_filename = argv[3];
    MPI_Comm matrix_comm;

    MPI_Init(&argc, &argv);
    create_matrix_comm(MPI_COMM_WORLD, &matrix_comm);
    MPI_Comm_size(matrix_comm, &size);
    MPI_Comm_rank(matrix_comm, &rank);

    compute_matrixes_variables(matrix_a_filename, matrix_comm);
    alloc_submatrix_buffer(&matrix_a);
    alloc_submatrix_buffer(&matrix_b);
    alloc_submatrix_buffer(&matrix_c);

    distribute_matrix(matrix_a_filename, matrix_a, matrix_comm);
    distribute_matrix(matrix_b_filename, matrix_b, matrix_comm);

    /* The actual Cannon algorithm */
    int row_source, row_dst;
    int col_source, col_dst;
    MPI_Cart_shift(matrix_comm, 0, -1, &row_source, &row_dst);
    MPI_Cart_shift(matrix_comm, 1, -1, &col_source, &col_dst);

    int i;
    for (i = 0; i < pp_dims; i++) {
        compute_matrix_mul(matrix_a, matrix_b, matrix_c, N);
        /* Note: MPI_ANY_TAG is only valid as a receive tag, so a concrete tag (0)
           is used here for both the send and the matching receive. */
        MPI_Sendrecv_replace(matrix_a, sub_n * sub_n, MPI_INT, row_source, 0,
                             row_dst, 0, matrix_comm, MPI_STATUS_IGNORE);
        MPI_Sendrecv_replace(matrix_b, sub_n * sub_n, MPI_INT, col_source, 0,
                             col_dst, 0, matrix_comm, MPI_STATUS_IGNORE);
    }

    write_result(matrix_c_filename, matrix_c, matrix_comm);

    free(matrix_a);
    free(matrix_b);
    free(matrix_c);
    MPI_Comm_free(&matrix_comm);
    MPI_Finalize();
    return 0;
}
void MatrixMatrixMultiply(double ***a, double ***b, double ***c, int mra, int mca,
                          int mrb, int mcb, int *ra, int *ca, int *rb, int *cb,
                          MPI_Comm comm)
{
    /* Cannon's algorithm, adapted from the teaching book */
    int i;
    int num_procs, dims[2], periods[2];
    int my2drank, mycoords[2];
    int uprank, downrank, leftrank, rightrank;
    int shiftsource, shiftdest;
    MPI_Status status;
    MPI_Comm comm_2d;

    MPI_Comm_size(comm, &num_procs);

    /* Set up a square 2D cartesian topology with wraparound.
       (The rank query on comm_2d must come after MPI_Cart_create;
       querying it earlier would use an uninitialized communicator.) */
    dims[0] = dims[1] = 0;
    MPI_Dims_create(num_procs, 2, dims);
    periods[0] = periods[1] = 1;
    MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d);
    MPI_Comm_rank(comm_2d, &my2drank);
    MPI_Cart_coords(comm_2d, my2drank, 2, mycoords);

    MPI_Cart_shift(comm_2d, 1, -1, &rightrank, &leftrank);
    MPI_Cart_shift(comm_2d, 0, -1, &downrank, &uprank);

    int ia = my2drank;
    int ib = my2drank;

    /* Initial skewing: shift A by the row coordinate and B by the column coordinate. */
    MPI_Cart_shift(comm_2d, 1, -mycoords[0], &shiftsource, &shiftdest);
    MPI_Sendrecv_replace((*a)[0], mra*mca, MPI_DOUBLE, shiftdest, 1, shiftsource, 1, comm_2d, &status);
    MPI_Sendrecv_replace(&ia, 1, MPI_INT, shiftdest, 1, shiftsource, 1, comm_2d, &status);
    MPI_Cart_shift(comm_2d, 0, -mycoords[1], &shiftsource, &shiftdest);
    MPI_Sendrecv_replace((*b)[0], mrb*mcb, MPI_DOUBLE, shiftdest, 1, shiftsource, 1, comm_2d, &status);
    MPI_Sendrecv_replace(&ib, 1, MPI_INT, shiftdest, 1, shiftsource, 1, comm_2d, &status);

    for (i = 0; i < dims[0]; i++) {
        MatrixMultiply(ra[ia], ca[ia], rb[ib], cb[ib], *a, *b, c); /* c = c + a*b */
        /* Shift A one block to the left and B one block up each step. */
        MPI_Sendrecv_replace((*a)[0], mra*mca, MPI_DOUBLE, leftrank, 1, rightrank, 1, comm_2d, &status);
        MPI_Sendrecv_replace((*b)[0], mrb*mcb, MPI_DOUBLE, uprank, 1, downrank, 1, comm_2d, &status);
        MPI_Sendrecv_replace(&ia, 1, MPI_INT, leftrank, 1, rightrank, 1, comm_2d, &status);
        MPI_Sendrecv_replace(&ib, 1, MPI_INT, uprank, 1, downrank, 1, comm_2d, &status);
    }

    MPI_Comm_free(&comm_2d);
}
/* Fox's algorithm for multiplying square n x n matrices */
void Fox(struct grid_info *grid, int local_n, matrix_type** local_A,
         matrix_type** local_B, matrix_type** local_C)
{
    int stage;
    const int local_n_sq = local_n * local_n;
    const int src  = (grid->my_row + 1) % grid->ppside;
    const int dest = (grid->my_row + grid->ppside - 1) % grid->ppside;
    int bcast_root;
    MPI_Status status;
    matrix_type **temp_A = matrix_new(local_n, local_n);

    for (stage = 0; stage < grid->ppside; ++stage) {
        bcast_root = (grid->my_row + stage) % grid->ppside;
        if (bcast_root == grid->my_col) {
            MPI_Bcast(*local_A, local_n_sq, MPI_FLOAT, bcast_root, grid->row_comm);
            matrix_multiply_and_add(local_A, local_B, local_C, local_n, local_n, local_n);
        } else {
            MPI_Bcast(*temp_A, local_n_sq, MPI_FLOAT, bcast_root, grid->row_comm);
            matrix_multiply_and_add(temp_A, local_B, local_C, local_n, local_n, local_n);
        }
        /* Circular shift of B along the column communicator. */
        MPI_Sendrecv_replace(*local_B, local_n_sq, MPI_FLOAT, dest, 0, src, 0,
                             grid->col_comm, &status);
    }
}
void mpi_manager_2D::do_MPISendRecv(NumMatrix<double,2> &buff, int Destination)
{
    //! Do a send-receive operation, where the send buffer is overwritten
    /*! Origin and destination are the same rank in this case. */
    // Get size of buffer:
    int size = ((buff.getHigh(1) - buff.getLow(1) + 1) *
                (buff.getHigh(0) - buff.getLow(0) + 1));

    MPI_Status status;
    int SendTag = rank;
    int RecvTag = Destination;

    // Now do the communication:
    MPI_Sendrecv_replace((double *) buff, size, MPI_DOUBLE, Destination, SendTag,
                         Destination, RecvTag, comm2d, &status);
}
int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int size, rank, i, dest, source, sum, temp_val;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    dest = (rank + 1) % size;
    source = (rank - 1 + size) % size;  /* add size so rank 0 does not get a negative source */
    sum = temp_val = rank;

    /* Pass the value around the ring; after size-1 shifts every rank has seen every value.
       (The original passed an uninitialized MPI_Status*; MPI_STATUS_IGNORE is used instead.) */
    for (i = 1; i < size; i++) {
        MPI_Sendrecv_replace(&temp_val, 1, MPI_INT, dest, 0, source, 0,
                             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        sum += temp_val;
    }
    printf("Process %d: %d\n", rank, sum);

    MPI_Finalize();
    return EXIT_SUCCESS;
}
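/* Aside (not part of the original snippet): the ring accumulation above computes the
 * same result that a single reduction collective would produce. A minimal comparison
 * sketch, shown only to contrast the hand-rolled ring with the built-in collective: */
int ring_sum_via_allreduce(int rank, MPI_Comm comm)
{
    int sum = 0;
    /* Every rank contributes its own rank number; MPI_SUM combines them on all ranks. */
    MPI_Allreduce(&rank, &sum, 1, MPI_INT, MPI_SUM, comm);
    return sum;
}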
double* multiply_matrix_by_vector(double* matrix, double* vector) {
    int vector_length = c_recvcounts[rank];
    int offset = c_displs[rank];
    int incoming_process_data = 0;
    double* result = (double*) calloc(vector_length, sizeof(double));
    double* v = (double*) calloc(N/size + 1, sizeof(double));

    for (int i = 0; i < vector_length; i++) {
        v[i] = vector[i];
    }

    for (int process = 0; process < size; process++) {
        // index of the process whose vector part is currently held in v
        incoming_process_data = (rank + process) % size;
        for (int i = 0; i < c_recvcounts[rank]; i++) {
            for (int j = 0; j < c_recvcounts[incoming_process_data]; j++) {
                result[i] += matrix[i * N + j + c_displs[incoming_process_data]] * v[j];
            }
        }
        // rotate the vector part (and implicitly its length/offset index) to the next process;
        // adding size before the modulo avoids a negative source rank for rank 0
        MPI_Sendrecv_replace(v, N/size + 1, MPI_DOUBLE, (rank + 1) % size, TAG_SEND_MATRIX,
                             (rank - 1 + size) % size, TAG_SEND_MATRIX,
                             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }

    free(v);  /* release the temporary shift buffer */
    return result;
}
int MPIX_Sendrecv_replace_x(void *buf, MPI_Count count, MPI_Datatype datatype,
                            int dest, int sendtag, int source, int recvtag,
                            MPI_Comm comm, MPI_Status *status)
{
    int rc = MPI_SUCCESS;
    if (likely (count <= bigmpi_int_max)) {
        /* Count fits in an int, so the plain MPI call can be used directly. */
        rc = MPI_Sendrecv_replace(buf, (int)count, datatype, dest, sendtag,
                                  source, recvtag, comm, status);
    } else {
        /* Wrap the large count in a single contiguous derived datatype. */
        MPI_Datatype newtype;
        BigMPI_Type_contiguous(0, count, datatype, &newtype);
        MPI_Type_commit(&newtype);
        rc = MPI_Sendrecv_replace(buf, 1, newtype, dest, sendtag,
                                  source, recvtag, comm, status);
        MPI_Type_free(&newtype);
    }
    return rc;
}
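/* Hypothetical usage sketch (not from the original source): exchanging a buffer whose
 * element count may exceed INT_MAX through the wrapper above. The partner rank, the
 * helper name exchange_large, and the tag value are illustrative assumptions. */
#include <mpi.h>

void exchange_large(MPI_Comm comm, int partner, double *buf, MPI_Count n)
{
    /* The wrapper decides internally whether a derived datatype is needed. */
    MPIX_Sendrecv_replace_x(buf, n, MPI_DOUBLE, partner, /* sendtag */ 0,
                            partner, /* recvtag */ 0, comm, MPI_STATUS_IGNORE);
}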
void mpi_manager_2D::do_MPISendRecv(NumMatrix<double,2> &buff, int Source, int Destination)
{
    //! Do a send-receive operation, where the send buffer is overwritten
    /*! Get data from somewhere and send own data somewhere else.
        The original data will be overwritten. */
    // Get size of buffer:
    int size = ((buff.getHigh(1) - buff.getLow(1) + 1) *
                (buff.getHigh(0) - buff.getLow(0) + 1));

    MPI_Status status;
    int SendTag = rank;
    int RecvTag = Source;

    // Now do the communication:
    MPI_Sendrecv_replace((double *) buff, size, MPI_DOUBLE, Destination, SendTag,
                         Source, RecvTag, comm2d, &status);
}
FORT_DLL_SPEC void FORT_CALL mpi_sendrecv_replace_ ( void*v1, MPI_Fint *v2, MPI_Fint *v3, MPI_Fint *v4, MPI_Fint *v5, MPI_Fint *v6, MPI_Fint *v7, MPI_Fint *v8, MPI_Fint *v9, MPI_Fint *ierr ){ #ifndef HAVE_MPI_F_INIT_WORKS_WITH_C if (MPIR_F_NeedInit){ mpirinitf_(); MPIR_F_NeedInit = 0; } #endif if (v9 == MPI_F_STATUS_IGNORE) { v9 = (MPI_Fint*)MPI_STATUS_IGNORE; } *ierr = MPI_Sendrecv_replace( v1, *v2, (MPI_Datatype)(*v3), *v4, *v5, *v6, *v7, (MPI_Comm)(*v8), (MPI_Status *)v9 ); }
/*-------------------------------------------------------------------------------*/
void OneStepCirculation(int step)
{
    MPI_Status status;
    /* Rotate the local slice of A by one process around the ring. */
    MPI_Sendrecv_replace(A_Slice, SIZE * LOCAL_SIZE, MPI_DOUBLE,
                         ((Me - 1) + NbPE) % NbPE, 0,
                         (Me + 1) % NbPE, 0,
                         MPI_COMM_WORLD, &status);
    /******************************** TO DO ******************************************/
}
void prod_matrix(int N, int Nb, int myrank, double* bl_a, double* bl_b, double* bl_c,
                 MPI_Comm comm_grid, MPI_Comm comm_col, MPI_Comm comm_row)
{
    int k;
    double my_a[Nb*Nb];
    int coords[2];
    MPI_Status st;
    int gd = N/Nb;

    /* Keep a private copy of the local A block; bl_a doubles as the broadcast buffer below. */
    for (int i = 0; i < Nb*Nb; ++i) {
        my_a[i] = bl_a[i];
    }

    MPI_Cart_coords(comm_grid, myrank, 2, coords);
    int sndto    = (((coords[0] - 1) % gd) + gd) % gd;
    int recvfrom = (coords[0] + 1) % gd;
    int myrow = coords[0];
    int mycol = coords[1];

    for (k = 0; k < gd; k++) {
        /* If I am the (myrow+k)%gd process of my row, broadcast my A block to the row;
           otherwise receive that block from it. */
        if (mycol == (myrow + k) % gd) {
            for (int i = 0; i < Nb*Nb; ++i) {
                bl_a[i] = my_a[i];
            }
        }
        MPI_Bcast(bl_a, Nb*Nb, MPI_DOUBLE, (myrow + k) % gd, comm_row);

        cblas_dgemm_scalaire(Nb, bl_a, bl_b, bl_c);  /* Cij += A[i][(i+k)%N] * B[(i+k)%N][j] */

        /* Send B to the upper neighbour and receive the next B block from below. */
        MPI_Sendrecv_replace(bl_b, Nb*Nb, MPI_DOUBLE, sndto, 0, recvfrom, 0, comm_col, &st);
    }
}
void Fox(
        int              n        /* in  */,
        GRID_INFO_T*     grid     /* in  */,
        LOCAL_MATRIX_T*  local_A  /* in  */,
        LOCAL_MATRIX_T*  local_B  /* in  */,
        LOCAL_MATRIX_T*  local_C  /* out */)
{
    LOCAL_MATRIX_T*  temp_A;     /* Storage for the sub-matrix of A used during
                                    the current stage                           */
    int              stage;
    int              bcast_root;
    int              n_bar;      /* n/sqrt(p) */
    int              source;
    int              dest;
    MPI_Status       status;

    n_bar = n/grid->q;
    Set_to_zero(local_C);

    /* Calculate addresses for circular shift of B */
    source = (grid->my_row + 1) % grid->q;
    dest   = (grid->my_row + grid->q - 1) % grid->q;

    /* Set aside storage for the broadcast block of A */
    temp_A = Local_matrix_allocate(n_bar);

    for (stage = 0; stage < grid->q; stage++) {
        bcast_root = (grid->my_row + stage) % grid->q;
        if (bcast_root == grid->my_col) {
            MPI_Bcast(local_A, 1, local_matrix_mpi_t, bcast_root, grid->row_comm);
            Local_matrix_multiply(local_A, local_B, local_C);
        } else {
            MPI_Bcast(temp_A, 1, local_matrix_mpi_t, bcast_root, grid->row_comm);
            Local_matrix_multiply(temp_A, local_B, local_C);
        }
        MPI_Sendrecv_replace(local_B, 1, local_matrix_mpi_t, dest, 0, source, 0,
                             grid->col_comm, &status);
    } /* for */
} /* Fox */
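/* The Fox routine above relies on a process-grid descriptor that is set up elsewhere
 * and is not shown here. A plausible layout, inferred only from the fields the routine
 * touches (q, my_row, my_col, row_comm, col_comm); the fields marked "assumed" are
 * guesses and may differ in the original source. */
typedef struct {
    int      p;         /* total number of processes (assumed field)          */
    MPI_Comm comm;      /* communicator for the whole grid (assumed field)    */
    MPI_Comm row_comm;  /* communicator for this process's row                */
    MPI_Comm col_comm;  /* communicator for this process's column             */
    int      q;         /* order of the grid: q = sqrt(p)                     */
    int      my_row;    /* this process's row coordinate                      */
    int      my_col;    /* this process's column coordinate                   */
    int      my_rank;   /* rank within the grid communicator (assumed field)  */
} GRID_INFO_T;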
void switchbuff(float *buff, int neighbor, int ndata)
{
    MPI_Status status;
    int tag = 0;

    /* Synchronise, then swap the buffer contents in place with the neighbouring rank. */
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Sendrecv_replace(buff, ndata, MPI_FLOAT, neighbor, tag, neighbor, tag,
                         MPI_COMM_WORLD, &status);
}
int main (int argc, char **argv)
{
    // initialize MPI
    MPI_Init (&argc, &argv);

    // we have to remember the number of PEs
    int numpes;
    MPI_Comm_size (MPI_COMM_WORLD, &numpes);

    // for this we need 2 PEs
    assert(numpes == 2);

    // which rank does this process have?
    int myid;
    MPI_Comm_rank (MPI_COMM_WORLD, &myid);

    // deadlock avoidance: PE 0 sends and receives using the same function call,
    // PE 1 uses its own buffer to avoid blocking on send.
    if (myid == 0) {
        // send message to 1, wait for message from 1
        char buf[10000];
        MPI_Status stat;
        MPI_Sendrecv_replace (buf, 10000, MPI_CHAR, 1, 0, 1, 0, MPI_COMM_WORLD, &stat);
        printf ("0: done\n");
    } else {
        // send message to 0, wait for message from 0
        char buf[10000];
        char intermediate_buffer[10000 + MPI_BSEND_OVERHEAD];
        MPI_Buffer_attach (intermediate_buffer, 10000 + MPI_BSEND_OVERHEAD);
        MPI_Status stat;
        MPI_Bsend (buf, 10000, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
        // we can use buf again, as intermediate_buffer will take care of buffering
        MPI_Recv (buf, 10000, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &stat);
        printf ("1: done\n");
    }

    MPI_Finalize ();
    return EXIT_SUCCESS;
}
void mpi_sendrecv_replace_f(char *buf, MPI_Fint *count, MPI_Fint *datatype, MPI_Fint *dest, MPI_Fint *sendtag, MPI_Fint *source, MPI_Fint *recvtag, MPI_Fint *comm, MPI_Fint *status, MPI_Fint *ierr) { MPI_Datatype c_type = MPI_Type_f2c(*datatype); MPI_Comm c_comm; MPI_Status c_status; c_comm = MPI_Comm_f2c (*comm); *ierr = OMPI_INT_2_FINT(MPI_Sendrecv_replace(OMPI_F2C_BOTTOM(buf), OMPI_FINT_2_INT(*count), c_type, OMPI_FINT_2_INT(*dest), OMPI_FINT_2_INT(*sendtag), OMPI_FINT_2_INT(*source), OMPI_FINT_2_INT(*recvtag), c_comm, &c_status)); if (MPI_SUCCESS == OMPI_FINT_2_INT(*ierr) && !OMPI_IS_FORTRAN_STATUS_IGNORE(status)) { MPI_Status_c2f(&c_status, status); } }
main(int argc, char * argv[])
{
    srand(time(NULL));
    int my_rank, size, stage, temp;
    int n, local_n, i, j, k, source, dest, q, ind;
    float *matrix_A;
    float *matrix_B;
    float *matrix_C;
    float *local_A;
    float *local_B;
    float *local_C;
    double start, end;
    MPI_Datatype column;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (my_rank == 0) {
        printf("\t\t****************************************************\n");
        printf("\t\t*    Block-Striped Decomposition by Rows           *\n");
        printf("\t\t****************************************************\n\n");
    }

    if (my_rank == 0) {
        matrix_A = (float *)malloc(MAX*MAX*sizeof(float));
        matrix_B = (float *)malloc(MAX*MAX*sizeof(float));
        matrix_C = (float *)malloc(MAX*MAX*sizeof(float));

        if (argc == 2) {
            sscanf(argv[1], "%d", &n);
        } else {
            printf("What is the order of the matrices?\n");
            scanf("%d", &n);
        }
        local_n = n / size;

        /* Read matrix A */
        Read_matrix ("Enter A :", matrix_A, n);
        Print_matrix ("Read A :", matrix_A, n);

        /* Read matrix B */
        Read_matrix ("Enter B :", matrix_B, n);
        Print_matrix ("Read B :", matrix_B, n);
    }

    MPI_Bcast(&local_n, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    local_A = (float *) malloc(MAX*MAX*sizeof(float));
    local_B = (float *) malloc(MAX*MAX*sizeof(float));
    /* local_C is accumulated into below, so it must start at zero
       (the original used malloc here, leaving the accumulator uninitialized). */
    local_C = (float *) calloc(MAX*MAX, sizeof(float));

    /******************************************************************************/
    /* Distribute the row blocks of A and the row blocks of B among the processes
       of the global communicator */
    // Send the row blocks of A and B to all processes
    MPI_Scatter(matrix_A, local_n*n, MPI_FLOAT, local_A, local_n*n, MPI_FLOAT, 0, MPI_COMM_WORLD);
    MPI_Scatter(matrix_B, local_n*n, MPI_FLOAT, local_B, local_n*n, MPI_FLOAT, 0, MPI_COMM_WORLD);

    /*****************************************************************************/
    /* Block-striped decomposition algorithm: by rows for matrix A and by columns
       for matrix B */
    source = (my_rank - 1 + size) % size;
    dest = (my_rank + 1) % size;
    q = n / local_n;

    start = MPI_Wtime();
    for (stage = 0; stage < q; stage++) {
        ind = (my_rank - stage + size) % size;
        for (j = 0; j < local_n; j++) {
            for (i = 0; i < n; i++) {
                for (k = 0; k < local_n; k++) {
                    local_C[i + j*n] += local_A[local_n*ind + k + j*n] * local_B[i + k*n];
                }
            }
        }
        /* Rotate the B block to the next process in the ring. */
        MPI_Sendrecv_replace(local_B, local_n*n, MPI_FLOAT, dest, 0, source, 0, MPI_COMM_WORLD, &status);
    }
    end = MPI_Wtime();

    /*****************************************************************************/
    // Gather the local_C blocks from every process into matrix_C on process 0
    MPI_Gather(local_C, local_n*n, MPI_FLOAT, matrix_C, local_n*n, MPI_FLOAT, 0, MPI_COMM_WORLD);

    if (my_rank == 0) {
        Print_matrix ("The product is C : ", matrix_C, n);
    }
    if (my_rank == 0)
        printf("n : %d\nRow-wise decomposition : %f seconds.\n", n, end - start);

    MPI_Finalize();
} /* main */
void __entry stencil_thread( void* p ) { my_args_t* pargs = (my_args_t*)p; int i,j; int NI = pargs->ni; int NJ = pargs->nj; int di = pargs->di; int dj = pargs->dj; int niter = pargs->niter; float* A = pargs->A; float* B = pargs->B; float w0 = pargs->w0; float w1 = pargs->w1; float w2 = pargs->w2; float w3 = pargs->w3; float w4 = pargs->w4; int myrank_2d, mycoords[2]; int dims[2] = {di, dj}; int periods[2] = {1, 1}; // Periodic communication but ignoring edge copy where irrelvant MPI_Status status; MPI_Init(0,MPI_BUF_SIZE); MPI_Comm comm = MPI_COMM_THREAD; MPI_Comm comm_2d; MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d); MPI_Comm_rank(comm_2d, &myrank_2d); MPI_Cart_coords(comm_2d, myrank_2d, 2, mycoords); int x = mycoords[0]; int y = mycoords[1]; // ranks of neighbors int north, south, west, east; MPI_Cart_shift(comm_2d, 0, 1, &west, &east); MPI_Cart_shift(comm_2d, 1, 1, &north, &south); // local stencil sizes with padding int ni = (NI-2) / di + 2; int nj = (NJ-2) / dj + 2; // Load the initial values void* memfree = coprthr_tls_sbrk(0); float* a = (float*)coprthr_tls_sbrk(ni*nj*sizeof(float)); float* b = (float*)coprthr_tls_sbrk(ni*nj*sizeof(float)); float* nsbuf = (float*)coprthr_tls_sbrk(ni*sizeof(float)); float* webuf = (float*)coprthr_tls_sbrk((nj-2)*sizeof(float)); long long* srcadr; long long* dstadr; long long* nsend = (long long*)(nsbuf + ni); // Copy initial conditions (2D DMA would be better) for (j=0; j<nj; j++) e_dma_copy(a+j*ni, A + (y*(ni-2)+j)*NI+x*(nj-2), ni*sizeof(float)); // Initial conditions // if(y==0) for (i=0; i<ni-2; i++) a[i] = -2.0f; // if(y==dj) for (i=0; i<ni-2; i++) a[(nj-1)*ni+i] = 1.0f; // if(x==di) for (j=0; j<nj-2; j++) a[(j+2)*ni-1] = -1.0f; // if(x==0) for (j=0; j<nj-2; j++) a[(j+1)*ni] = 2.0f; // Copy "a" into "b" (only need fixed borders would be better) for (i=0; i<ni*nj; i++) b[i] = a[i]; while (niter--) { /* for (j=1; j<nj-1; j++) { for (i=1; i<ni-1; i++) { b[j*ni+i] = w0*a[j*ni+i-1] + w1*a[j*ni+i] + w2*a[j*ni+i+1] + w3*a[j*ni+i-ni] + w4*a[j*ni+i+ni]; } }*/ for (j=0; j<nj-2; j+=4) { float a14 = a[(j+1)*ni+0]; float a15 = a[(j+1)*ni+1]; float a24 = a[(j+2)*ni+0]; float a25 = a[(j+2)*ni+1]; float a34 = a[(j+3)*ni+0]; float a35 = a[(j+3)*ni+1]; float a44 = a[(j+4)*ni+0]; float a45 = a[(j+4)*ni+1]; for (i=0; i<ni-2; i+=4) { float a01 = a[(j+0)*ni+i+1]; float a02 = a[(j+0)*ni+i+2]; float a03 = a[(j+0)*ni+i+3]; float a04 = a[(j+0)*ni+i+4]; float a10 = a14; float a11 = a15; float a12 = a[(j+1)*ni+i+2]; float a13 = a[(j+1)*ni+i+3]; a14 = a[(j+1)*ni+i+4]; a15 = a[(j+1)*ni+i+5]; float a20 = a24; float a21 = a25; float a22 = a[(j+2)*ni+i+2]; float a23 = a[(j+2)*ni+i+3]; a24 = a[(j+2)*ni+i+4]; a25 = a[(j+2)*ni+i+5]; float a30 = a34; float a31 = a35; float a32 = a[(j+3)*ni+i+2]; float a33 = a[(j+3)*ni+i+3]; a34 = a[(j+3)*ni+i+4]; a35 = a[(j+3)*ni+i+5]; float a40 = a44; float a41 = a45; float a42 = a[(j+4)*ni+i+2]; float a43 = a[(j+4)*ni+i+3]; a44 = a[(j+4)*ni+i+4]; a45 = a[(j+4)*ni+i+5]; float a51 = a[(j+5)*ni+i+1]; float a52 = a[(j+5)*ni+i+2]; float a53 = a[(j+5)*ni+i+3]; float a54 = a[(j+5)*ni+i+4]; b[(j+1)*ni+i+1] = fma(w4,a21,fma(w3,a01,fma(w2,a12,fma(w1,a11,w0*a10)))); b[(j+1)*ni+i+2] = fma(w4,a22,fma(w3,a02,fma(w2,a13,fma(w1,a12,w0*a11)))); b[(j+1)*ni+i+3] = fma(w4,a23,fma(w3,a03,fma(w2,a14,fma(w1,a13,w0*a12)))); b[(j+1)*ni+i+4] = fma(w4,a24,fma(w3,a04,fma(w2,a15,fma(w1,a14,w0*a13)))); b[(j+2)*ni+i+1] = fma(w4,a31,fma(w3,a11,fma(w2,a22,fma(w1,a21,w0*a20)))); b[(j+2)*ni+i+2] = fma(w4,a32,fma(w3,a12,fma(w2,a23,fma(w1,a22,w0*a21)))); b[(j+2)*ni+i+3] = 
fma(w4,a33,fma(w3,a13,fma(w2,a24,fma(w1,a23,w0*a22)))); b[(j+2)*ni+i+4] = fma(w4,a34,fma(w3,a14,fma(w2,a25,fma(w1,a24,w0*a23)))); b[(j+3)*ni+i+1] = fma(w4,a41,fma(w3,a21,fma(w2,a32,fma(w1,a31,w0*a30)))); b[(j+3)*ni+i+2] = fma(w4,a42,fma(w3,a22,fma(w2,a33,fma(w1,a32,w0*a31)))); b[(j+3)*ni+i+3] = fma(w4,a43,fma(w3,a23,fma(w2,a34,fma(w1,a33,w0*a32)))); b[(j+3)*ni+i+4] = fma(w4,a44,fma(w3,a24,fma(w2,a35,fma(w1,a34,w0*a33)))); b[(j+4)*ni+i+1] = fma(w4,a51,fma(w3,a31,fma(w2,a42,fma(w1,a41,w0*a40)))); b[(j+4)*ni+i+2] = fma(w4,a52,fma(w3,a32,fma(w2,a43,fma(w1,a42,w0*a41)))); b[(j+4)*ni+i+3] = fma(w4,a53,fma(w3,a33,fma(w2,a44,fma(w1,a43,w0*a42)))); b[(j+4)*ni+i+4] = fma(w4,a54,fma(w3,a34,fma(w2,a45,fma(w1,a44,w0*a43)))); } } // north/south dstadr = (long long*)nsbuf; srcadr = (long long*)(b+ni); while (dstadr != nsend) *dstadr++ = *srcadr++; // second row MPI_Sendrecv_replace(nsbuf, ni, MPI_FLOAT, north, 1, south, 1, comm, &status); if (y!=dj-1) { dstadr = (long long*)(b+(nj-1)*ni); srcadr = (long long*)nsbuf; while (srcadr != nsend) *dstadr++ = *srcadr++; // last row } dstadr = (long long*)nsbuf; srcadr = (long long*)(b+(nj-2)*ni); while (dstadr != nsend) *dstadr++ = *srcadr++; // second to last row MPI_Sendrecv_replace(nsbuf, ni, MPI_FLOAT, south, 1, north, 1, comm, &status); if (y) { dstadr = (long long*)b; srcadr = (long long*)nsbuf; while (srcadr != nsend) *dstadr++ = *srcadr++; // first row } // west/east for (j=0; j<nj-2; j++) webuf[j] = b[(j+1)*ni+1]; // second column MPI_Sendrecv_replace(webuf, nj-2, MPI_FLOAT, west, 1, east, 1, comm, &status); if (x!=di-1) for (j=0; j<nj-2; j++) b[(j+2)*ni-1] = webuf[j]; // last column for (j=0; j<nj-2; j++) webuf[j] = b[(j+2)*ni-2]; // second to last column MPI_Sendrecv_replace(webuf, nj-2, MPI_FLOAT, east, 1, west, 1, comm, &status); if (x) for (j=0; j<nj-2; j++) b[(j+1)*ni] = webuf[j]; // first column float* tmp = b; b = a; a = tmp; } // Copy internal results for (j=1; j<nj-1; j++) e_dma_copy(B + (y*(ni-2)+j)*NI+x*(nj-2)+1, a+j*ni+1, (ni-2)*sizeof(float)); coprthr_tls_brk(memfree); MPI_Finalize(); }
__kernel void my_thread( void* p) { my_args_t* pargs = (my_args_t*)p; int N = pargs->N, s = pargs->s, d = pargs->d; float *ga = pargs->ga, *gb = pargs->gb, *gc = pargs->gc; int n = N/d; int myrank_2d, mycoords[2]; int dims[2] = {d, d}; int periods[2] = {1, 1}; MPI_Status status; MPI_Init(0,MPI_BUF_SIZE); MPI_Comm comm = MPI_COMM_THREAD; MPI_Comm comm_2d; MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d); MPI_Comm_rank(comm_2d, &myrank_2d); MPI_Cart_coords(comm_2d, myrank_2d, 2, mycoords); // Compute ranks of the up and left shifts int uprank, downrank, leftrank, rightrank; MPI_Cart_shift(comm_2d, 0, 1, &leftrank, &rightrank); MPI_Cart_shift(comm_2d, 1, 1, &uprank, &downrank); int x = mycoords[0]; int y = mycoords[1]; // this removes initial skew shift by reading in directly int skew = (x+y) % d; void* memfree = coprthr_tls_sbrk(0); float* a = (float*)coprthr_tls_sbrk(n*n*sizeof(float)); float* b = (float*)coprthr_tls_sbrk(n*n*sizeof(float)); float* c = (float*)coprthr_tls_sbrk(n*n*sizeof(float)); e_dma_desc_t dma_c_read, dma_c_write, dma_a_read, dma_b_read; #define DWORD_WRITE(desc,w,h,W,src,dst) \ e_dma_set_desc(E_DMA_0, (E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000, \ 0x0008, 0x0008, \ w/2, h, \ 8, 4*(W-w+2), \ (void*)src, (void*)dst, &desc) #define DWORD_READ(desc,w,h,W,src,dst) \ e_dma_set_desc(E_DMA_0, (E_DMA_ENABLE|E_DMA_MASTER|E_DMA_DWORD), 0x0000, \ 0x0008, 0x0008, \ w/2, h, \ 4*(W-w+2), 8, \ (void*)src, (void*)dst, &desc) int loop; for(loop=0;loop<LOOP1;loop++) { int i,j,k,l; for (i=0; i<s; i++) { for (j=0; j<s; j++) { float* rgc = gc + ((i*N + y*n)*s + j)*N + x*n; DWORD_WRITE(dma_c_write,n,n,s*N,c,rgc); DWORD_READ(dma_c_read,n,n,s*N,rgc,c); // read C e_dma_start(&dma_c_read, E_DMA_0); e_dma_wait(E_DMA_0); for (k=0; k<s; k++) { float* rga = ga + ((i*N + y*n)*s + k)*N + skew*n; float* rgb = gb + ((k*N + skew*n)*s + j)*N + x*n; // read A and B DWORD_READ(dma_b_read,n,n,s*N,rgb,b); DWORD_READ(dma_a_read,n,n,s*N,rga,a); e_dma_start(&dma_b_read, E_DMA_0); e_dma_wait(E_DMA_0); e_dma_start(&dma_a_read, E_DMA_0); e_dma_wait(E_DMA_0); // transpose B int ji, ii; for (ji=0; ji<n-1; ji++) { for(ii=ji+1; ii<n; ii++) { int tmp = b[ji*n+ii]; b[ji*n+ii] = b[ii*n+ji]; b[ii*n+ji] = tmp; } } int loop; for (loop=0;loop<LOOP3;loop++) { // Get into the main computation loop for (l=1; l<d; l++) { int loop; for(loop=0;loop<LOOP2;loop++) MatrixMultiply(n, a, b, c); // Shift matrix a left by one and shift matrix b up by one MPI_Sendrecv_replace(a, n*n, MPI_FLOAT, leftrank, 1, rightrank, 1, comm_2d, &status); MPI_Sendrecv_replace(b, n*n, MPI_FLOAT, uprank, 1, downrank, 1, comm_2d, &status); } MatrixMultiply(n, a, b, c); } // end LOOP3 } // write C e_dma_start(&dma_c_write, E_DMA_1); e_dma_wait(E_DMA_1); } } } // end LOOP1 coprthr_tls_brk(memfree); MPI_Finalize(); }
__kernel void nbody_thread( void* p ) { my_args_t* pargs = (my_args_t*)p; int n = pargs->n; int cnt = pargs->cnt; unsigned int s_x, s_y, s_z, s_m; unsigned int page = 0; float dt = pargs->dt; float es = pargs->es; Particle *particles = pargs->p; ParticleV *state = pargs->v; int rank, size, npart, i; int left, right; MPI_Status status; MPI_Init(0,MPI_BUF_SIZE); MPI_Comm comm = MPI_COMM_THREAD; MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &size); MPI_Cart_shift(comm, 0, 1, &left, &right); npart = n / size; void* memfree = coprthr_tls_sbrk(0); Particle* my_particles = (Particle*)coprthr_tls_sbrk(npart*sizeof(Particle)); ParticleV* my_state = (ParticleV*)coprthr_tls_sbrk(npart*sizeof(ParticleV)); Particle* sendbuf = (Particle*)coprthr_tls_sbrk(npart*sizeof(Particle)); e_dma_copy(my_particles, particles + npart*rank, npart*sizeof(Particle)); e_dma_copy(my_state, state + npart*rank, npart*sizeof(ParticleV)); unsigned int rgba_black = 0x00000000; unsigned int rgba_white = 0x00ffffff; while (cnt--) { for (i=0; i<npart; i++) sendbuf[i] = my_particles[i]; for (i=0; i<size; i++) { if (i) MPI_Sendrecv_replace(sendbuf, sizeof(Particle)/sizeof(float)*npart, MPI_FLOAT, left, 1, right, 1, comm, &status); ComputeAccel(my_particles, sendbuf, my_state, npart, es); } e_dma_copy(particles + npart*rank, my_particles, npart*sizeof(Particle)); ComputeNewPos(my_particles, my_state, npart, dt); for(i = 0; i < npart; i++){ s_x = (int) particles[i + npart*rank].x; s_y = (int) particles[i + npart*rank].y; if(s_x >= 0 && s_x < pargs->fbinfo.xres_virtual && s_y >= 0 && s_y < pargs->fbinfo.yres_virtual){ e_dma_copy((char *) pargs->fbinfo.smem_start + (s_y * pargs->fbinfo.line_length) + (s_x * BPP), (char *) &rgba_black, 1 * BPP); } s_x = (int) my_particles[i].x; s_y = (int) my_particles[i].y; if(cnt > 1 && s_x >= 0 && s_x < pargs->fbinfo.xres_virtual && s_y >= 0 && s_y < pargs->fbinfo.yres_virtual){ e_dma_copy((char *) pargs->fbinfo.smem_start + (s_y * pargs->fbinfo.line_length) + (s_x * BPP), (char *) &rgba_white, 1 * BPP); } } } coprthr_tls_brk(memfree); MPI_Finalize(); }
int main(int argc, char **argv) { int rank, M, j,i, *d_graph; int *local_matrix, *row_matrix, *col_matrix, *res_matrix, *rowIds, *colIds; int P, N, q, p_row, p_col; double start, finish; MPI_Status status; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &P); MPI_Comm_rank(MPI_COMM_WORLD, &rank); //INPUT HANDLED BY THE ROOT PROCESSOR if (rank == ROOT){ scanf("%d", &N); q = check_fox_conditions(P,N); //Check's if the fox's conditions are met if(q == 0){ MPI_Abort(MPI_COMM_WORLD, 0); return 1; //error } d_graph = (int*)malloc((N*N) * sizeof(int)); for(i=0; i < N; i++){ for(j=0; j < N; j++){ scanf("%d", &d_graph[GET_MTRX_POS(i,j,N)]); if (d_graph[GET_MTRX_POS(i,j,N)] == 0 && i != j) { d_graph[GET_MTRX_POS(i,j,N)] = INF; } } } MPI_Bcast(&q, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD); if(q > 1) divide_matrix( d_graph, N, q); } else{ MPI_Bcast(&q, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD); } //---------------COMMON------------------ int lngth = N / q; local_matrix = (int*)malloc((lngth*lngth) * sizeof(int)); row_matrix = (int*)malloc((lngth*lngth) * sizeof(int)); col_matrix = (int*)malloc((lngth*lngth) * sizeof(int)); res_matrix = (int*)malloc((lngth*lngth) * sizeof(int)); if(q>1) chnkd_MPI_Recv(local_matrix, lngth*lngth, MPI_INT, 0); else local_matrix = d_graph; p_row = ( rank / q ); p_col = ( rank % q ); //CREATE COMMUNICATORS MPI_Group MPI_GROUP_WORLD; MPI_Comm_group(MPI_COMM_WORLD, &MPI_GROUP_WORLD); MPI_Group row_group, col_group; MPI_Comm row_comm, col_comm, grid_comm; int tmp_row, tmp_col, proc; int row_process_ranks[q], col_process_ranks[q]; for(proc = 0; proc < q; proc++){ row_process_ranks[proc] = (p_row * q) + proc; col_process_ranks[proc] = ((p_col + proc*q) %(q*q)); } radixsort(col_process_ranks, q); radixsort(row_process_ranks, q); MPI_Group_incl(MPI_GROUP_WORLD, q, row_process_ranks, &row_group); MPI_Group_incl(MPI_GROUP_WORLD, q, col_process_ranks, &col_group); MPI_Comm_create(MPI_COMM_WORLD, row_group, &row_comm); MPI_Comm_create(MPI_COMM_WORLD, col_group, &col_comm); if ((rank / q) == (rank % q)) { memcpy(row_matrix, local_matrix, (lngth*lngth) * sizeof(int)); } int ln,d,flag; int step, rotation_src, rotation_dest, src; int count = 0; memcpy(res_matrix, local_matrix, (lngth*lngth) * sizeof(int)); rotation_src = (p_row + 1) % q; rotation_dest = ((p_row - 1) + q) % q; ln = (lngth*q) << 1; start = MPI_Wtime(); for (d = 2; d < ln; d = d << 1) { memcpy(col_matrix, local_matrix, (lngth*lngth) * sizeof(int)); for ( step = 0; step < q; step++) { src = (p_row + step) % q; count++; if (src == p_col) { MPI_Bcast(local_matrix, lngth*lngth, MPI_INT, src, row_comm); floyd_warshall( local_matrix, col_matrix, res_matrix, lngth); } else { MPI_Bcast(row_matrix, lngth*lngth, MPI_INT, src, row_comm); floyd_warshall( row_matrix, col_matrix, res_matrix, lngth); } if( step < q-1) MPI_Sendrecv_replace(col_matrix, lngth*lngth, MPI_INT, rotation_dest, STD_TAG,rotation_src, STD_TAG, col_comm, MPI_STATUS_IGNORE); } memcpy(local_matrix, res_matrix, (lngth*lngth) * sizeof(int)); } int *sol; sol = malloc(N*N*sizeof(int)); MPI_Gather(res_matrix, lngth*lngth, MPI_INT, sol, lngth*lngth, MPI_INT, 0, MPI_COMM_WORLD); if (rank == 0) { finish = MPI_Wtime(); printf("Tempo de execução %f\n",finish - start); } if (rank == 0) { int row, col, pos_x, pos_y, pos, tmp_y, tmp_x; for (i = 0; i < P; i++) { pos_x = i / q; pos_y = i % q; pos = i * lngth*lngth; for (row = 0; row < lngth; row++) { for (col = 0; col < lngth; col++) { tmp_x = 
GET_MTRX_POS(pos_x,row,lngth); tmp_y = GET_MTRX_POS(pos_y,col,lngth); if (sol[GET_MTRX_POS(row,col,lngth) + pos] == INF) d_graph[GET_MTRX_POS(tmp_x,tmp_y,N)] = 0; else d_graph[GET_MTRX_POS(tmp_x,tmp_y,N)] = sol[GET_MTRX_POS(row,col,lngth) + pos]; } } } prints_matrix(d_graph,N); } MPI_Finalize(); return 0; }
int main (int argc, char **argv) { int row_receive, col_receive; int world_rank, world_size; int source, destination; double start_time, end_time; int i,j; int row_i, column_i, cycle; int rank2; MPI_Comm comm2; MPI_Init (&argc, &argv); MPI_Comm_rank (MPI_COMM_WORLD, &world_rank); MPI_Comm_size (MPI_COMM_WORLD, &world_size); double root_p; /* sqrt of no of processors */ root_p = sqrt ((double) world_size); if (NRA % (int) root_p != 0) { printf ("Please enter a processor count which a perfect square and a multiple "); MPI_Abort (MPI_COMM_WORLD, 1); } int sub_matrix = NRA / root_p; /* Need to create a grid of root p x root p processors */ dims [0] = (int) root_p; dims [1] = (int) root_p; period [0] = 1; period [1] = 1; /* Now Matrix is made up of sub-matrices of size n/root p */ double sub_A [sub_matrix][sub_matrix]; double sub_B [sub_matrix][sub_matrix]; double sub_C [sub_matrix][sub_matrix]; double sub_CT [sub_matrix][sub_matrix]; /* Now creating a cartesian topology */ MPI_Cart_create (MPI_COMM_WORLD, 2, dims, period, 0, &comm2); /* NOw getting a new rank */ MPI_Comm_rank (comm2, &world_rank); /*Determine process co ordinate based on rank */ MPI_Cart_coords (comm2, world_rank, 2, coordinates); Init_zero_Mat (sub_matrix, sub_C); if (world_rank == 0) { Matrix_init (NRA, NRA, A); Matrix_init (NRA, NRA, B); //print_Mat (NRA, A); //print_Mat (NRA, B); Init_zero_Mat (sub_matrix, sub_C); /* Let us send each portion of A and B and start multiplying */ start_time = MPI_Wtime (); for (i = 0; i < root_p; i++) { for (j = 0; j < root_p; j++) { if ( i != 0 || j != 0) { send_coordinates [0] = i; send_coordinates [1] = j; row_i = -1; int k; for (k = i * sub_matrix; k < i * sub_matrix + sub_matrix; k++) { column_i = 0; row_i ++; int l; for (l = j *sub_matrix; l < j * sub_matrix + sub_matrix; l++) { sub_A[row_i][column_i] = A[k][l]; sub_B[row_i][column_i] = B[k][l]; column_i++; } } /* Make the co ordinate reference to column and send it to processor pij */ send_coordinates [0] = i; send_coordinates [1] = ((j - i) < 0) ? (j-i) + root_p : (j-i); MPI_Cart_rank (comm2, send_coordinates, &rank2); MPI_Send (sub_A, sub_matrix * sub_matrix, MPI_DOUBLE, rank2, 1, comm2); send_coordinates [0] = ((i-j) < 0) ? 
(i-j) + root_p : i-j; send_coordinates [1] = j; MPI_Cart_rank (comm2, send_coordinates, &rank2); MPI_Send (sub_B, sub_matrix * sub_matrix, MPI_DOUBLE, rank2, 2, comm2); } } } /* NOws send to process 0 */ for (i =0 ; i<sub_matrix; i++) { for ( j = 0; j < sub_matrix; j++) { sub_A[i][j] = A[i][j]; sub_B[i][j] = B[i][j]; } } /* calculate c for matrix in process 0 */ /* Todo: use in function */ for (cycle = 0; cycle < sub_matrix; cycle++) { Matrix_mul (sub_matrix, sub_A, sub_B, sub_C); MPI_Cart_shift (comm2, 1, -1, &source, &destination); MPI_Sendrecv_replace (sub_A, sub_matrix * sub_matrix, MPI_DOUBLE,destination, 1, source, 1, comm2, &status1); MPI_Cart_shift (comm2, 0, -1, &source, &destination); MPI_Sendrecv_replace (sub_B, sub_matrix * sub_matrix, MPI_DOUBLE,destination, 2, source, 2, comm2, &status2); } } /*end of master */ else { MPI_Recv (sub_A, sub_matrix * sub_matrix, MPI_DOUBLE, 0, 1, comm2, &status1); MPI_Recv (sub_B, sub_matrix * sub_matrix, MPI_DOUBLE, 0, 2, comm2, &status2); for (cycle = 0; cycle < sub_matrix; cycle ++) { Matrix_mul (sub_matrix, sub_A, sub_B, sub_C); MPI_Cart_shift (comm2, 1, -1, &source, &destination); MPI_Sendrecv_replace (sub_A, sub_matrix * sub_matrix, MPI_DOUBLE,destination, 1, source, 1, comm2, &status1); MPI_Cart_shift (comm2, 0, -1, &source, &destination); MPI_Sendrecv_replace (sub_B, sub_matrix * sub_matrix, MPI_DOUBLE,destination, 2, source, 2, comm2, &status2); } /* send final result to process 0 */ MPI_Send (sub_C, sub_matrix * sub_matrix, MPI_DOUBLE, 0 , world_rank, comm2); } if (world_rank == 0) { Init_zero_Mat (NRA , C); int k; for (i =1; i < world_size; i++) { MPI_Recv (sub_CT, sub_matrix * sub_matrix, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,comm2, &status3); MPI_Cart_coords (comm2, status3.MPI_TAG, 2, send_coordinates); row_receive = send_coordinates [0]; col_receive = send_coordinates [1]; row_i = -1; column_i = 0; for ( j = row_receive * sub_matrix; j < row_receive * sub_matrix + sub_matrix; j++) { row_i ++; for ( k = col_receive * sub_matrix; k < col_receive * sub_matrix + sub_matrix; k++) { C[j][k] = sub_CT[row_i][column_i]; column_i ++; } column_i = 0; } } /* On process 0 */ for (i = 0; i < sub_matrix; i++) { for (j =0 ; j <sub_matrix; j++) { C[i][j] = sub_C[i][j]; } } end_time = MPI_Wtime (); double serial = Verify (NRA, A,B,C); printf ("speedup :%f s",serial/( end_time - start_time)); } MPI_Finalize (); return 0; }
/* This program is from mpich/tsuite/pt2pt and should be changed there only. It needs gcomm and dtype from mpich/tsuite, and can be run with any number of processes > 1. This version uses sendrecv and sendrecv_replace (but only in the head-to-head mode). */ int main( int argc, char **argv ) { MPI_Datatype *types; void **inbufs, **outbufs; char **names; int *counts, *bytesize, ntype; MPI_Comm comms[20]; int ncomm = 20, rank, np, partner, tag, count; int i, j, k, err, world_rank, errloc; MPI_Status status; char *obuf, *ibuf; MPI_Init( &argc, &argv ); AllocateForData( &types, &inbufs, &outbufs, &counts, &bytesize, &names, &ntype ); GenerateData( types, inbufs, outbufs, counts, bytesize, names, &ntype ); MPI_Comm_rank( MPI_COMM_WORLD, &world_rank ); MakeComms( comms, 20, &ncomm, 0 ); /* Test over a wide range of datatypes and communicators */ err = 0; for (i=0; i<ncomm; i++) { MPI_Comm_rank( comms[i], &rank ); MPI_Comm_size( comms[i], &np ); if (np < 2) continue; tag = i; if (rank == 0) partner = np - 1; if (rank == np - 1) partner = 0; for (j=0; j<ntype; j++) { if (world_rank == 0) fprintf( stdout, "Testing type %s\n", names[j] ); if (rank == 0 || rank == np - 1) { obuf = outbufs[j]; for (k=0; k<bytesize[j]; k++) obuf[k] = 0; MPI_Sendrecv( inbufs[j], counts[j], types[j], partner, tag, outbufs[j], counts[j], types[j], partner, tag, comms[i], &status ); /* Test correct */ MPI_Get_count( &status, types[j], &count ); if (count != counts[j]) { fprintf( stderr, "Error in counts (got %d expected %d) with type %s\n", count, counts[j], names[j] ); err++; } if (status.MPI_SOURCE != partner) { fprintf( stderr, "Error in source (got %d expected %d) with type %s\n", status.MPI_SOURCE, partner, names[j] ); err++; } if ((errloc = CheckData( inbufs[j], outbufs[j], bytesize[j] ))) { char *p1, *p2; fprintf( stderr, "Error in data with type %s (type %d on %d) at byte %d\n", names[j], j, world_rank, errloc - 1 ); p1 = (char *)inbufs[j]; p2 = (char *)outbufs[j]; fprintf( stderr, "Got %x expected %x\n", p1[errloc-1], p2[errloc-1] ); err++; } /* Now do sendrecv_replace */ obuf = outbufs[j]; ibuf = inbufs[j]; for (k=0; k<bytesize[j]; k++) obuf[k] = ibuf[k]; /* This would be a better test if the data was different... */ MPI_Sendrecv_replace( obuf, counts[j], types[j], partner, tag, partner, tag, comms[i], &status ); /* Test correct */ MPI_Get_count( &status, types[j], &count ); if (count != counts[j]) { fprintf( stderr, "Error in counts (got %d expected %d) with type %s\n", count, counts[j], names[j] ); err++; } if (status.MPI_SOURCE != partner) { fprintf( stderr, "Error in source (got %d expected %d) with type %s\n", status.MPI_SOURCE, partner, names[j] ); err++; } if ((errloc = CheckData( inbufs[j], outbufs[j], bytesize[j] ))) { char *p1, *p2; fprintf( stderr, "Error in data with type %s (type %d on %d) at byte %d\n", names[j], j, world_rank, errloc - 1 ); p1 = (char *)inbufs[j]; p2 = (char *)outbufs[j]; fprintf( stderr, "Got %x expected %x\n", p1[errloc-1], p2[errloc-1] ); err++; } } } } if (err > 0) { fprintf( stderr, "%d errors on %d\n", err, rank ); } FreeDatatypes( types, inbufs, outbufs, counts, bytesize, names, ntype ); FreeComms( comms, ncomm ); MPI_Finalize(); return err; }
int main (int argc, char **argv) { FILE *fp; double **A = NULL, **B = NULL, **C = NULL, *A_array = NULL, *B_array = NULL, *C_array = NULL; double *A_local_block = NULL, *B_local_block = NULL, *C_local_block = NULL; int A_rows, A_columns, A_local_block_rows, A_local_block_columns, A_local_block_size; int B_rows, B_columns, B_local_block_rows, B_local_block_columns, B_local_block_size; int rank, size, sqrt_size, matrices_a_b_dimensions[4]; MPI_Comm cartesian_grid_communicator, row_communicator, column_communicator; MPI_Status status; // used to manage the cartesian grid int dimensions[2], periods[2], coordinates[2], remain_dims[2]; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* For square mesh */ sqrt_size = (int)sqrt((double) size); if(sqrt_size * sqrt_size != size){ if( rank == 0 ) perror("need to run mpiexec with a perfect square number of processes\n"); MPI_Abort(MPI_COMM_WORLD, -1); } // create a 2D cartesian grid dimensions[0] = dimensions[1] = sqrt_size; periods[0] = periods[1] = 1; MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &cartesian_grid_communicator); MPI_Cart_coords(cartesian_grid_communicator, rank, 2, coordinates); //v COORDINATES imas shranjene koordinate procesa RANK // create a row communicator remain_dims[0] = 0; remain_dims[1] = 1; MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &row_communicator); // create a column communicator remain_dims[0] = 1; remain_dims[1] = 0; MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &column_communicator); // set time variables for different MPI parts double read_time, send_dim_time, send_blocks_time, gather_time, write_time, dod_cajt; read_time = MPI_Wtime(); // getting matrices from files at rank 0 only // example: mpiexec -n 64 ./cannon matrix1 matrix2 [test] if (rank == 0){ int row, column; if ((fp = fopen (argv[1], "r")) != NULL){ fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[0], &matrices_a_b_dimensions[1]); A = (double **) malloc (matrices_a_b_dimensions[0] * sizeof(double *)); for (row = 0; row < matrices_a_b_dimensions[0]; row++){ A[row] = (double *) malloc(matrices_a_b_dimensions[1] * sizeof(double)); for (column = 0; column < matrices_a_b_dimensions[1]; column++) fscanf(fp, "%lf", &A[row][column]); } fclose(fp); } else { if(rank == 0) fprintf(stderr, "error opening file for matrix A (%s)\n", argv[1]); MPI_Abort(MPI_COMM_WORLD, -1); } if((fp = fopen (argv[2], "r")) != NULL){ fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[2], &matrices_a_b_dimensions[3]); B = (double **) malloc (matrices_a_b_dimensions[2] * sizeof(double *)); for(row = 0; row < matrices_a_b_dimensions[2]; row++){ B[row] = (double *) malloc(matrices_a_b_dimensions[3] * sizeof(double *)); for(column = 0; column < matrices_a_b_dimensions[3]; column++) fscanf(fp, "%lf", &B[row][column]); } fclose(fp); } else { if(rank == 0) fprintf(stderr, "error opening file for matrix B (%s)\n", argv[2]); MPI_Abort(MPI_COMM_WORLD, -1); } // need to check that the multiplication is possible given dimensions // matrices_a_b_dimensions[0] = row size of A // matrices_a_b_dimensions[1] = column size of A // matrices_a_b_dimensions[2] = row size of B // matrices_a_b_dimensions[3] = column size of B if(matrices_a_b_dimensions[1] != matrices_a_b_dimensions[2]){ if(rank == 0) fprintf(stderr, "A's column size (%d) must match B's row size (%d)\n", matrices_a_b_dimensions[1], matrices_a_b_dimensions[2]); MPI_Abort(MPI_COMM_WORLD, -1); } // this implementation is limited to cases where thematrices can be 
partitioned perfectly if( matrices_a_b_dimensions[0] % sqrt_size != 0 || matrices_a_b_dimensions[1] % sqrt_size != 0 || matrices_a_b_dimensions[2] % sqrt_size != 0 || matrices_a_b_dimensions[3] % sqrt_size != 0 ){ if(rank == 0) fprintf(stderr, "cannot distribute work evenly among processes\n" "all dimensions (A: r:%d c:%d; B: r:%d c:%d) need to be divisible by %d\n", matrices_a_b_dimensions[0],matrices_a_b_dimensions[1], matrices_a_b_dimensions[2],matrices_a_b_dimensions[3], sqrt_size ); MPI_Abort(MPI_COMM_WORLD, -1); } } read_time -= MPI_Wtime(); send_dim_time = MPI_Wtime(); // send dimensions to all peers //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% //has to be blocking, bcs data is used right afterwards... MPI_Bcast(matrices_a_b_dimensions, 4, MPI_INT, 0, cartesian_grid_communicator); //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% send_dim_time -= MPI_Wtime(); A_rows = matrices_a_b_dimensions[0]; A_columns = matrices_a_b_dimensions[1]; B_rows = matrices_a_b_dimensions[2]; B_columns = matrices_a_b_dimensions[3]; // local metadata for A A_local_block_rows = A_rows / sqrt_size; A_local_block_columns = A_columns / sqrt_size; A_local_block_size = A_local_block_rows * A_local_block_columns; A_local_block = (double *) malloc (A_local_block_size * sizeof(double)); // local metadata for B B_local_block_rows = B_rows / sqrt_size; B_local_block_columns = B_columns / sqrt_size; B_local_block_size = B_local_block_rows * B_local_block_columns; B_local_block = (double *) malloc (B_local_block_size * sizeof(double)); // local metadata for C C_local_block = (double *) malloc (A_local_block_rows * B_local_block_columns * sizeof(double)); // C needs to be initialized at 0 (accumulates partial dot-products) int i,j; for(i=0; i < A_local_block_rows * B_local_block_columns; i++){ C_local_block[i] = 0; } dod_cajt = MPI_Wtime(); // full arrays only needed at root if(rank == 0){ A_array = (double *) malloc(sizeof(double) * A_rows * A_columns); B_array = (double *) malloc(sizeof(double) * B_rows * B_columns); C_array = (double *) malloc(sizeof(double) * A_rows * B_columns); // generate the 1D arrays of the matrices at root int row, column, i, j; for (i = 0; i < sqrt_size; i++){ for (j = 0; j < sqrt_size; j++){ for (row = 0; row < A_local_block_rows; row++){ for (column = 0; column < A_local_block_columns; column++){ A_array[((i * sqrt_size + j) * A_local_block_size) + (row * A_local_block_columns) + column] = A[i * A_local_block_rows + row][j * A_local_block_columns + column]; } } for (row = 0; row < B_local_block_rows; row++){ for (column = 0; column < B_local_block_columns; column++){ B_array[((i * sqrt_size + j) * B_local_block_size) + (row * B_local_block_columns) + column] = B[i * B_local_block_rows + row][j * B_local_block_columns + column]; } } } } // allocate output matrix C C = (double **) malloc(A_rows * sizeof(double *)); for(i=0; i<A_rows ;i++){ C[i] = (double *) malloc(B_columns * sizeof(double)); } } dod_cajt -= MPI_Wtime(); //a bi mogla dat to v send blocks time? 
// send a block to each process //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% /*MPI_Scatter(A_array, A_local_block_size, MPI_DOUBLE, A_local_block, A_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator); MPI_Scatter(B_array, B_local_block_size, MPI_DOUBLE, B_local_block, B_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator); */ send_blocks_time = MPI_Wtime(); int displsA[size]; int displsB[size]; int localblsizA[size]; int localblsizB[size]; MPI_Request requests[2]; MPI_Status statuses[2]; for (i=0; i<sqrt_size; i++){ for (j=0; j<sqrt_size; j++){ displsA[i*sqrt_size + j] = (i*sqrt_size + j)*A_local_block_size; //(i*sqrt_size + (j+i)%sqrt_size)*A_local_block_size; displsB[i*sqrt_size + j] = (i*sqrt_size + j)*B_local_block_size; //(j + ((j+i)%size)*sqrt_size)*B_local_block_size; localblsizA[i*sqrt_size+j] = A_local_block_size; localblsizB[i*sqrt_size+j] = B_local_block_size; } } MPI_Iscatterv(A_array, localblsizA, displsA, MPI_DOUBLE, A_local_block, A_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator, &requests[0]); MPI_Iscatterv(B_array, localblsizB, displsB, MPI_DOUBLE, B_local_block, B_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator, &requests[1]); //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // fix initial arrangements before the core algorithm starts - fora je, da se preden se prvic zacne computational part of algo, moras ze bloke zamenjat... /*if(coordinates[0] != 0){ MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE, (coordinates[1] + sqrt_size - coordinates[0]) % sqrt_size, 0, (coordinates[1] + coordinates[0]) % sqrt_size, 0, row_communicator, &status); } if(coordinates[1] != 0){ MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE, (coordinates[0] + sqrt_size - coordinates[1]) % sqrt_size, 0, (coordinates[0] + coordinates[1]) % sqrt_size, 0, column_communicator, &status); }*/ // cannon's algorithm int cannon_block_cycle; double compute_time = 0, mpi_time = 0, start; int C_index, A_row, A_column, B_column; MPI_Waitall(2, requests, statuses); send_blocks_time -= MPI_Wtime(); for(cannon_block_cycle = 0; cannon_block_cycle < sqrt_size; cannon_block_cycle++){ // compute partial result for this block cycle start = MPI_Wtime(); for(C_index = 0, A_row = 0; A_row < A_local_block_rows; A_row++){ for(B_column = 0; B_column < B_local_block_columns; B_column++, C_index++){ for(A_column = 0; A_column < A_local_block_columns; A_column++){ C_local_block[C_index] += A_local_block[A_row * A_local_block_columns + A_column] * B_local_block[A_column * B_local_block_columns + B_column]; } } } compute_time += MPI_Wtime() - start; start = MPI_Wtime(); // rotate blocks horizontally MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE, //to bi slo z MPI_alltoallv, in tisto variablo za replacing. ampak bi blo inefficient - glej komentarje! 
(coordinates[1] + sqrt_size - 1) % sqrt_size, 0, (coordinates[1] + 1) % sqrt_size, 0, row_communicator, &status); // rotate blocks vertically MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE, (coordinates[0] + sqrt_size - 1) % sqrt_size, 0, (coordinates[0] + 1) % sqrt_size, 0, column_communicator, &status); mpi_time += MPI_Wtime() - start; } // get C parts from other processes at rank 0 //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% gather_time = MPI_Wtime(); MPI_Gather(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE, C_array, A_local_block_rows * B_local_block_columns, MPI_DOUBLE, 0, cartesian_grid_communicator); //blocking, ker gres takoj nekaj delat s tem pol... right? gather_time -= MPI_Wtime(); //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // generating output at rank 0 if (rank == 0) { write_time = MPI_Wtime(); // convert the ID array into the actual C matrix int i, j, k, row, column; for (i = 0; i < sqrt_size; i++){ // block row index for (j = 0; j < sqrt_size; j++){ // block column index for (row = 0; row < A_local_block_rows; row++){ for (column = 0; column < B_local_block_columns; column++){ C[i * A_local_block_rows + row] [j * B_local_block_columns + column] = C_array[((i * sqrt_size + j) * A_local_block_rows * B_local_block_columns) + (row * B_local_block_columns) + column]; } } } } write_time -= MPI_Wtime(); printf("(%d,%d)x(%d,%d)=(%d,%d)\n", A_rows, A_columns, B_rows, B_columns, A_rows, B_columns); printf("Computation time: %lf\n", compute_time); printf("MPI time: %lf\n", mpi_time); printf("Read time: %lf\n", -read_time); printf("Send dims time: %lf\n", -send_dim_time); printf("Send blocks time: %lf\n", -send_blocks_time); printf("Gather time: %lf\n", -gather_time); printf("Addit. time: %lf\n", -dod_cajt); printf("Write time: %lf\n", -write_time); if (argc == 4){ // present results on the screen printf("\nA( %d x %d ):\n", A_rows, A_columns); for(row = 0; row < A_rows; row++) { for(column = 0; column < A_columns; column++) printf ("%7.3f ", A[row][column]); printf ("\n"); } printf("\nB( %d x %d ):\n", B_rows, B_columns); for(row = 0; row < B_rows; row++){ for(column = 0; column < B_columns; column++) printf("%7.3f ", B[row][column]); printf("\n"); } printf("\nC( %d x %d ) = AxB:\n", A_rows, B_columns); for(row = 0; row < A_rows; row++){ for(column = 0; column < B_columns; column++) printf("%7.3f ",C[row][column]); printf("\n"); } printf("\nPerforming serial consistency check. Be patient...\n"); fflush(stdout); int pass = 1; double temp; for(i=0; i<A_rows; i++){ for(j=0; j<B_columns; j++){ temp = 0; for(k=0; k<B_rows; k++){ temp += A[i][k] * B[k][j]; } //printf("%7.3f ", temp); printf("%7.3f ", temp-C[i][j]); if(temp != C[i][j]){ pass = 0; } } printf("\n"); } if (pass) printf("Consistency check: PASS\n"); else printf("Consistency check: FAIL\n"); } } // free all memory if(rank == 0){ int i; for(i = 0; i < A_rows; i++){ free(A[i]); } for(i = 0; i < B_rows; i++){ free(B[i]); } for(i = 0; i < A_rows; i++){ free(C[i]); } free(A); free(B); free(C); free(A_array); free(B_array); free(C_array); } free(A_local_block); free(B_local_block); free(C_local_block); // finalize MPI MPI_Finalize(); }
void declareBindings (void) { /* === Point-to-point === */ void* buf; int count; MPI_Datatype datatype; int dest; int tag; MPI_Comm comm; MPI_Send (buf, count, datatype, dest, tag, comm); // L12 int source; MPI_Status status; MPI_Recv (buf, count, datatype, source, tag, comm, &status); // L15 MPI_Get_count (&status, datatype, &count); MPI_Bsend (buf, count, datatype, dest, tag, comm); MPI_Ssend (buf, count, datatype, dest, tag, comm); MPI_Rsend (buf, count, datatype, dest, tag, comm); void* buffer; int size; MPI_Buffer_attach (buffer, size); // L22 MPI_Buffer_detach (buffer, &size); MPI_Request request; MPI_Isend (buf, count, datatype, dest, tag, comm, &request); // L25 MPI_Ibsend (buf, count, datatype, dest, tag, comm, &request); MPI_Issend (buf, count, datatype, dest, tag, comm, &request); MPI_Irsend (buf, count, datatype, dest, tag, comm, &request); MPI_Irecv (buf, count, datatype, source, tag, comm, &request); MPI_Wait (&request, &status); int flag; MPI_Test (&request, &flag, &status); // L32 MPI_Request_free (&request); MPI_Request* array_of_requests; int index; MPI_Waitany (count, array_of_requests, &index, &status); // L36 MPI_Testany (count, array_of_requests, &index, &flag, &status); MPI_Status* array_of_statuses; MPI_Waitall (count, array_of_requests, array_of_statuses); // L39 MPI_Testall (count, array_of_requests, &flag, array_of_statuses); int incount; int outcount; int* array_of_indices; MPI_Waitsome (incount, array_of_requests, &outcount, array_of_indices, array_of_statuses); // L44--45 MPI_Testsome (incount, array_of_requests, &outcount, array_of_indices, array_of_statuses); // L46--47 MPI_Iprobe (source, tag, comm, &flag, &status); // L48 MPI_Probe (source, tag, comm, &status); MPI_Cancel (&request); MPI_Test_cancelled (&status, &flag); MPI_Send_init (buf, count, datatype, dest, tag, comm, &request); MPI_Bsend_init (buf, count, datatype, dest, tag, comm, &request); MPI_Ssend_init (buf, count, datatype, dest, tag, comm, &request); MPI_Rsend_init (buf, count, datatype, dest, tag, comm, &request); MPI_Recv_init (buf, count, datatype, source, tag, comm, &request); MPI_Start (&request); MPI_Startall (count, array_of_requests); void* sendbuf; int sendcount; MPI_Datatype sendtype; int sendtag; void* recvbuf; int recvcount; MPI_Datatype recvtype; MPI_Datatype recvtag; MPI_Sendrecv (sendbuf, sendcount, sendtype, dest, sendtag, recvbuf, recvcount, recvtype, source, recvtag, comm, &status); // L67--69 MPI_Sendrecv_replace (buf, count, datatype, dest, sendtag, source, recvtag, comm, &status); // L70--71 MPI_Datatype oldtype; MPI_Datatype newtype; MPI_Type_contiguous (count, oldtype, &newtype); // L74 int blocklength; { int stride; MPI_Type_vector (count, blocklength, stride, oldtype, &newtype); // L78 } { MPI_Aint stride; MPI_Type_hvector (count, blocklength, stride, oldtype, &newtype); // L82 } int* array_of_blocklengths; { int* array_of_displacements; MPI_Type_indexed (count, array_of_blocklengths, array_of_displacements, oldtype, &newtype); // L87--88 } { MPI_Aint* array_of_displacements; MPI_Type_hindexed (count, array_of_blocklengths, array_of_displacements, oldtype, &newtype); // L92--93 MPI_Datatype* array_of_types; MPI_Type_struct (count, array_of_blocklengths, array_of_displacements, array_of_types, &newtype); // L95--96 } void* location; MPI_Aint address; MPI_Address (location, &address); // L100 MPI_Aint extent; MPI_Type_extent (datatype, &extent); // L102 MPI_Type_size (datatype, &size); MPI_Aint displacement; MPI_Type_lb (datatype, &displacement); // L105 MPI_Type_ub 
(datatype, &displacement); MPI_Type_commit (&datatype); MPI_Type_free (&datatype); MPI_Get_elements (&status, datatype, &count); void* inbuf; void* outbuf; int outsize; int position; MPI_Pack (inbuf, incount, datatype, outbuf, outsize, &position, comm); // L114 int insize; MPI_Unpack (inbuf, insize, &position, outbuf, outcount, datatype, comm); // L116--117 MPI_Pack_size (incount, datatype, comm, &size); /* === Collectives === */ MPI_Barrier (comm); // L121 int root; MPI_Bcast (buffer, count, datatype, root, comm); // L123 MPI_Gather (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm); // L124--125 int* recvcounts; int* displs; MPI_Gatherv (sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, root, comm); // L128--130 MPI_Scatter (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm); // L131--132 int* sendcounts; MPI_Scatterv (sendbuf, sendcounts, displs, sendtype, recvbuf, recvcount, recvtype, root, comm); // L134--135 MPI_Allgather (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm); // L136--137 MPI_Allgatherv (sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm); // L138--140 MPI_Alltoall (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm); // L141--142 int* sdispls; int* rdispls; MPI_Alltoallv (sendbuf, sendcounts, sdispls, sendtype, recvbuf, recvcounts, rdispls, recvtype, comm); // L145--147 MPI_Op op; MPI_Reduce (sendbuf, recvbuf, count, datatype, op, root, comm); // L149 #if 0 MPI_User_function function; int commute; MPI_Op_create (function, commute, &op); // L153 #endif MPI_Op_free (&op); // L155 MPI_Allreduce (sendbuf, recvbuf, count, datatype, op, comm); MPI_Reduce_scatter (sendbuf, recvbuf, recvcounts, datatype, op, comm); MPI_Scan (sendbuf, recvbuf, count, datatype, op, comm); /* === Groups, contexts, and communicators === */ MPI_Group group; MPI_Group_size (group, &size); // L162 int rank; MPI_Group_rank (group, &rank); // L164 MPI_Group group1; int n; int* ranks1; MPI_Group group2; int* ranks2; MPI_Group_translate_ranks (group1, n, ranks1, group2, ranks2); // L170 int result; MPI_Group_compare (group1, group2, &result); // L172 MPI_Group newgroup; MPI_Group_union (group1, group2, &newgroup); // L174 MPI_Group_intersection (group1, group2, &newgroup); MPI_Group_difference (group1, group2, &newgroup); int* ranks; MPI_Group_incl (group, n, ranks, &newgroup); // L178 MPI_Group_excl (group, n, ranks, &newgroup); extern int ranges[][3]; MPI_Group_range_incl (group, n, ranges, &newgroup); // L181 MPI_Group_range_excl (group, n, ranges, &newgroup); MPI_Group_free (&group); MPI_Comm_size (comm, &size); MPI_Comm_rank (comm, &rank); MPI_Comm comm1; MPI_Comm comm2; MPI_Comm_compare (comm1, comm2, &result); MPI_Comm newcomm; MPI_Comm_dup (comm, &newcomm); MPI_Comm_create (comm, group, &newcomm); int color; int key; MPI_Comm_split (comm, color, key, &newcomm); // L194 MPI_Comm_free (&comm); MPI_Comm_test_inter (comm, &flag); MPI_Comm_remote_size (comm, &size); MPI_Comm_remote_group (comm, &group); MPI_Comm local_comm; int local_leader; MPI_Comm peer_comm; int remote_leader; MPI_Comm newintercomm; MPI_Intercomm_create (local_comm, local_leader, peer_comm, remote_leader, tag, &newintercomm); // L204--205 MPI_Comm intercomm; MPI_Comm newintracomm; int high; MPI_Intercomm_merge (intercomm, high, &newintracomm); // L209 int keyval; #if 0 MPI_Copy_function copy_fn; MPI_Delete_function delete_fn; void* extra_state; MPI_Keyval_create (copy_fn, delete_fn, &keyval, extra_state); // L215 
#endif MPI_Keyval_free (&keyval); // L217 void* attribute_val; MPI_Attr_put (comm, keyval, attribute_val); // L219 MPI_Attr_get (comm, keyval, attribute_val, &flag); MPI_Attr_delete (comm, keyval); /* === Environmental inquiry === */ char* name; int resultlen; MPI_Get_processor_name (name, &resultlen); // L226 MPI_Errhandler errhandler; #if 0 MPI_Handler_function function; MPI_Errhandler_create (function, &errhandler); // L230 #endif MPI_Errhandler_set (comm, errhandler); // L232 MPI_Errhandler_get (comm, &errhandler); MPI_Errhandler_free (&errhandler); int errorcode; char* string; MPI_Error_string (errorcode, string, &resultlen); // L237 int errorclass; MPI_Error_class (errorcode, &errorclass); // L239 MPI_Wtime (); MPI_Wtick (); int argc; char** argv; MPI_Init (&argc, &argv); // L244 MPI_Finalize (); MPI_Initialized (&flag); MPI_Abort (comm, errorcode); }
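/*
 * Illustrative addition (not part of the original binding list): a minimal, self-contained
 * sketch of MPI_Sendrecv_replace on a ring, mainly to make the argument order visible --
 * destination and send tag come before source and receive tag. Works for any number of
 * processes; the payload is overwritten in place by the message received from the
 * previous rank.
 */
#include <mpi.h>
#include <stdio.h>

int main (int argc, char** argv)
{
  int rank, size, value;
  MPI_Status status;

  MPI_Init (&argc, &argv);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);
  MPI_Comm_size (MPI_COMM_WORLD, &size);

  value = rank;                              /* payload, replaced by the received value */
  int dest   = (rank + 1) % size;            /* next rank on the ring */
  int source = (rank + size - 1) % size;     /* previous rank on the ring */

  MPI_Sendrecv_replace (&value, 1, MPI_INT, dest, 0, source, 0,
                        MPI_COMM_WORLD, &status);

  printf ("rank %d now holds %d\n", rank, value);
  MPI_Finalize ();
  return 0;
}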
void exchange_v(float ** vy, float ** bufferlef_to_rig, float ** bufferrig_to_lef, float ** buffertop_to_bot, float ** bufferbot_to_top, MPI_Request * req_send, MPI_Request * req_rec){ extern int NX, NY, POS[3], NPROCX, NPROCY, BOUNDARY, FDORDER; extern int INDEX[5]; extern const int TAG1,TAG2,TAG5,TAG6; MPI_Status status; int i, j, fdo, fdo3, n, l; fdo = FDORDER/2 + 1; fdo3 = 2*fdo; /* top - bottom */ if (POS[2]!=0) /* no boundary exchange at top of global grid */ for (i=1;i<=NX;i++){ n = 1; /* storage of top of local volume into buffer */ for (l=1;l<=fdo-1;l++) { buffertop_to_bot[i][n++] = vy[l][i]; } } if (POS[2]!=NPROCY-1) /* no boundary exchange at bottom of global grid */ for (i=1;i<=NX;i++){ /* storage of bottom of local volume into buffer */ n = 1; /*for (l=1;l<=fdo;l++) { bufferbot_to_top[i][n++] = vy[NY-l+1][i]; }*/ } /* send and reveive values for points at inner boundaries */ /* MPI_Bsend(&buffertop_to_bot[1][1],NX*fdo3,MPI_FLOAT,INDEX[3],TAG5,MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); MPI_Recv(&buffertop_to_bot[1][1],NX*fdo3,MPI_FLOAT,INDEX[4],TAG5,MPI_COMM_WORLD,&status); MPI_Bsend(&bufferbot_to_top[1][1],NX*fdo3,MPI_FLOAT,INDEX[4],TAG6,MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); MPI_Recv(&bufferbot_to_top[1][1],NX*fdo3,MPI_FLOAT,INDEX[3],TAG6,MPI_COMM_WORLD,&status); */ /* send and reveive values at edges of the local grid */ /*for (i=2;i<=3;i++){ MPI_Start(&req_send[i]); MPI_Wait(&req_send[i],&status); MPI_Start(&req_rec[i]); MPI_Wait(&req_rec[i],&status); }*/ /* alternative communication */ /* still blocking communication */ MPI_Sendrecv_replace(&buffertop_to_bot[1][1],NX*fdo3,MPI_FLOAT,INDEX[3],TAG5,INDEX[4],TAG5,MPI_COMM_WORLD,&status); /*MPI_Sendrecv_replace(&bufferbot_to_top[1][1],NX*fdo3,MPI_FLOAT,INDEX[4],TAG6,INDEX[3],TAG6,MPI_COMM_WORLD,&status);*/ if (POS[2]!=NPROCY-1) /* no boundary exchange at bottom of global grid */ for (i=1;i<=NX;i++){ n = 1; for (l=1;l<=fdo-1;l++) { vy[NY+l][i] = buffertop_to_bot[i][n++]; } /*for (l=1;l<=fdo;l++) { vx[NY+l][i] = buffertop_to_bot[i][n++]; }*/ } if (POS[2]!=0) /* no boundary exchange at top of global grid */ for (i=1;i<=NX;i++){ n = 1; /*for (l=1;l<=fdo;l++) { vy[1-l][i] = bufferbot_to_top[i][n++]; }*/ /*for (l=1;l<=fdo-1;l++) { vx[1-l][i] = bufferbot_to_top[i][n++]; }*/ } /* left - right */ /* exchange if periodic boundary condition is applied */ if ((BOUNDARY) || (POS[1]!=0)) for (j=1;j<=NY;j++){ /* storage of left edge of local volume into buffer */ n = 1; for (l=1;l<fdo;l++) { bufferlef_to_rig[j][n++] = vy[j][l]; } } /* no exchange if periodic boundary condition is applied */ if ((BOUNDARY) || (POS[1]!=NPROCX-1)) /* no boundary exchange at right edge of global grid */ for (j=1;j<=NY;j++){ /* storage of right edge of local volume into buffer */ n = 1; /*for (l=1;l<fdo-1;l++) { bufferrig_to_lef[j][n++] = vy[j][NX-l+1]; }*/ } /* send and reveive values for points at inner boundaries */ /* MPI_Bsend(&bufferlef_to_rig[1][1],(NY)*fdo3,MPI_FLOAT,INDEX[1],TAG1,MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); MPI_Recv(&bufferlef_to_rig[1][1],(NY)*fdo3,MPI_FLOAT,INDEX[2],TAG1,MPI_COMM_WORLD,&status); MPI_Bsend(&bufferrig_to_lef[1][1],(NY)*fdo3,MPI_FLOAT,INDEX[2],TAG2,MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); MPI_Recv(&bufferrig_to_lef[1][1],(NY)*fdo3,MPI_FLOAT,INDEX[1],TAG2,MPI_COMM_WORLD,&status); */ /* send and reveive values at edges of the local grid */ /*for (i=0;i<=1;i++){ MPI_Start(&req_send[i]); MPI_Wait(&req_send[i],&status); MPI_Start(&req_rec[i]); MPI_Wait(&req_rec[i],&status); }*/ /* alternative communication */ /* 
still blocking communication */ MPI_Sendrecv_replace(&bufferlef_to_rig[1][1],NY*fdo3,MPI_FLOAT,INDEX[1],TAG1,INDEX[2],TAG1,MPI_COMM_WORLD,&status); /*MPI_Sendrecv_replace(&bufferrig_to_lef[1][1],NY*fdo3,MPI_FLOAT,INDEX[2],TAG2,INDEX[1],TAG2,MPI_COMM_WORLD,&status);*/ /* exchange also if periodic boundary condition is applied */ if ((BOUNDARY) || (POS[1]!=NPROCX-1)) /* no boundary exchange at right edge of global grid */ for (j=1;j<=NY;j++){ n = 1; for (l=1;l<fdo;l++) { vy[j][NX+l] = bufferlef_to_rig[j][n++]; } } /* exchange also if periodic boundary condition is applied */ if ((BOUNDARY) || (POS[1]!=0)) /* no boundary exchange at left edge of global grid */ for (j=1;j<=NY;j++){ n = 1; /*for (l=1;l<fdo-1;l++) { vy[j][1-l] = bufferrig_to_lef[j][n++]; }*/ } }
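/*
 * Hedged aside (a simplified stand-in, not the solver's actual routine): the rank guards
 * above ("no boundary exchange at top/bottom of the global grid") can also be expressed
 * with MPI_PROC_NULL -- a send to or receive from MPI_PROC_NULL completes immediately and
 * leaves the buffer untouched, so the edge cases need no separate if().
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv){
	int rank, size;
	float halo[4] = {0.0f, 0.0f, 0.0f, 0.0f};   /* stand-in for a boundary strip */
	MPI_Status status;

	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	/* 1-D decomposition: rank-1 lies "above", rank+1 lies "below";
	   at the global edges the neighbour is MPI_PROC_NULL */
	int up   = (rank == 0)        ? MPI_PROC_NULL : rank - 1;
	int down = (rank == size - 1) ? MPI_PROC_NULL : rank + 1;

	halo[0] = (float)rank;
	MPI_Sendrecv_replace(halo, 4, MPI_FLOAT, up, 5, down, 5,
	                     MPI_COMM_WORLD, &status);

	printf("rank %d: halo[0] = %g\n", rank, halo[0]);
	MPI_Finalize();
	return 0;
}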
/* Performs Gaussian elimination on the given matrix of doubles. The * parameter numRows gives the number of rows in the matrix, and * numCols the number of columns. Upon return, the matrix will be in * reduced row-echelon form. */ int gausselim(double* matrix, int numRows, int numCols, int debug) { int top = 0; // the current top row of the matrix int col = 0; // column index of the current pivot int pivotRow = 0; // row index of current pivot double pivot = 0.0; // the value of the current pivot int j = 0; // loop variable over columns of matrix double tmp = 0.0; // temporary double variable MPI_Status status; // status object needed for receives int rank; // rank of this process int nprocs; // number of processes double* toprow = (double*)malloc(numCols * sizeof(double)); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); for (top=col=0; top<numRows && col< numCols; top++, col++) { /* At this point we know that the submatrix consisting of the * first top rows of A is in reduced row-echelon form. We will now * consider the submatrix B consisting of the remaining rows. We * know, additionally, that the first col columns of B are * all zero. */ if (debug && rank == 0) { printf("Top: %d\n", top); } /* Step 1: Locate the leftmost column of B that does not consist * of all zeros, if one exists. The top nonzero entry of this * column is the pivot. */ for (; col < numCols; col++) { if (matrix[col] != 0.0 && rank >= top) { MPI_Allreduce(&rank, &pivotRow, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); } else { MPI_Allreduce(&nprocs, &pivotRow, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); } if (pivotRow < nprocs){ break; } } if (col >= numCols) { break; } if (debug) { if (rank == 0) { printf("Step 1 result: col=%d, pivotRow=%d\n\n", col, pivotRow); } } /* At this point we are guaranteed that pivot = A[pivotRow,col] is * nonzero. We also know that all the columns of B to the left of * col consist entirely of zeros. */ /* Step 2: Interchange the top row with the pivot row, if * necessary, so that the entry at the top of the column found in * Step 1 is nonzero. */ if (pivotRow != top) { if (rank == top) { MPI_Sendrecv_replace(matrix, numCols, MPI_DOUBLE, pivotRow, 0, pivotRow, 0, MPI_COMM_WORLD, &status); } else if (rank == pivotRow) { MPI_Sendrecv_replace(matrix, numCols, MPI_DOUBLE, top, 0, top, 0, MPI_COMM_WORLD, &status); } } if (rank == top) { pivot = matrix[col]; } if (debug) { printMatrix("Step 2 result: \n", matrix, numRows, numCols); } /* At this point we are guaranteed that A[top,col] = pivot is * nonzero. Also, we know that (i>=top and j<col) implies * A[i,j] = 0. */ /* Step 3: Divide the top row by pivot in order to introduce a * leading 1. */ if (rank == top) { for (j = col; j < numCols; j++) { matrix[j] /= pivot; toprow[j] = matrix[j]; } } if (debug) { printMatrix("Step 3 result:\n", matrix, numRows, numCols); } /* At this point we are guaranteed that A[top,col] is 1.0, * assuming that floating point arithmetic guarantees that a/a * equals 1.0 for any nonzero double a. */ MPI_Bcast(toprow, numCols, MPI_DOUBLE, top, MPI_COMM_WORLD); /* Step 4: Add suitable multiples of the top row to rows below so * that all entries below the leading 1 become zero. */ if (rank != top) { tmp = matrix[col]; for (j = col; j < numCols; j++) { matrix[j] -= toprow[j]*tmp; } } if (debug) { printMatrix("Step 4 result: \n", matrix, numRows, numCols); } } free(toprow); return 0; }
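/*
 * Hedged aside (an alternative, not the routine's actual method): the "which rank owns the
 * pivot row?" question can also be answered with a single MPI_Allreduce using MPI_MAXLOC,
 * which picks the largest pivot candidate and carries the owning rank along with it
 * (useful for partial pivoting). Minimal self-contained sketch with a dummy candidate value.
 */
#include <mpi.h>
#include <math.h>
#include <stdio.h>

int main(int argc, char** argv) {
    int rank;
    struct { double value; int rank; } local, global;   /* layout of MPI_DOUBLE_INT */

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* each rank proposes |candidate| from its own row; here a dummy value */
    local.value = fabs((double)((rank * 7) % 5));
    local.rank  = rank;

    /* MPI_MAXLOC returns the maximum value and the lowest rank attaining it,
       so every process agrees on the pivot owner in one collective call */
    MPI_Allreduce(&local, &global, 1, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD);

    if (rank == 0)
        printf("pivot candidate %g owned by rank %d\n", global.value, global.rank);

    MPI_Finalize();
    return 0;
}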
static void test_pair (void) { int prev, next, count, tag, index, i, outcount, indices[2]; int rank, size, flag, ierr, reqcount; double send_buf[TEST_SIZE], recv_buf[TEST_SIZE]; double buffered_send_buf[TEST_SIZE * 2 + MPI_BSEND_OVERHEAD]; /* factor of two is based on guessing - only dynamic allocation would be safe */ void *buffer; MPI_Status statuses[2]; MPI_Status status; MPI_Request requests[2]; MPI_Comm dupcom, intercom; #ifdef V_T struct _VT_FuncFrameHandle { char *name; int func; int frame; }; typedef struct _VT_FuncFrameHandle VT_FuncFrameHandle_t; VT_FuncFrameHandle_t normal_sends, buffered_sends, buffered_persistent_sends, ready_sends, sync_sends, nblock_sends, nblock_rsends, nblock_ssends, pers_sends, pers_rsends, pers_ssends, sendrecv, sendrecv_repl, intercomm; int classid; VT_classdef( "Application:test_pair", &classid ); #define VT_REGION_DEF( _name, _nameframe, _class ) \ (_nameframe).name=_name; \ VT_funcdef( (_nameframe).name, _class, &((_nameframe).func) ); #define VT_BEGIN_REGION( _nameframe ) \ LOCDEF(); \ VT_begin( (_nameframe).func ) #define VT_END_REGION( _nameframe ) \ LOCDEF(); VT_end( (_nameframe).func ) #else #define VT_REGION_DEF( _name, _nameframe, _class ) #define VT_BEGIN_REGION( _nameframe ) #define VT_END_REGION( _nameframe ) #endif ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank); ierr = MPI_Comm_size(MPI_COMM_WORLD, &size); if ( size < 2 ) { if ( rank == 0 ) { printf("Program needs to be run on at least 2 processes.\n"); } ierr = MPI_Abort( MPI_COMM_WORLD, 66 ); } ierr = MPI_Comm_dup(MPI_COMM_WORLD, &dupcom); if ( rank >= 2 ) { /* printf( "%d Calling finalize.\n", rank ); */ ierr = MPI_Finalize( ); exit(0); } next = rank + 1; if (next >= 2) next = 0; prev = rank - 1; if (prev < 0) prev = 1; VT_REGION_DEF( "Normal_Sends", normal_sends, classid ); VT_REGION_DEF( "Buffered_Sends", buffered_sends, classid ); VT_REGION_DEF( "Buffered_Persistent_Sends", buffered_persistent_sends, classid ); VT_REGION_DEF( "Ready_Sends", ready_sends, classid ); VT_REGION_DEF( "Sync_Sends", sync_sends, classid ); VT_REGION_DEF( "nblock_Sends", nblock_sends, classid ); VT_REGION_DEF( "nblock_RSends", nblock_rsends, classid ); VT_REGION_DEF( "nblock_SSends", nblock_ssends, classid ); VT_REGION_DEF( "Pers_Sends", pers_sends, classid ); VT_REGION_DEF( "Pers_RSends", pers_rsends, classid ); VT_REGION_DEF( "Pers_SSends", pers_ssends, classid ); VT_REGION_DEF( "SendRecv", sendrecv, classid ); VT_REGION_DEF( "SendRevc_Repl", sendrecv_repl, classid ); VT_REGION_DEF( "InterComm", intercomm, classid ); /* * Normal sends */ VT_BEGIN_REGION( normal_sends ); if (rank == 0) printf ("Send\n"); tag = 0x100; count = TEST_SIZE / 5; clear_test_data(recv_buf,TEST_SIZE); if (rank == 0) { init_test_data(send_buf,TEST_SIZE,0); LOCDEF(); MPI_Send(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD); MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); msg_check(recv_buf, prev, tag, count, &status, TEST_SIZE, "send and recv"); } else { LOCDEF(); MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,"send and recv"); init_test_data(recv_buf,TEST_SIZE,1); MPI_Send(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD); } VT_END_REGION( normal_sends ); /* * Buffered sends */ VT_BEGIN_REGION( buffered_sends ); if (rank == 0) printf ("Buffered Send\n"); tag = 138; count = TEST_SIZE / 5; clear_test_data(recv_buf,TEST_SIZE); if (rank == 0) { 
init_test_data(send_buf,TEST_SIZE,0); LOCDEF(); MPI_Buffer_attach(buffered_send_buf, sizeof(buffered_send_buf)); MPI_Bsend(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD); MPI_Buffer_detach(&buffer, &size); if(buffer != buffered_send_buf || size != sizeof(buffered_send_buf)) { printf ("[%d] Unexpected buffer returned by MPI_Buffer_detach(): %p/%d != %p/%d\n", rank, buffer, size, buffered_send_buf, (int)sizeof(buffered_send_buf)); MPI_Abort(MPI_COMM_WORLD, 201); } MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); msg_check(recv_buf, prev, tag, count, &status, TEST_SIZE, "send and recv"); } else { LOCDEF(); MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,"send and recv"); init_test_data(recv_buf,TEST_SIZE,1); MPI_Send(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD); } VT_END_REGION( buffered_sends ); /* * Buffered sends */ VT_BEGIN_REGION( buffered_persistent_sends ); if (rank == 0) printf ("Buffered Persistent Send\n"); tag = 238; count = TEST_SIZE / 5; clear_test_data(recv_buf,TEST_SIZE); if (rank == 0) { init_test_data(send_buf,TEST_SIZE,0); LOCDEF(); MPI_Buffer_attach(buffered_send_buf, sizeof(buffered_send_buf)); MPI_Bsend_init(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD, requests); MPI_Start(requests); MPI_Wait(requests, statuses); MPI_Request_free(requests); MPI_Buffer_detach(&buffer, &size); if(buffer != buffered_send_buf || size != sizeof(buffered_send_buf)) { printf ("[%d] Unexpected buffer returned by MPI_Buffer_detach(): %p/%d != %p/%d\n", rank, buffer, size, buffered_send_buf, (int)sizeof(buffered_send_buf)); MPI_Abort(MPI_COMM_WORLD, 201); } MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); msg_check(recv_buf, prev, tag, count, &status, TEST_SIZE, "send and recv"); } else { LOCDEF(); MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE,"send and recv"); init_test_data(recv_buf,TEST_SIZE,1); MPI_Send(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD); } VT_END_REGION( buffered_persistent_sends ); /* * Ready sends. Note that we must insure that the receive is posted * before the rsend; this requires using Irecv. 
*/ VT_BEGIN_REGION( ready_sends ); if (rank == 0) printf ("Rsend\n"); tag = 1456; count = TEST_SIZE / 3; clear_test_data(recv_buf,TEST_SIZE); if (rank == 0) { init_test_data(send_buf,TEST_SIZE,0); MPI_Recv(MPI_BOTTOM, 0, MPI_INT, next, tag, MPI_COMM_WORLD, &status); MPI_Rsend(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD); MPI_Probe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &status); if (status.MPI_SOURCE != prev) printf ("Incorrect src, expected %d, got %d\n",prev, status.MPI_SOURCE); if (status.MPI_TAG != tag) printf ("Incorrect tag, expected %d, got %d\n",tag, status.MPI_TAG); MPI_Get_count(&status, MPI_DOUBLE, &i); if (i != count) printf ("Incorrect count, expected %d, got %d\n",count,i); MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "rsend and recv"); } else { MPI_Irecv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, requests); MPI_Send( MPI_BOTTOM, 0, MPI_INT, next, tag, MPI_COMM_WORLD); MPI_Wait(requests, &status); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "rsend and recv"); init_test_data(recv_buf,TEST_SIZE,1); MPI_Send(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD); } VT_END_REGION( ready_sends ); /* * Synchronous sends */ VT_BEGIN_REGION( sync_sends ); if (rank == 0) printf ("Ssend\n"); tag = 1789; count = TEST_SIZE / 3; clear_test_data(recv_buf,TEST_SIZE); if (rank == 0) { init_test_data(send_buf,TEST_SIZE,0); MPI_Iprobe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &flag, &status); if (flag) printf ("Iprobe succeeded! source %d, tag %d\n",status.MPI_SOURCE, status.MPI_TAG); MPI_Ssend(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD); while (!flag) MPI_Iprobe(MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &flag, &status); if (status.MPI_SOURCE != prev) printf ("Incorrect src, expected %d, got %d\n",prev, status.MPI_SOURCE); if (status.MPI_TAG != tag) printf ("Incorrect tag, expected %d, got %d\n",tag, status.MPI_TAG); MPI_Get_count(&status, MPI_DOUBLE, &i); if (i != count) printf ("Incorrect count, expected %d, got %d\n",count,i); MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "ssend and recv"); } else { MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "ssend and recv"); init_test_data(recv_buf,TEST_SIZE,1); MPI_Ssend(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD); } VT_END_REGION( sync_sends ); /* * Nonblocking normal sends */ VT_BEGIN_REGION( nblock_sends ); if (rank == 0) printf ("Isend\n"); tag = 2123; count = TEST_SIZE / 5; clear_test_data(recv_buf,TEST_SIZE); if (rank == 0) { MPI_Irecv(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, requests); init_test_data(send_buf,TEST_SIZE,0); MPI_Isend(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD, (requests+1)); MPI_Waitall(2, requests, statuses); rq_check( requests, 2, "isend and irecv" ); msg_check(recv_buf,prev,tag,count,statuses, TEST_SIZE,"isend and irecv"); } else { MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); msg_check(recv_buf,prev,tag,count,&status, TEST_SIZE,"isend and irecv"); init_test_data(recv_buf,TEST_SIZE,1); MPI_Isend(recv_buf, count, MPI_DOUBLE, next, tag,MPI_COMM_WORLD, (requests)); MPI_Wait((requests), &status); rq_check(requests, 1, "isend (and 
recv)"); } VT_END_REGION( nblock_sends ); /* * Nonblocking ready sends */ VT_BEGIN_REGION( nblock_rsends ); if (rank == 0) printf ("Irsend\n"); tag = 2456; count = TEST_SIZE / 3; clear_test_data(recv_buf,TEST_SIZE); if (rank == 0) { MPI_Irecv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, requests); init_test_data(send_buf,TEST_SIZE,0); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, next, 0, MPI_BOTTOM, 0, MPI_INT, next, 0, dupcom, &status); MPI_Irsend(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD, (requests+1)); reqcount = 0; while (reqcount != 2) { MPI_Waitany( 2, requests, &index, statuses); if( index == 0 ) { memcpy( &status, statuses, sizeof(status) ); } reqcount++; } rq_check( requests, 1, "irsend and irecv"); msg_check(recv_buf,prev,tag,count,&status, TEST_SIZE,"irsend and irecv"); } else { MPI_Irecv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, requests); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, next, 0, MPI_BOTTOM, 0, MPI_INT, next, 0, dupcom, &status); flag = 0; while (!flag) MPI_Test(requests, &flag, &status); rq_check( requests, 1, "irsend and irecv (test)"); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "irsend and irecv"); init_test_data(recv_buf,TEST_SIZE,1); MPI_Irsend(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD, requests); MPI_Waitall(1, requests, statuses); rq_check( requests, 1, "irsend and irecv"); } VT_END_REGION( nblock_rsends ); /* * Nonblocking synchronous sends */ VT_BEGIN_REGION( nblock_ssends ); if (rank == 0) printf ("Issend\n"); tag = 2789; count = TEST_SIZE / 3; clear_test_data(recv_buf,TEST_SIZE); if (rank == 0) { MPI_Irecv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, requests ); init_test_data(send_buf,TEST_SIZE,0); MPI_Issend(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD, (requests+1)); flag = 0; while (!flag) MPI_Testall(2, requests, &flag, statuses); rq_check( requests, 2, "issend and irecv (testall)"); msg_check( recv_buf, prev, tag, count, statuses, TEST_SIZE, "issend and recv"); } else { MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "issend and recv"); init_test_data(recv_buf,TEST_SIZE,1); MPI_Issend(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD,requests); flag = 0; while (!flag) MPI_Testany(1, requests, &index, &flag, statuses); rq_check( requests, 1, "issend and recv (testany)"); } VT_END_REGION( nblock_ssends ); /* * Persistent normal sends */ VT_BEGIN_REGION( pers_sends ); if (rank == 0) printf ("Send_init\n"); tag = 3123; count = TEST_SIZE / 5; clear_test_data(recv_buf,TEST_SIZE); MPI_Send_init(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD, requests); MPI_Recv_init(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, (requests+1)); if (rank == 0) { init_test_data(send_buf,TEST_SIZE,0); MPI_Startall(2, requests); MPI_Waitall(2, requests, statuses); msg_check( recv_buf, prev, tag, count, (statuses+1), TEST_SIZE, "persistent send/recv"); } else { MPI_Start((requests+1)); MPI_Wait((requests+1), &status); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "persistent send/recv"); init_test_data(send_buf,TEST_SIZE,1); MPI_Start(requests); MPI_Wait(requests, &status); } MPI_Request_free(requests); MPI_Request_free((requests+1)); VT_END_REGION( pers_sends ); /* * Persistent ready sends */ VT_BEGIN_REGION( pers_rsends ); if (rank == 0) printf ("Rsend_init\n"); tag = 3456; 
count = TEST_SIZE / 3; clear_test_data(recv_buf,TEST_SIZE); MPI_Rsend_init(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD, requests); MPI_Recv_init(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, (requests+1)); if (rank == 0) { init_test_data(send_buf,TEST_SIZE,0); MPI_Barrier( MPI_COMM_WORLD ); MPI_Startall(2, requests); reqcount = 0; while (reqcount != 2) { MPI_Waitsome(2, requests, &outcount, indices, statuses); for (i=0; i<outcount; i++) { if (indices[i] == 1) { msg_check( recv_buf, prev, tag, count, (statuses+i), TEST_SIZE, "waitsome"); } reqcount++; } } } else { MPI_Start((requests+1)); MPI_Barrier( MPI_COMM_WORLD ); flag = 0; while (!flag) MPI_Test((requests+1), &flag, &status); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "test"); init_test_data(send_buf,TEST_SIZE,1); MPI_Start(requests); MPI_Wait(requests, &status); } MPI_Request_free(requests); MPI_Request_free((requests+1)); VT_END_REGION( pers_rsends ); /* * Persistent synchronous sends */ VT_BEGIN_REGION( pers_ssends ); if (rank == 0) printf ("Ssend_init\n"); tag = 3789; count = TEST_SIZE / 3; clear_test_data(recv_buf,TEST_SIZE); MPI_Ssend_init(send_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD, (requests+1)); MPI_Recv_init(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, requests); if (rank == 0) { init_test_data(send_buf,TEST_SIZE,0); MPI_Startall(2, requests); reqcount = 0; while (reqcount != 2) { MPI_Testsome(2, requests, &outcount, indices, statuses); for (i=0; i<outcount; i++) { if (indices[i] == 0) { msg_check( recv_buf, prev, tag, count, (statuses+i), TEST_SIZE, "testsome"); } reqcount++; } } } else { MPI_Start(requests); flag = 0; while (!flag) MPI_Testany(1, requests, &index, &flag, statuses); msg_check( recv_buf, prev, tag, count, statuses, TEST_SIZE, "testany" ); init_test_data(send_buf,TEST_SIZE,1); MPI_Start((requests+1)); MPI_Wait((requests+1), &status); } MPI_Request_free(requests); MPI_Request_free((requests+1)); VT_END_REGION( pers_ssends ); /* * Send/receive. */ VT_BEGIN_REGION( sendrecv ); if (rank == 0) printf ("Sendrecv\n"); tag = 4123; count = TEST_SIZE / 5; clear_test_data(recv_buf,TEST_SIZE); if (rank == 0) { init_test_data(send_buf,TEST_SIZE,0); MPI_Sendrecv(send_buf, count, MPI_DOUBLE, next, tag, recv_buf, count, MPI_DOUBLE, prev, tag, MPI_COMM_WORLD, &status ); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "sendrecv"); } else { MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "recv/send"); init_test_data(recv_buf,TEST_SIZE,1); MPI_Send(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD); } VT_END_REGION( sendrecv ); #ifdef V_T VT_flush(); #endif /* * Send/receive replace. 
*/ VT_BEGIN_REGION( sendrecv_repl ); if (rank == 0) printf ("Sendrecv_replace\n"); tag = 4456; count = TEST_SIZE / 3; if (rank == 0) { init_test_data(recv_buf, TEST_SIZE,0); for (i=count; i< TEST_SIZE; i++) recv_buf[i] = 0.0; MPI_Sendrecv_replace(recv_buf, count, MPI_DOUBLE, next, tag, prev, tag, MPI_COMM_WORLD, &status); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "sendrecvreplace"); } else { clear_test_data(recv_buf,TEST_SIZE); MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); msg_check( recv_buf, prev, tag, count, &status, TEST_SIZE, "recv/send for replace"); init_test_data(recv_buf,TEST_SIZE,1); MPI_Send(recv_buf, count, MPI_DOUBLE, next, tag, MPI_COMM_WORLD); } VT_END_REGION( sendrecv_repl ); /* * Send/Receive via inter-communicator */ VT_BEGIN_REGION( intercomm ); MPI_Intercomm_create(MPI_COMM_SELF, 0, MPI_COMM_WORLD, next, 1, &intercom); if (rank == 0) printf ("Send via inter-communicator\n"); tag = 4018; count = TEST_SIZE / 5; clear_test_data(recv_buf,TEST_SIZE); if (rank == 0) { init_test_data(send_buf,TEST_SIZE,0); LOCDEF(); MPI_Send(send_buf, count, MPI_DOUBLE, 0, tag, intercom); MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, intercom, &status); msg_check(recv_buf, 0, tag, count, &status, TEST_SIZE, "send and recv via inter-communicator"); } else if (rank == 1) { LOCDEF(); MPI_Recv(recv_buf, TEST_SIZE, MPI_DOUBLE,MPI_ANY_SOURCE, MPI_ANY_TAG, intercom, &status); msg_check( recv_buf, 0, tag, count, &status, TEST_SIZE,"send and recv via inter-communicator"); init_test_data(recv_buf,TEST_SIZE,0); MPI_Send(recv_buf, count, MPI_DOUBLE, 0, tag, intercom); } VT_END_REGION( intercomm ); MPI_Comm_free(&intercom); MPI_Comm_free(&dupcom); }
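/*
 * Hedged sketch (illustrative, with assumed group sizes): the inter-communicator exchange
 * above builds an intercomm from MPI_COMM_SELF; a more common pattern splits
 * MPI_COMM_WORLD into two groups and bridges them with MPI_Intercomm_create, as below.
 * Ranks in intercommunicator point-to-point calls always address the remote group.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size, color, value = -1;
    MPI_Comm local_comm, intercomm;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size < 2) MPI_Abort(MPI_COMM_WORLD, 1);

    /* two halves of MPI_COMM_WORLD form the local groups */
    color = (rank < size / 2) ? 0 : 1;
    MPI_Comm_split(MPI_COMM_WORLD, color, rank, &local_comm);

    /* remote leader: rank 0 of the other half, addressed in MPI_COMM_WORLD */
    int remote_leader = (color == 0) ? size / 2 : 0;
    MPI_Intercomm_create(local_comm, 0, MPI_COMM_WORLD, remote_leader, 99, &intercomm);

    /* the leader of each side swaps one integer with the other side's leader */
    int local_rank;
    MPI_Comm_rank(local_comm, &local_rank);
    if (local_rank == 0) {
        value = color;
        MPI_Sendrecv_replace(&value, 1, MPI_INT, 0, 7, 0, 7, intercomm, &status);
        printf("group %d leader received %d\n", color, value);
    }

    MPI_Comm_free(&intercomm);
    MPI_Comm_free(&local_comm);
    MPI_Finalize();
    return 0;
}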
int main (int argc, char **argv) { FILE *fp; double **A = NULL, **B = NULL, **C = NULL, *A_array = NULL, *B_array = NULL, *C_array = NULL; double *A_local_block = NULL, *B_local_block = NULL, *C_local_block = NULL; int A_rows, A_columns, A_local_block_rows, A_local_block_columns, A_local_block_size; int B_rows, B_columns, B_local_block_rows, B_local_block_columns, B_local_block_size; int rank, size, sqrt_size, matrices_a_b_dimensions[4]; MPI_Comm cartesian_grid_communicator, row_communicator, column_communicator; MPI_Status status; double reading_time, dimensions_time, scatter_time, gather_time, writing_time; // used to manage the cartesian grid int dimensions[2], periods[2], coordinates[2], remain_dims[2]; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* For square mesh */ sqrt_size = (int)sqrt((double) size); if(sqrt_size * sqrt_size != size){ if( rank == 0 ) perror("need to run mpiexec with a perfect square number of processes\n"); MPI_Abort(MPI_COMM_WORLD, -1); } // create a 2D cartesian grid dimensions[0] = dimensions[1] = sqrt_size; periods[0] = periods[1] = 1; MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &cartesian_grid_communicator); MPI_Cart_coords(cartesian_grid_communicator, rank, 2, coordinates); // create a row communicator remain_dims[0] = 0; remain_dims[1] = 1; MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &row_communicator); // create a column communicator remain_dims[0] = 1; remain_dims[1] = 0; MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &column_communicator); //Start I/O reading counter! reading_time = MPI_Wtime(); // getting matrices from files at rank 0 only // example: mpiexec -n 64 ./cannon matrix1 matrix2 [test] if (rank == 0){ int row, column; if ((fp = fopen (argv[1], "r")) != NULL){ fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[0], &matrices_a_b_dimensions[1]); A = (double **) malloc (matrices_a_b_dimensions[0] * sizeof(double *)); for (row = 0; row < matrices_a_b_dimensions[0]; row++){ A[row] = (double *) malloc(matrices_a_b_dimensions[1] * sizeof(double)); for (column = 0; column < matrices_a_b_dimensions[1]; column++) fscanf(fp, "%lf", &A[row][column]); } fclose(fp); } else { if(rank == 0) fprintf(stderr, "error opening file for matrix A (%s)\n", argv[1]); MPI_Abort(MPI_COMM_WORLD, -1); } if((fp = fopen (argv[2], "r")) != NULL){ fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[2], &matrices_a_b_dimensions[3]); B = (double **) malloc (matrices_a_b_dimensions[2] * sizeof(double *)); for(row = 0; row < matrices_a_b_dimensions[2]; row++){ B[row] = (double *) malloc(matrices_a_b_dimensions[3] * sizeof(double *)); for(column = 0; column < matrices_a_b_dimensions[3]; column++) fscanf(fp, "%lf", &B[row][column]); } fclose(fp); } else { if(rank == 0) fprintf(stderr, "error opening file for matrix B (%s)\n", argv[2]); MPI_Abort(MPI_COMM_WORLD, -1); } // need to check that the multiplication is possible given dimensions // matrices_a_b_dimensions[0] = row size of A // matrices_a_b_dimensions[1] = column size of A // matrices_a_b_dimensions[2] = row size of B // matrices_a_b_dimensions[3] = column size of B if(matrices_a_b_dimensions[1] != matrices_a_b_dimensions[2]){ if(rank == 0) fprintf(stderr, "A's column size (%d) must match B's row size (%d)\n", matrices_a_b_dimensions[1], matrices_a_b_dimensions[2]); MPI_Abort(MPI_COMM_WORLD, -1); } // this implementation is limited to cases where thematrices can be partitioned perfectly if( matrices_a_b_dimensions[0] % sqrt_size != 0 || 
matrices_a_b_dimensions[1] % sqrt_size != 0 || matrices_a_b_dimensions[2] % sqrt_size != 0 || matrices_a_b_dimensions[3] % sqrt_size != 0 ){ if(rank == 0) fprintf(stderr, "cannot distribute work evenly among processe\n" "all dimensions (A: r:%d c:%d; B: r:%d c:%d) need to be divisible by %d\n", matrices_a_b_dimensions[0],matrices_a_b_dimensions[1], matrices_a_b_dimensions[2],matrices_a_b_dimensions[3], sqrt_size ); MPI_Abort(MPI_COMM_WORLD, -1); } } //stop I/O reading counter reading_time = MPI_Wtime() - reading_time; //Start dimensions sending counter dimensions_time = MPI_Wtime(); // send dimensions to all peers if(rank == 0) { int i; for(i = 1; i < size; i++){ MPI_Send(matrices_a_b_dimensions, 4, MPI_INT, i, 0, cartesian_grid_communicator); } } else { MPI_Recv(matrices_a_b_dimensions, 4, MPI_INT, 0, 0, cartesian_grid_communicator, &status); } //stop dimensions sending counter dimensions_time = MPI_Wtime() - dimensions_time; A_rows = matrices_a_b_dimensions[0]; A_columns = matrices_a_b_dimensions[1]; B_rows = matrices_a_b_dimensions[2]; B_columns = matrices_a_b_dimensions[3]; // local metadata for A A_local_block_rows = A_rows / sqrt_size; A_local_block_columns = A_columns / sqrt_size; A_local_block_size = A_local_block_rows * A_local_block_columns; A_local_block = (double *) malloc (A_local_block_size * sizeof(double)); // local metadata for B B_local_block_rows = B_rows / sqrt_size; B_local_block_columns = B_columns / sqrt_size; B_local_block_size = B_local_block_rows * B_local_block_columns; B_local_block = (double *) malloc (B_local_block_size * sizeof(double)); // local metadata for C C_local_block = (double *) malloc (A_local_block_rows * B_local_block_columns * sizeof(double)); // C needs to be initialized at 0 (accumulates partial dot-products) int i; for(i=0; i < A_local_block_rows * B_local_block_columns; i++){ C_local_block[i] = 0; } //Start data scattering counter scatter_time = MPI_Wtime(); // full arrays only needed at root if(rank == 0){ A_array = (double *) malloc(sizeof(double) * A_rows * A_columns); B_array = (double *) malloc(sizeof(double) * B_rows * B_columns); C_array = (double *) malloc(sizeof(double) * A_rows * B_columns); // generate the 1D arrays of the matrices at root int row, column, i, j; for (i = 0; i < sqrt_size; i++){ for (j = 0; j < sqrt_size; j++){ for (row = 0; row < A_local_block_rows; row++){ for (column = 0; column < A_local_block_columns; column++){ A_array[((i * sqrt_size + j) * A_local_block_size) + (row * A_local_block_columns) + column] = A[i * A_local_block_rows + row][j * A_local_block_columns + column]; } } for (row = 0; row < B_local_block_rows; row++){ for (column = 0; column < B_local_block_columns; column++){ B_array[((i * sqrt_size + j) * B_local_block_size) + (row * B_local_block_columns) + column] = B[i * B_local_block_rows + row][j * B_local_block_columns + column]; } } } } // allocate output matrix C C = (double **) malloc(A_rows * sizeof(double *)); for(i=0; i<A_rows ;i++){ C[i] = (double *) malloc(B_columns * sizeof(double)); } } // send a block to each process if(rank == 0) { int i; for(i = 1; i < size; i++){ MPI_Send((A_array + (i * A_local_block_size)), A_local_block_size, MPI_DOUBLE, i, 0, cartesian_grid_communicator); MPI_Send((B_array + (i * B_local_block_size)), B_local_block_size, MPI_DOUBLE, i, 0, cartesian_grid_communicator); } for(i = 0; i < A_local_block_size; i++){ A_local_block[i] = A_array[i]; } for(i = 0; i < B_local_block_size; i++){ B_local_block[i] = B_array[i]; } } else { MPI_Recv(A_local_block, 
A_local_block_size, MPI_DOUBLE, 0, 0, cartesian_grid_communicator, &status); MPI_Recv(B_local_block, B_local_block_size, MPI_DOUBLE, 0, 0, cartesian_grid_communicator, &status); } // fix initial arrangements before the core algorithm starts if(coordinates[0] != 0){ MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE, (coordinates[1] + sqrt_size - coordinates[0]) % sqrt_size, 0, (coordinates[1] + coordinates[0]) % sqrt_size, 0, row_communicator, &status); } if(coordinates[1] != 0){ MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE, (coordinates[0] + sqrt_size - coordinates[1]) % sqrt_size, 0, (coordinates[0] + coordinates[1]) % sqrt_size, 0, column_communicator, &status); } //stop data scattering counter scatter_time = MPI_Wtime() - scatter_time; // cannon's algorithm int cannon_block_cycle; double compute_time = 0, mpi_time = 0, start; int C_index, A_row, A_column, B_column; for(cannon_block_cycle = 0; cannon_block_cycle < sqrt_size; cannon_block_cycle++){ // compute partial result for this block cycle start = MPI_Wtime(); for(C_index = 0, A_row = 0; A_row < A_local_block_rows; A_row++){ for(B_column = 0; B_column < B_local_block_columns; B_column++, C_index++){ for(A_column = 0; A_column < A_local_block_columns; A_column++){ C_local_block[C_index] += A_local_block[A_row * A_local_block_columns + A_column] * B_local_block[A_column * B_local_block_columns + B_column]; } } } compute_time += MPI_Wtime() - start; start = MPI_Wtime(); // rotate blocks horizontally MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE, (coordinates[1] + sqrt_size - 1) % sqrt_size, 0, (coordinates[1] + 1) % sqrt_size, 0, row_communicator, &status); // rotate blocks vertically MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE, (coordinates[0] + sqrt_size - 1) % sqrt_size, 0, (coordinates[0] + 1) % sqrt_size, 0, column_communicator, &status); mpi_time += MPI_Wtime() - start; } //Start data gathering counter gather_time = MPI_Wtime(); // get C parts from other processes at rank 0 if(rank == 0) { for(i = 0; i < A_local_block_rows * B_local_block_columns; i++){ C_array[i] = C_local_block[i]; } int i; for(i = 1; i < size; i++){ MPI_Recv(C_array + (i * A_local_block_rows * B_local_block_columns), A_local_block_rows * B_local_block_columns, MPI_DOUBLE, i, 0, cartesian_grid_communicator, &status); } } else { MPI_Send(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE, 0, 0, cartesian_grid_communicator); } //stop data gathering counter gather_time = MPI_Wtime() - gather_time; // generating output at rank 0 if (rank == 0) { //Start I/O writing counter writing_time = MPI_Wtime(); // convert the ID array into the actual C matrix int i, j, k, row, column; char output_filename[50]; sprintf(output_filename,"output%dx%d_%d.out",A_rows,B_columns,atoi(argv[3])); FILE *fp; if((fp = fopen(output_filename, "wb")) == NULL) { perror("File cannot be opened"); exit(1); } fwrite(C_array, sizeof(double), A_rows*B_columns, fp); fclose(fp); //stop I/O writing counter writing_time = MPI_Wtime() - writing_time; //Print metrics printf("(%d,%d)x(%d,%d)=(%d,%d)\n", A_rows, A_columns, B_rows, B_columns, A_rows, B_columns); printf("Computation time: %lf\n", compute_time); printf("MPI time: %lf\n", mpi_time); printf("Reading time: %lf\n", reading_time); printf("Dimensions time: %lf\n", dimensions_time); printf("Scattering time: %lf\n", scatter_time); printf("Gathering time: %lf\n", gather_time); printf("Writing time: %lf\n", writing_time); printf("Total non-computational 
MPI time: %lf\n", dimensions_time + scatter_time + gather_time); printf("Total IO time: %lf\n", reading_time + writing_time); if (argc == 5){ // present results on the screen printf("\nA( %d x %d ):\n", A_rows, A_columns); for(row = 0; row < A_rows; row++) { for(column = 0; column < A_columns; column++) printf ("%7.3f ", A[row][column]); printf ("\n"); } printf("\nB( %d x %d ):\n", B_rows, B_columns); for(row = 0; row < B_rows; row++){ for(column = 0; column < B_columns; column++) printf("%7.3f ", B[row][column]); printf("\n"); } printf("\nC( %d x %d ) = AxB:\n", A_rows, B_columns); for(row = 0; row < A_rows; row++){ for(column = 0; column < B_columns; column++) printf("%7.3f ",C[row][column]); printf("\n"); } printf("\nPerforming serial consistency check. Be patient...\n"); fflush(stdout); int pass = 1; double temp; for(i=0; i<A_rows; i++){ for(j=0; j<B_columns; j++){ temp = 0; for(k=0; k<B_rows; k++){ temp += A[i][k] * B[k][j]; } printf("%7.3f ", temp); if(temp != C[i][j]){ pass = 0; } } printf("\n"); } if (pass) printf("Consistency check: PASS\n"); else printf("Consistency check: FAIL\n"); } } // free all memory if(rank == 0){ int i; for(i = 0; i < A_rows; i++){ free(A[i]); } for(i = 0; i < B_rows; i++){ free(B[i]); } for(i = 0; i < A_rows; i++){ free(C[i]); } free(A); free(B); free(C); free(A_array); free(B_array); free(C_array); } free(A_local_block); free(B_local_block); free(C_local_block); // finalize MPI MPI_Finalize(); }
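/*
 * Hedged sketch (a design alternative, not the code used above): on a periodic Cartesian
 * communicator, MPI_Cart_shift computes the same wrap-around neighbours that the manual
 * "(coordinate + sqrt_size - 1) % sqrt_size" arithmetic produces, so each Cannon rotation
 * step reduces to one MPI_Sendrecv_replace per matrix. A double scalar stands in for a
 * local sub-block; any process count works thanks to MPI_Dims_create.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size, dims[2] = {0, 0}, periods[2] = {1, 1}, coords[2];
    int left, right, up, down;
    double block;                       /* stand-in for a local sub-block */
    MPI_Comm grid;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    MPI_Dims_create(size, 2, dims);                          /* factor size into a 2D grid */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &grid);
    MPI_Comm_rank(grid, &rank);                              /* rank may be reordered */
    MPI_Cart_coords(grid, rank, 2, coords);

    /* displacement -1: "source" is the right/down neighbour, "dest" the left/up one,
       with periodic wrap-around supplied by the communicator itself */
    MPI_Cart_shift(grid, 1, -1, &right, &left);
    MPI_Cart_shift(grid, 0, -1, &down, &up);

    block = (double)rank;
    /* one Cannon rotation step: shift A-blocks left, B-blocks up */
    MPI_Sendrecv_replace(&block, 1, MPI_DOUBLE, left, 0, right, 0, grid, MPI_STATUS_IGNORE);
    MPI_Sendrecv_replace(&block, 1, MPI_DOUBLE, up,   1, down,  1, grid, MPI_STATUS_IGNORE);

    printf("rank %d at (%d,%d) now holds the block that started on rank %g\n",
           rank, coords[0], coords[1], block);

    MPI_Comm_free(&grid);
    MPI_Finalize();
    return 0;
}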
int main(int argc, char **argv) { int rank, num_tasks; /* Initialize MPI */ #if USE_MPI MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &num_tasks); MPI_Comm_rank(MPI_COMM_WORLD, &rank); // printf("Hello world from rank %3d of %3d\n", rank, num_tasks); #else rank = 0; num_tasks = 1; #endif if (argc != 2) { if (rank == 0) { fprintf(stderr, "%s <n>\n", argv[0]); fprintf(stderr, "Program for parallel dense matrix-matrix multiplication\n"); fprintf(stderr, "with 1D row partitioning\n"); fprintf(stderr, "<n>: matrix dimension (an nxn dense matrix is created)\n"); #if USE_MPI MPI_Abort(MPI_COMM_WORLD, 1); #else exit(1); #endif } } int n; n = atoi(argv[1]); assert(n > 0); assert(n < 10000); /* ensure that n is a multiple of num_tasks */ n = (n/num_tasks) * num_tasks; int n_p = (n/num_tasks); /* print new n to let user know n has been modified */ if (rank == 0) { fprintf(stderr, "n: %d, n_p: %d, num_tasks: %d\n", n, n_p, num_tasks); fprintf(stderr, "Requires %3.6lf MB of memory per task\n", ((3*4.0*n_p)*n/1e6)); } float *A, *B, *C; A = (float *) malloc(n_p * n * sizeof(float)); assert(A != 0); B = (float *) malloc(n_p * n * sizeof(float)); assert(B != 0); C = (float *) malloc(n_p * n * sizeof(float)); assert(C != 0); /* linearized matrices in row-major storage */ /* A[i][j] would be A[i*n+j] */ int i, j; /* static initialization, so that we can verify output */ /* using very simple initialization right now */ /* this isn't a good check for parallel debugging */ #ifdef _OPENMP #pragma omp parallel for private(i,j) #endif for (i=0; i<n_p; i++) { for (j=0; j<n; j++) { A[i*n+j] = (rank+1); B[i*n+j] = 1; C[i*n+j] = 0; } } #if USE_MPI MPI_Barrier(MPI_COMM_WORLD); #endif double elt = 0.0, commStart = 0.0, commEnd = 0.0, commTime = 0.0, totalCommTime = 0.0, tempTime; if (rank == 0) elt = timer(); #if USE_MPI /* Parallel matmul code goes here, see lecture slides for idea */ /* The matrix C should be updated correctly */ // precalculate some variables to use in the loop int dest=rank+1, src=rank-1, numElemPerProc = n_p * n; // mark the beginning of column indices int colStart = rank * n_p; int nr, k; for(nr = 0; nr < num_tasks; nr++) { // wrap around colStart if(colStart < 0) colStart = n - n_p; // do the actual matrix multiplication on each proc's data for(i = 0; i < n_p; i++) { int iC = i * n; int iA = iC + colStart; for(j = 0; j < n; j++) { float result = 0; for(k = 0; k < n_p; k++) result += A[iA + k] * B[k*n + j]; // attempt at loop unrolling with re-association //for(k = 0; k < n_p; k+=2) // result += (A[iA + k] * B[k*n + j]) + (A[iA + k + 1] * B[(k+1)*n + j]); //// include the remaining elements now //for(; k < n_p; k++) // result += (A[iA + k] * B[k*n + j]); C[iC + j] += result; } // end of j loop } // end of i loop // wrap around indices in case of edge conditions if(rank == 0) src = num_tasks - 1; else if (rank == num_tasks - 1) dest = 0; commStart = timer(); // use SendRecv replace to perform a Send first to higher rank proc // and then Receive from a lower rank proc // Cyclic transfers of chunks of B's data MPI_Sendrecv_replace(B, numElemPerProc, MPI_FLOAT, dest, 123, src, 123, MPI_COMM_WORLD, MPI_STATUS_IGNORE); commEnd = timer(); commTime = commEnd - commStart; // calculate the max comm time since that will be the limiter?
MPI_Reduce(&commTime, &tempTime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); // only the root process accumulates the total communication time if(!rank) totalCommTime += tempTime; // adjust colStart since we have moved on to the // previous row, so colStart needs to be decremented // and wrapped around colStart -= n_p; } // end of nr for loop #else int k; #ifdef _OPENMP #pragma omp parallel for private(i,j,k) #endif for (i=0; i<n_p; i++) { for (j=0; j<n; j++) { float c_ij = 0; for (k=0; k<n; k++) { c_ij += A[i*n+k]*B[k*n+j]; } C[i*n+j] = c_ij; } } #endif if (rank == 0) elt = timer() - elt; /* Verify */ int verify_failed = 0; for (i=0; i<n_p; i++) { for (j=0; j<n; j++) { if (C[i*n+j] != ((rank+1)*n)) verify_failed = 1; } } if (verify_failed) { fprintf(stderr, "ERROR: rank %d, verification failed, exiting!\n", rank); #if USE_MPI MPI_Abort(MPI_COMM_WORLD, 2); #else exit(2); #endif } if (rank == 0) { fprintf(stderr, "Time taken: %3.3lf s.\n", elt); fprintf(stderr, "Performance: %3.3lf GFlop/s\n", (2.0*n*n)*n/(elt*1e9)); fprintf(stderr, "Communication time: %3.3lf s.\n", totalCommTime); fprintf(stderr, "Computation time must be: %3.3lf s.\n", elt - totalCommTime); } /* free memory */ free(A); free(B); free(C); /* Shut down MPI */ #if USE_MPI MPI_Finalize(); #endif return 0; }
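/*
 * Hedged sketch (illustrative only): the timing pattern used above -- measure the
 * per-rank communication time and keep the maximum across ranks, since the slowest rank
 * bounds each step -- shown in a self-contained form with MPI_Wtime instead of the
 * program's timer() helper.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank;
    double t0, local_time, max_time;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    t0 = MPI_Wtime();
    MPI_Barrier(MPI_COMM_WORLD);           /* stand-in for a communication phase */
    local_time = MPI_Wtime() - t0;

    /* root obtains the slowest rank's time, i.e. the effective cost of the phase */
    MPI_Reduce(&local_time, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (rank == 0)
        printf("communication phase took %g s (max over ranks)\n", max_time);

    MPI_Finalize();
    return 0;
}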