/* Extract an m x n submatrix within an m x N matrix and transpose it.
   Assume storage by rows; the defined datatype accesses by columns */
MPI_Datatype transpose_type(int N, int m, int n, MPI_Datatype type)
/* computes a datatype for the transpose of an m x n matrix with entries of type type */
{
    MPI_Datatype subrow, subrow1, submatrix;
    MPI_Aint lb, extent;

    MPI_Type_vector(m, 1, N, type, &subrow);
    MPI_Type_get_extent(type, &lb, &extent);
    MPI_Type_create_resized(subrow, 0, extent, &subrow1);
    MPI_Type_contiguous(n, subrow1, &submatrix);
    MPI_Type_commit(&submatrix);
    MPI_Type_free(&subrow);
    MPI_Type_free(&subrow1);

    /* Add a consistency test: the size of submatrix should be n * m * sizeof(type)
       and the true extent should be ((m-1)*N + n) * sizeof(type) */
    {
        int tsize;
        MPI_Aint textent, llb;
        MPI_Type_size(type, &tsize);
        MPI_Type_get_true_extent(submatrix, &llb, &textent);
        if (textent != tsize * (N * (m-1) + n)) {
            fprintf(stderr, "Transpose Submatrix extent is %ld, expected %ld (%d,%d,%d)\n",
                    (long)textent, (long)(tsize * (N * (m-1) + n)), N, n, m);
        }
    }
    return submatrix;
}
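/* A minimal usage sketch (not part of the original source): transpose the
   leading m x n block of an m x N row-major matrix of doubles by sending one
   element of the derived type and receiving plain doubles. The send side walks
   the block column by column, the receive side stores the stream contiguously,
   i.e. transposed. The function and buffer names here are illustrative
   assumptions. */
void transpose_block_example(double *a  /* m x N, row-major */,
                             double *at /* n x m, row-major */,
                             int N, int m, int n)
{
    MPI_Datatype t = transpose_type(N, m, n, MPI_DOUBLE);
    /* self-exchange on MPI_COMM_SELF: at[j*m + i] ends up equal to a[i*N + j] */
    MPI_Sendrecv(a,  1,     t,          0, 0,
                 at, n * m, MPI_DOUBLE, 0, 0,
                 MPI_COMM_SELF, MPI_STATUS_IGNORE);
    MPI_Type_free(&t);
}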
double* partition_matrix(double *a, int N, int gd, MPI_Datatype *type_block)
{
    MPI_Datatype type_block_tmp;
    int NB = N/gd;
    double *b = malloc(NB*NB*sizeof(double));

    /* one NB x NB block of the N x N matrix, resized so that consecutive
       blocks in the scatter start one double apart */
    MPI_Type_vector(NB, NB, N, MPI_DOUBLE, &type_block_tmp);
    MPI_Type_create_resized(type_block_tmp, 0, sizeof(double), type_block);
    MPI_Type_commit(type_block);
    MPI_Type_free(&type_block_tmp);

    int counts[gd*gd];
    int disps[gd*gd];
    for (int i=0; i<gd; i++) {
        for (int j=0; j<gd; j++) {
            disps[i*gd+j]  = i*N*NB + j*NB;
            counts[i*gd+j] = 1;
        }
    }
    MPI_Scatterv(a, counts, disps, *type_block,
                 b, NB*NB, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    return b;
}
static MPI_Datatype create_indexed_gap_optimized_ddt( void ) { MPI_Datatype dt1, dt2, dt3; int bLength[3]; MPI_Datatype types[3]; MPI_Aint displ[3]; MPI_Type_contiguous( 40, MPI_BYTE, &dt1 ); MPI_Type_create_resized( dt1, 0, 44, &dt2 ); bLength[0] = 4; bLength[1] = 9; bLength[2] = 36; types[0] = MPI_BYTE; types[1] = dt2; types[2] = MPI_BYTE; displ[0] = 0; displ[1] = 8; displ[2] = 44 * 9 + 8; MPI_Type_create_struct( 3, bLength, displ, types, &dt3 ); MPI_Type_free( &dt1 ); MPI_Type_free( &dt2 ); MPI_DDT_DUMP( dt3 ); MPI_Type_commit( &dt3 ); return dt3; }
static PetscErrorCode MakeDatatype(MPI_Datatype *dtype) { PetscErrorCode ierr; MPI_Datatype dtypes[3],tmptype; PetscMPIInt lengths[3]; MPI_Aint displs[3]; Unit dummy; PetscFunctionBegin; dtypes[0] = MPIU_INT; dtypes[1] = MPIU_SCALAR; dtypes[2] = MPI_CHAR; lengths[0] = 1; lengths[1] = 1; lengths[2] = 3; /* Curse the evil beings that made std::complex a non-POD type. */ displs[0] = (char*)&dummy.rank - (char*)&dummy; /* offsetof(Unit,rank); */ displs[1] = (char*)&dummy.value - (char*)&dummy; /* offsetof(Unit,value); */ displs[2] = (char*)&dummy.ok - (char*)&dummy; /* offsetof(Unit,ok); */ ierr = MPI_Type_create_struct(3,lengths,displs,dtypes,&tmptype);CHKERRQ(ierr); ierr = MPI_Type_commit(&tmptype);CHKERRQ(ierr); ierr = MPI_Type_create_resized(tmptype,0,sizeof(Unit),dtype);CHKERRQ(ierr); ierr = MPI_Type_commit(dtype);CHKERRQ(ierr); ierr = MPI_Type_free(&tmptype);CHKERRQ(ierr); { MPI_Aint lb,extent; ierr = MPI_Type_get_extent(*dtype,&lb,&extent);CHKERRQ(ierr); if (extent != sizeof(Unit)) SETERRQ2(PETSC_COMM_WORLD,PETSC_ERR_LIB,"New type has extent %d != sizeof(Unit) %d",extent,(int)sizeof(Unit)); } PetscFunctionReturn(0); }
void distribute_matrix(ATYPE *root_matrix, ATYPE *local_matrix, int local_rank, int proc_size, long partition, uint N){ int sendcounts[proc_size], displs[proc_size]; ATYPE *sendbuffer=NULL; MPI_Datatype MPI_type, MPI_type2; int rest = N - (partition * ( proc_size - 1) ); MPI_Type_vector(N, 1, N, ATYPE_MPI, &MPI_type2); MPI_Type_create_resized( MPI_type2, 0, sizeof(ATYPE), &MPI_type); MPI_Type_commit(&MPI_type); for ( int i=0 ; i<proc_size ; ++i ){ if ( i == proc_size - 1 ) { sendcounts[i] = rest; } else { sendcounts[i] = partition; } displs[i] = i*partition; } if ( local_rank == root ) sendbuffer = &(root_matrix[0]); MPI_Scatterv( sendbuffer, sendcounts, displs, MPI_type, &(local_matrix[0]), partition*N, ATYPE_MPI, root, MPI_COMM_WORLD ); MPI_Type_free(&MPI_type); }
static PetscErrorCode MatStashBlockTypeSetUp(MatStash *stash) { PetscErrorCode ierr; PetscFunctionBegin; if (stash->blocktype == MPI_DATATYPE_NULL) { PetscInt bs2 = PetscSqr(stash->bs); PetscMPIInt blocklens[2]; MPI_Aint displs[2]; MPI_Datatype types[2],stype; /* C++ std::complex is not my favorite datatype. Since it is not POD, we cannot use offsetof to find the offset of * vals. But the layout is actually guaranteed by the standard, so we do a little dance here with struct * DummyBlock, substituting PetscReal for PetscComplex so that we can determine the offset. */ struct DummyBlock {PetscInt row,col; PetscReal vals;}; stash->blocktype_size = offsetof(struct DummyBlock,vals) + bs2*sizeof(PetscScalar); if (stash->blocktype_size % sizeof(PetscInt)) { /* Implies that PetscInt is larger and does not satisfy alignment without padding */ stash->blocktype_size += sizeof(PetscInt) - stash->blocktype_size % sizeof(PetscInt); } ierr = PetscSegBufferCreate(stash->blocktype_size,1,&stash->segsendblocks);CHKERRQ(ierr); ierr = PetscSegBufferCreate(stash->blocktype_size,1,&stash->segrecvblocks);CHKERRQ(ierr); ierr = PetscSegBufferCreate(sizeof(MatStashFrame),1,&stash->segrecvframe);CHKERRQ(ierr); blocklens[0] = 2; blocklens[1] = bs2; displs[0] = offsetof(struct DummyBlock,row); displs[1] = offsetof(struct DummyBlock,vals); types[0] = MPIU_INT; types[1] = MPIU_SCALAR; ierr = MPI_Type_create_struct(2,blocklens,displs,types,&stype);CHKERRQ(ierr); ierr = MPI_Type_commit(&stype);CHKERRQ(ierr); ierr = MPI_Type_create_resized(stype,0,stash->blocktype_size,&stash->blocktype);CHKERRQ(ierr); /* MPI-2 */ ierr = MPI_Type_commit(&stash->blocktype);CHKERRQ(ierr); ierr = MPI_Type_free(&stype);CHKERRQ(ierr); }
/* derived_resized_test()
 *
 * Tests behavior with resizing of a simple derived type.
 *
 * Returns the number of errors encountered.
 */
int derived_resized_test(void)
{
    int err, errs = 0;
    int count = 2;
    MPI_Datatype newtype, resizedtype;
    int size;
    MPI_Aint lb, extent;

    err = MPI_Type_contiguous(count, MPI_INT, &newtype);
    if (err != MPI_SUCCESS) {
        if (verbose) {
            fprintf(stderr, "error creating type in derived_resized_test()\n");
        }
        errs++;
    }

    err = MPI_Type_create_resized(newtype, (MPI_Aint) 0,
                                  (MPI_Aint) (2 * sizeof(int) + 10), &resizedtype);

    err = MPI_Type_size(resizedtype, &size);
    if (err != MPI_SUCCESS) {
        if (verbose) {
            fprintf(stderr, "error obtaining type size in derived_resized_test()\n");
        }
        errs++;
    }

    if (size != 2 * sizeof(int)) {
        if (verbose) {
            fprintf(stderr, "error: size != %d in derived_resized_test()\n",
                    (int) (2 * sizeof(int)));
        }
        errs++;
    }

    /* MPI_Type_extent is deprecated (removed in MPI-3); use MPI_Type_get_extent */
    err = MPI_Type_get_extent(resizedtype, &lb, &extent);
    if (err != MPI_SUCCESS) {
        if (verbose) {
            fprintf(stderr, "error obtaining type extent in derived_resized_test()\n");
        }
        errs++;
    }

    if (extent != 2 * sizeof(int) + 10) {
        if (verbose) {
            fprintf(stderr,
                    "error: invalid extent (%d) in derived_resized_test(); should be %d\n",
                    (int) extent, (int) (2 * sizeof(int) + 10));
        }
        errs++;
    }

    MPI_Type_free(&newtype);
    MPI_Type_free(&resizedtype);

    return errs;
}
JNIEXPORT jlong JNICALL Java_mpi_Datatype_getResized( JNIEnv *env, jclass clazz, jlong oldType, jint lb, jint extent) { MPI_Datatype type; int rc = MPI_Type_create_resized((MPI_Datatype)oldType, lb, extent, &type); ompi_java_exceptionCheck(env, rc); return (jlong)type; }
// Function to create and commit MPI datatypes.
// Each datatype is resized to the extent of a single float; this appears to fix
// the segmentation-fault issues by keeping counts and displacements in the
// scatter/gather calls element-sized.
void create_types() {
    MPI_Datatype border_row_t0;
    MPI_Type_contiguous(local_width,      // count
                        MPI_FLOAT,        // old_type
                        &border_row_t0);  // newtype_p
    MPI_Type_create_resized(border_row_t0, 0, sizeof(float), &border_row_t);
    MPI_Type_commit(&border_row_t);

    MPI_Datatype border_col_t0;
    MPI_Type_vector(local_height,         // count
                    1,                    // blocklength
                    local_width + 2,      // stride
                    MPI_FLOAT,            // old_type
                    &border_col_t0);      // newtype_p
    MPI_Type_create_resized(border_col_t0, 0, sizeof(float), &border_col_t);
    MPI_Type_commit(&border_col_t);

    MPI_Datatype pres_and_diverg_t0;
    MPI_Type_vector(local_height,         // count
                    local_width,          // blocklength
                    imageSize + 2,        // stride
                    MPI_FLOAT,            // old_type
                    &pres_and_diverg_t0); // newtype_p
    MPI_Type_create_resized(pres_and_diverg_t0, 0, sizeof(float), &pres_and_diverg_t);
    MPI_Type_commit(&pres_and_diverg_t);

    MPI_Datatype local_diverg_t0;
    MPI_Type_vector(local_height,         // count
                    local_width,          // blocklength
                    local_width,          // stride
                    MPI_FLOAT,            // old_type
                    &local_diverg_t0);    // newtype_p
    MPI_Type_create_resized(local_diverg_t0, 0, sizeof(float), &local_diverg_t);
    MPI_Type_commit(&local_diverg_t);

    MPI_Datatype local_pres_t0;
    MPI_Type_vector(local_height,         // count
                    local_width,          // blocklength
                    local_width + 2,      // stride
                    MPI_FLOAT,            // old_type
                    &local_pres_t0);      // newtype_p
    MPI_Type_create_resized(local_pres_t0, 0, sizeof(float), &local_pres_t);
    MPI_Type_commit(&local_pres_t);
}
/*---------------------------------------------------------------------
 * Function:   Build_cyclic_mpi_type
 * Purpose:    Build an MPI derived datatype that can be used with
 *             cyclically distributed data.
 * In arg:
 *    loc_n:   The number of elements assigned to each process
 * Global out:
 *    cyclic_mpi_t:  An MPI datatype that can be used with cyclically
 *                   distributed data
 */
void Build_cyclic_mpi_type(int loc_n) {
    MPI_Datatype temp_mpi_t;
    MPI_Aint lb, extent;

    MPI_Type_vector(loc_n, 1, comm_sz, MPI_INT, &temp_mpi_t);
    MPI_Type_get_extent(MPI_INT, &lb, &extent);
    MPI_Type_create_resized(temp_mpi_t, lb, extent, &cyclic_mpi_t);
    MPI_Type_commit(&cyclic_mpi_t);
}  /* Build_cyclic_mpi_type */
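/* A hedged usage sketch (assumed, not from the original source): because the
   extent of cyclic_mpi_t is shrunk to one int, consecutive send-type elements
   in MPI_Scatter start one int apart, so process i receives
   a[i], a[i+comm_sz], a[i+2*comm_sz], ... , i.e. the cyclic distribution.
   It relies on the same globals comm_sz and cyclic_mpi_t used above and
   assumes MPI_COMM_WORLD as the communicator. */
void Scatter_cyclic_example(int a[]     /* comm_sz*loc_n ints, significant on rank 0 */,
                            int loc_a[] /* loc_n ints on every process */,
                            int loc_n)
{
    Build_cyclic_mpi_type(loc_n);
    MPI_Scatter(a, 1, cyclic_mpi_t,      /* one cyclic "slice" per process    */
                loc_a, loc_n, MPI_INT,   /* received as loc_n contiguous ints */
                0, MPI_COMM_WORLD);
    MPI_Type_free(&cyclic_mpi_t);
}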
int main(int argc, char **argv) { MPI_Init(&argc, &argv); int p, rank; MPI_Comm_size(MPI_COMM_WORLD, &p); MPI_Comm_rank(MPI_COMM_WORLD, &rank); char i; char a[ROWS*COLS]; int NPROWS=2; int NPCOLS=3; int BLOCKROWS = ROWS/NPROWS; int BLOCKCOLS = COLS/NPCOLS; if (rank == 0) { for (int ii=0; ii<ROWS*COLS; ii++) { a[ii] = ii; } } if (p != NPROWS*NPCOLS) { fprintf(stderr,"Error: number of PEs %d != %d x %d\n", p, NPROWS, NPCOLS); MPI_Finalize(); exit(-1); } char b[BLOCKROWS*BLOCKCOLS]; for (int ii=0; ii<BLOCKROWS*BLOCKCOLS; ii++) b[ii] = 0; MPI_Datatype blocktype; MPI_Datatype blocktype2; MPI_Type_vector(BLOCKROWS, BLOCKCOLS, COLS, MPI_CHAR, &blocktype2); MPI_Type_create_resized( blocktype2, 0, sizeof(char), &blocktype); MPI_Type_commit(&blocktype); int disps[NPROWS*NPCOLS]; int counts[NPROWS*NPCOLS]; for (int ii=0; ii<NPROWS; ii++) { for (int jj=0; jj<NPCOLS; jj++) { disps[ii*NPCOLS+jj] = ii*COLS*BLOCKROWS+jj*BLOCKCOLS; counts [ii*NPCOLS+jj] = 1; } } MPI_Finalize(); return 0; }
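/* The snippet above builds blocktype, counts and disps but finalizes without
   using them. A minimal sketch of the scatter this setup is usually paired
   with, using the names from that main(); these statements are an assumption,
   not part of the original listing, and would go right before MPI_Finalize(): */
MPI_Scatterv(a, counts, disps,          /* one resized block per process  */
             blocktype,
             b, BLOCKROWS * BLOCKCOLS,  /* received as a contiguous block */
             MPI_CHAR,
             0, MPI_COMM_WORLD);
MPI_Type_free(&blocktype);
MPI_Type_free(&blocktype2);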
void distribute_matrix(double **matrix, double *global_mat_ptr, int *sendCounts,
                       int *displs, int *global_size, int *local_size)
{
    int start[2] = {0, 0};
    double *local_ptr = &(matrix[0][0]);
    MPI_Datatype subType;
    MPI_Datatype type;

    MPI_Type_create_subarray(2, global_size, local_size, start, MPI_ORDER_C,
                             MPI_DOUBLE, &subType);
    MPI_Type_create_resized(subType, 0, local_size[1]*sizeof(double), &type);
    MPI_Type_commit(&type);

    MPI_Scatterv(global_mat_ptr, sendCounts, displs, type,
                 local_ptr, local_size[0]*local_size[1], MPI_DOUBLE,
                 0, MPI_COMM_WORLD);
    MPI_Type_free(&type);
}
void gather_submatrices(double **matrix, double *global_mat_ptr, int *sendCounts,
                        int *displs, int global_a_row, int *local_size, int my_rank)
{
    int start[2] = {0, 0}, global_size[2] = {global_a_row, global_a_row};
    double *local_ptr = &(matrix[0][0]);
    /* built on every rank; the recv-side datatype is only significant at the root */
    MPI_Datatype subType;
    MPI_Datatype type;

    MPI_Type_create_subarray(2, global_size, local_size, start, MPI_ORDER_C,
                             MPI_DOUBLE, &subType);
    MPI_Type_create_resized(subType, 0, local_size[1]*sizeof(double), &type);
    MPI_Type_commit(&type);
    //printf("Global 0 : %d global 1 : %d local 0 : %d local 1 : %d\n",
    //       global_size[0], global_size[1], local_size[0], local_size[1]);
    MPI_Gatherv(local_ptr, local_size[0]*local_size[1], MPI_DOUBLE,
                global_mat_ptr, sendCounts, displs, type, 0, MPI_COMM_WORLD);
    MPI_Type_free(&type);
}
void avtWholeImageCompositerWithZ::InitializeMPIStuff(void) { #define UCH MPI_UNSIGNED_CHAR #define FLT MPI_FLOAT int lengths[] = { 1, 1, 1, 1}; MPI_Aint displacements[] = { 0, 0, 0, 0}; MPI_Datatype types[] = {FLT, UCH, UCH, UCH}; ZFPixel_t onePixel; #undef UCH #undef FLT // create the MPI data type for ZFPixel MPI_Address(&onePixel.z, &displacements[0]); MPI_Address(&onePixel.r, &displacements[1]); MPI_Address(&onePixel.g, &displacements[2]); MPI_Address(&onePixel.b, &displacements[3]); for (int i = 3; i >= 0; --i) displacements[i] -= displacements[0]; MPI_Type_create_struct(4, lengths, displacements, types, &avtWholeImageCompositerWithZ::mpiTypeZFPixel); // check that the datatype has the correct extent MPI_Aint ext; MPI_Type_extent(avtWholeImageCompositerWithZ::mpiTypeZFPixel, &ext); if (ext != sizeof(onePixel)) { MPI_Datatype tmp = avtWholeImageCompositerWithZ::mpiTypeZFPixel; MPI_Type_create_resized(tmp, 0, sizeof(ZFPixel_t), &avtWholeImageCompositerWithZ::mpiTypeZFPixel); MPI_Type_free(&tmp); } MPI_Type_commit(&avtWholeImageCompositerWithZ::mpiTypeZFPixel); MPI_Op_create((MPI_User_function *)MergeZFPixelBuffers, 1, &avtWholeImageCompositerWithZ::mpiOpMergeZFPixelBuffers); }
/**
 * Returns the MPI_Datatype for `MyStruct`.
 */
MPI_Datatype mystruct_get_mpi_type() {
    // use MPI commands to create a custom datatype for MyStruct
    MPI_Datatype type, tmp_type;

    MyStruct x;
    MPI_Aint base, adr_key, adr_d, adr_e;
    MPI_Get_address(&x, &base);
    MPI_Get_address(&x.key, &adr_key);
    MPI_Get_address(&x.d, &adr_d);
    MPI_Get_address(&x.e[0], &adr_e);

    MPI_Aint disps[3] = {adr_key - base, adr_d - base, adr_e - base};
    MPI_Aint extent = sizeof(x);
    int blens[3] = {1, 1, 4};
    MPI_Datatype types[3] = {MPI_UNSIGNED, MPI_DOUBLE, MPI_CHAR};

    MPI_Type_create_struct(3, blens, disps, types, &tmp_type);
    // resize so the extent matches sizeof(MyStruct), including trailing padding
    MPI_Type_create_resized(tmp_type, 0, extent, &type);
    MPI_Type_free(&tmp_type);
    MPI_Type_commit(&type);
    return type;
}
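/* The layout assumed by mystruct_get_mpi_type() above. The actual definition
   of MyStruct is not part of this listing, so this is a best guess
   reconstructed from the block lengths and MPI types used
   (1 x MPI_UNSIGNED, 1 x MPI_DOUBLE, 4 x MPI_CHAR): */
typedef struct {
    unsigned key;   /* matched by MPI_UNSIGNED             */
    double   d;     /* matched by MPI_DOUBLE               */
    char     e[4];  /* matched by a block of four MPI_CHAR */
} MyStruct;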
int matrix_placement_proc(int nb_proc_row, int nb_in_block, MPI_Comm *comm,
                          int *sendbuf, int *rcvbuf, enum arrangement type, int ldnblc)
{
    MPI_Datatype blocktype;
    MPI_Datatype blocktype2;
    MPI_Datatype blocktype3;
    int ii, jj;

    // first, create the type describing the matrix block associated with a process
    MPI_Type_vector(nb_in_block, nb_in_block, nb_in_block*nb_proc_row, MPI_INT, &blocktype2);
    MPI_Type_create_resized(blocktype2, 0, sizeof(int), &blocktype);
    MPI_Type_commit(&blocktype);

    MPI_Type_vector(nb_in_block, nb_in_block, ldnblc, MPI_INT, &blocktype3);
    MPI_Type_commit(&blocktype3);

    int disps[nb_proc_row*nb_proc_row];
    int counts[nb_proc_row*nb_proc_row];
    for (ii=0; ii<nb_proc_row; ii++) {
        for (jj=0; jj<nb_proc_row; jj++) {
            disps[ii*nb_proc_row+jj]  = ii*nb_in_block*nb_in_block*nb_proc_row + jj*nb_in_block;
            counts[ii*nb_proc_row+jj] = 1;
        }
    }

    // scatter or gather
    if (type == SCATTER)
        MPI_Scatterv(sendbuf, counts, disps, blocktype, rcvbuf, 1, blocktype3, 0, *comm);
    else if (type == GATHER)
        MPI_Gatherv(sendbuf, 1, blocktype3, rcvbuf, counts, disps, blocktype, 0, *comm);

    MPI_Type_free(&blocktype);
    MPI_Type_free(&blocktype3);

    if (type != GATHER && type != SCATTER)
        return EXIT_FAILURE;
    return EXIT_SUCCESS;
}
/* * Construct a sub array type for scatter and gather data * Reference: http://stackoverflow.com/questions/9269399/sending-blocks-of-2d-array-in-c-using-mpi/9271753#9271753 */ void init_subarrtype(int root, int me, int n, int dim_sz, int per_n, MPI_Datatype* subarrtype_addr, int sendcounts[], int displs[]) { int sizes[2] = {n, n}; /* global size */ int subsizes[2] = {per_n, per_n}; /* local size */ int starts[2] = {0,0}; /* where this one starts */ MPI_Datatype type; mpi_check(MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &type)); mpi_check(MPI_Type_create_resized(type, 0, per_n*sizeof(double), subarrtype_addr)); mpi_check(MPI_Type_commit(subarrtype_addr)); int i,j; if(me == root) { for (i=0; i< dim_sz*dim_sz; i++) { sendcounts[i] = 1; } int disp = 0; for (i=0; i<dim_sz; i++) { for (j=0; j<dim_sz; j++) { displs[i*dim_sz+j] = disp; disp += 1; } disp += (per_n-1)*dim_sz; } } }
int main( int argc, char *argv[] ) { int errs = 0, i; int rank, size, source, dest; int count; int *buf; MPI_Comm comm; MPI_Status status; MPI_Datatype newtype; MTest_Init( &argc, &argv ); comm = MPI_COMM_WORLD; /* Determine the sender and receiver */ MPI_Comm_rank( comm, &rank ); MPI_Comm_size( comm, &size ); source = 0; dest = size - 1; /* Create an type that is "* INT * " that is, there is a int-sized pad at the beginning of the type, and the extent is still 3 ints. Note, however, that the INT is still at displacement 0, so the effective pattern i*/ MPI_Type_create_resized( MPI_INT, -(int)sizeof(int), 3 * sizeof(int), &newtype ); MPI_Type_commit( &newtype ); for (count = 1; count < 65000; count = count * 2) { buf = (int *)malloc( count * 3 * sizeof(int) ); if (!buf) { MPI_Abort( comm, 1 ); exit(1); } for (i=0; i<3*count; i++) buf[i] = -1; if (rank == source) { for (i=0; i<count; i++) buf[3*i] = i; MPI_Send( buf, count, newtype, dest, 0, comm ); MPI_Send( buf, count, newtype, dest, 1, comm ); } else if (rank == dest) { MPI_Recv( buf, count, MPI_INT, source, 0, comm, &status ); for (i=0; i<count; i++) { if (buf[i] != i) { errs++; if (errs < 10) { printf( "buf[%d] = %d\n", i, buf[i] ); } } } for (i=0; i<count*3; i++) buf[i] = -1; MPI_Recv( buf, count, newtype, source, 1, comm, &status ); for (i=0; i<count; i++) { if (buf[3*i] != i) { errs++; if (errs < 10) { printf( "buf[3*%d] = %d\n", i, buf[i] ); } } } } } MPI_Type_free( &newtype ); MTest_Finalize( errs ); MPI_Finalize(); return 0; }
int main ( int argc, char *argv[] ) { // Solution arrays real *h_u; /* to be allocated in ROOT only */ real *t_u; real *t_un; // Auxiliary variables int rank; int size; int step; dmn domain; double wtime; int nbrs[6]; int i, j, k; // Initialize MPI MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &size); // if number of np != Sx*Sy*Sz then terminate. if (size != SX*SY*SZ){ if (rank==ROOT) fprintf(stderr,"%s: Needs at least %d processors.\n", argv[0], SX*SY*SZ); MPI_Finalize(); return 1; } // verify subsizes if (NX%SX!=0 || NY%SY!=0 || NZ%SZ!=0) { if (rank==ROOT) fprintf(stderr,"%s: Subdomain sizes not an integer value.\n", argv[0]); MPI_Finalize(); return 1; } // Build a 2D cartessian communicator MPI_Comm Comm3d; int ndim=3; int dim[3]={SZ,SY,SX}; // domain decomposition subdomains int period[3]={false,false,false}; // periodic conditions int reorder={true}; // allow reorder if necesary int coord[3]; MPI_Cart_create(MPI_COMM_WORLD,ndim,dim,period,reorder,&Comm3d); MPI_Comm_rank(Comm3d,&rank); // rank wrt to Comm2d MPI_Cart_coords(Comm3d,rank,3,coord); // rank coordinates // Map the neighbours ranks MPI_Cart_shift(Comm3d,0,1,&nbrs[TOP],&nbrs[BOTTOM]); MPI_Cart_shift(Comm3d,1,1,&nbrs[NORTH],&nbrs[SOUTH]); MPI_Cart_shift(Comm3d,2,1,&nbrs[WEST],&nbrs[EAST]); // Manage Domain sizes domain = Manage_Domain(rank,size,coord,nbrs); // Allocate Memory Manage_Memory(0,domain,&h_u,&t_u,&t_un); // Root mode: Build Initial Condition if (domain.rank==ROOT) Call_IC(2,h_u); // Build MPI data types MPI_Datatype myGlobal; MPI_Datatype myLocal; MPI_Datatype xySlice; MPI_Datatype yzSlice; MPI_Datatype xzSlice; //Manage_DataTypes(0,domain,&xySlice,&yzSlice,&xzSlice,&myLocal,&myGlobal); // Build a MPI data type for a subarray in Root processor MPI_Datatype global; int nx = domain.nx; int ny = domain.ny; int nz = domain.nz; int bigsizes[3] = {NZ,NY,NX}; int subsizes[3] = {nz,ny,nx}; int starts[3] = {0,0,0}; MPI_Type_create_subarray(3, bigsizes, subsizes, starts, MPI_ORDER_C, MPI_CUSTOM_REAL, &global); MPI_Type_create_resized(global, 0, nx*sizeof(real), &myGlobal); // extend the type MPI_Type_commit(&myGlobal); // Build a MPI data type for a subarray in workers int bigsizes2[3] = {R+nz+R,R+ny+R,R+nx+R}; int subsizes2[3] = {nz,ny,nx}; int starts2[3] = {R,R,R}; MPI_Type_create_subarray(3, bigsizes2, subsizes2, starts2, MPI_ORDER_C, MPI_CUSTOM_REAL, &myLocal); MPI_Type_commit(&myLocal); // now we can use this MPI costum data type // halo data types MPI_Datatype yVector; MPI_Type_vector( ny, nx, nx+2*R, MPI_CUSTOM_REAL, &xySlice); MPI_Type_commit(&xySlice); MPI_Type_vector( ny, 1, nx+2*R, MPI_CUSTOM_REAL, &yVector); MPI_Type_create_hvector(nz, 1, (nx+2*R)*(ny+2*R)*sizeof(real), yVector, &yzSlice); MPI_Type_commit(&yzSlice); MPI_Type_vector( nz, nx, (nx+2*R)*(ny+2*R), MPI_CUSTOM_REAL, &xzSlice); MPI_Type_commit(&xzSlice); // build sendcounts and displacements in root processor int sendcounts[size], displs[size]; if (rank==ROOT) { for (i=0; i<size; i++) sendcounts[i]=1; int disp = 0; // displacement counter for (k=0; k<SZ; k++) { for (j=0; j<SY; j++) { for (i=0; i<SX; i++) { displs[i+SX*j+SX*SY*k]=disp; disp+=1; // x-displacements } disp += SX*(ny-1); // y-displacements } disp += SX*NY*(nz-1); // z-displacements } } // Scatter global array data and exchange halo regions MPI_Scatterv(h_u, sendcounts, displs, myGlobal, t_u, 1, myLocal, ROOT, Comm3d); Manage_Comms(domain,Comm3d,xySlice,yzSlice,xzSlice,t_u); MPI_Barrier(Comm3d); // ROOT mode: Record the starting time. 
if (rank==ROOT) wtime=MPI_Wtime(); // Asynchronous MPI Solver for (step = 0; step < NO_STEPS; step+=2) { // print iteration in ROOT mode if (rank==ROOT && step%10000==0) printf(" Step %d of %d\n",step,(int)NO_STEPS); // Exchange Boundaries and compute stencil Call_Laplace(domain,&t_u,&t_un);Manage_Comms(domain,Comm3d,xySlice,yzSlice,xzSlice,t_un);//1stIter Call_Laplace(domain,&t_un,&t_u);Manage_Comms(domain,Comm3d,xySlice,yzSlice,xzSlice,t_u );//2ndIter } // ROOT mode: Record the final time. if (rank==ROOT) { wtime = MPI_Wtime()-wtime; printf ("\n Wall clock elapsed = %f seconds\n\n", wtime ); } /* // CAREFUL: uncomment only for debugging. Print subroutine for (int p=0; p<size; p++) { if (rank == p) { printf("Local process on rank %d is:\n", rank); for (k=0; k<nz+2*R; k++) { printf("-- layer %d --\n",k); for (j=0; j<ny+2*R; j++) { putchar('|'); for (i=0; i<nx+2*R; i++) printf("%3.0f ",t_u[i+(nx+2*R)*j+(nx+2*R)*(ny+2*R)*k]); printf("|\n"); } printf("\n"); } } MPI_Barrier(Comm3d); }*/ // gather all pieces into the big data array MPI_Gatherv(t_u, 1, myLocal, h_u, sendcounts, displs, myGlobal, ROOT, Comm3d); // save results to file //if (rank==0) Print(h_u,NX,NY,NZ); if (rank==ROOT) Save_Results(h_u); // Free MPI types Manage_DataTypes(1,domain,&xySlice,&yzSlice,&xzSlice,&myLocal,&myGlobal); // Free Memory Manage_Memory(1,domain,&h_u,&t_u,&t_un); // finalize MPI MPI_Finalize(); // ROOT mode: Terminate. if (rank==ROOT) { printf ("HEAT_MPI:\n" ); printf (" Normal end of execution.\n\n" ); } return 0; }
int main(int argc, char **argv) { if (MPI_Init(&argc, &argv) != MPI_SUCCESS) { fprintf(stderr, "MPI initialization failed.\n"); return 1; } int rank, size; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); if (size < 2) { fprintf(stderr, "cant play this game alone.\n"); return 1; } struct { float _a[3]; int _34[2]; char _0; unsigned _6; int _b; unsigned _7; int _9; short _5; unsigned _8; double _c; float _12[2]; char _d; } recv[size]; memset(recv, 0, sizeof(recv)); MPI_Datatype tmp, recv_type, send_type; int recv_blocklengths[] = { 1, 2, 2, 1, 1, 1, 1, 1 }; MPI_Aint recv_displacements[] = { (char *)&recv->_0 - (char *)recv, (char *)recv->_12 - (char *)recv, (char *)recv->_34 - (char *)recv, (char *)&recv->_5 - (char *)recv, (char *)&recv->_6 - (char *)recv, (char *)&recv->_7 - (char *)recv, (char *)&recv->_8 - (char *)recv, (char *)&recv->_9 - (char *)recv }; MPI_Datatype recv_types[] = { MPI_CHAR, MPI_FLOAT, MPI_INT, MPI_SHORT, MPI_UNSIGNED, MPI_UNSIGNED, MPI_UNSIGNED, MPI_INT }; MPI_Type_create_struct(8, recv_blocklengths, recv_displacements, recv_types, &tmp); MPI_Type_create_resized(tmp, 0, (char *)(recv+1) - (char *)recv, &recv_type); MPI_Type_free(&tmp); MPI_Type_commit(&recv_type); struct { char _0; float _12[2]; int _3; float _a; int _4; short _5; char _b[5]; unsigned _678[3]; int _9; long _c; } send; send._0 = rank + 0; send._12[0] = rank + 1; send._12[1] = rank + 2; send._3 = rank + 3; send._4 = rank + 4; send._5 = rank + 5; send._678[0] = rank + 6; send._678[1] = rank + 7; send._678[2] = rank + 8; send._9 = rank + 9; int send_blocklengths[] = { 1, 2, 1, 1, 1, 3, 1 }; MPI_Aint send_displacements[] = { (char *)&send._0 - (char *)&send, (char *)send._12 - (char *)&send, (char *)&send._3 - (char *)&send, (char *)&send._4 - (char *)&send, (char *)&send._5 - (char *)&send, (char *)send._678 - (char *)&send, (char *)&send._9 - (char *)&send }; MPI_Datatype send_types[] = { MPI_CHAR, MPI_FLOAT, MPI_INT, MPI_INT, MPI_SHORT, MPI_UNSIGNED, MPI_INT }; MPI_Type_create_struct(7, send_blocklengths, send_displacements, send_types, &tmp); MPI_Type_create_resized(tmp, 0, sizeof(send), &send_type); MPI_Type_free(&tmp); MPI_Type_commit(&send_type); if (MPI_Allgather(&send, 1, send_type, recv, 1, recv_type, MPI_COMM_WORLD)) { fprintf(stderr, "MPI_Allgather failed.\n"); MPI_Abort(MPI_COMM_WORLD, 1); } for (int j = 0; j < size; j++) { MPI_Barrier(MPI_COMM_WORLD); if (j == rank) { fprintf(stderr, "[ %d ] received:", rank); for (int i = 0; i < size; i++) fprintf(stderr, " (%d %g %g %d %d %d %d %d %d %d)", recv[i]._0, recv[i]._12[0], recv[i]._12[1], recv[i]._34[0], recv[i]._34[1], recv[i]._5, recv[i]._6, recv[i]._7, recv[i]._8, recv[i]._9); fprintf(stderr, "\n"); } } MPI_Finalize(); return 0; }
int main(int argc, char **argv) { int rank, size; // My rank and total # of proc int row_rank, col_rank; // My row and column rank int coord[2]; // My coords in grid int dimension; // #of dimensions int dim[2], period[2], reorder; //variables for grid creation int local_N, local_M; // local sizes double *Ax, *Bx; // local matrices double *Sl, *Sr, *Su, *Sd; double hx, hy, hz; // variables double nev = 0.; int i_start, i_end, j_start, j_end; int iter = 0; // iteration # hx = hy = hz = 1; i_start = j_start = 0; Sr = Sl = Su = Sd = NULL; MPI_Comm cart_comm; // Grid comm MPI_Comm col_comm; // My column comm MPI_Comm row_comm; // My row comm MPI_Status status; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); // Forming a grid switch (size) { case 1: // 1x1 dim[0] = 1; dim[1] = 1; dimension = 2; break; case 2: // 1x2 dim[0] = 1; dim[1] = 2; dimension = 2; break; case 4: // 2x2 or 4x1 or 1x4 #ifdef SQUARE dim[0] = 2; dim[1] = 2; dimension = 2; #endif // SQUARE #ifndef SQUARE dim[0] = 1; dim[1] = 4; dimension = 2; #endif // ROW break; case 8: #ifdef SQUARE dim[0] = 2; dim[1] = 4; dimension = 2; #endif // SQUARE #ifndef SQUARE dim[0] = 1; dim[1] = 8; dimension = 2; #endif // ROW break; case 9: #ifdef SQUARE dim[0] = 3; dim[1] = 3; dimension = 2; #endif // SQUARE break; default: printf("Please run with 1, 2, 4 or 8 processes.\n"); fflush(stdout); MPI_Abort(MPI_COMM_WORLD, 1); break; } local_N = i_end = N / dim[0]; local_M = j_end = K / dim[1]; // No wrap around period[0] = 0; period[1] = 0; // Reordering ranks in grid comm reorder = 1; MPI_Cart_create(MPI_COMM_WORLD, dimension, dim, period, reorder, &cart_comm); // Get new rank and coords in new cartesian comm MPI_Comm_rank(cart_comm, &rank); MPI_Cart_coords(cart_comm, rank, dimension, coord); // Create comms for rows and columns int var_coord[2]; // Column comm var_coord[0] = 1; var_coord[1] = 0; MPI_Cart_sub(cart_comm, var_coord, &col_comm); MPI_Comm_rank(col_comm, &col_rank); //Row comm var_coord[0] = 0; var_coord[1] = 1; MPI_Cart_sub(cart_comm, var_coord, &row_comm); MPI_Comm_rank(row_comm, &row_rank); if (coord[0] == 0 || coord[0] == (dim[0] - 1)) local_N += 1; else local_N += 2; if (coord[1] == 0 || coord[1] == (dim[1] - 1)) local_M += 1; else local_M += 2; #ifdef DEBUG for (int i = 0; i < size; i++) { if (rank == i) printf("Rank = %d , Col_rank = %d, Row_rank = %d, coordinates are %d %d local N = %d local M = %d \n ", rank, col_rank, row_rank, coord[0], coord[1], local_N, local_M); fflush(stdout); } MPI_Barrier(cart_comm); #endif // DEBUG //Init local A and B Ax = new double[local_N*local_M*K]; Bx = new double[local_N*local_M*K]; for (int k = 0; k < K; k++) { for (int i = 0; i < local_N; i++) { for (int j = 0; j < local_M; j++) { Ax[local_N*local_M*k + local_M*i + j] = 1; Bx[local_N*local_M*k + local_M*i + j] = 1; } } } // Create types for shadows MPI_Datatype Type_H, Type_lr, Type_ud;//Type_right,Type_down,Type_up; MPI_Type_vector(local_N*K, 1, local_M, MPI_DOUBLE, &Type_H); MPI_Type_create_resized(Type_H, 0, sizeof(MPI_DOUBLE) * 2, &Type_lr); MPI_Type_commit(&Type_lr); MPI_Type_vector(K, local_M, local_M*local_N, MPI_DOUBLE, &Type_H); MPI_Type_create_resized(Type_H, 0, local_M*sizeof(MPI_DOUBLE) * 2, &Type_ud); MPI_Type_commit(&Type_ud); //Need to start iterations double fx, fy, fz; fx = fy = fz = 0.; int ijk = 0; double start, finish; start = MPI_Wtime(); #ifdef EASY while (iter < N_ITER) { for (int k = 1; k < K - 1; k++) { for (int i = 1; i < local_N - 1; i++) { for (int j = 1; j < 
local_M - 1; j++) { fx = (Ax[local_N*local_M*k + local_M*(i + 1) + j] + Ax[local_N*local_M*k + local_M*(i - 1) + j]) / (hx*hx); fy = (Ax[local_N*local_M*k + local_M*i + j + 1] + Ax[local_N*local_M*k + local_M*i + j - 1]) / (hy*hy); fz = (Ax[local_N*local_M*(k + 1) + local_M*i + j] + Ax[local_N*local_M*(k - 1) + local_M*i + j]) / (hz*hz); Bx[local_N*local_M*k + local_M*i + j] = (fx + fy + fz) / (2 / (hx*hx) + 2 / (hy*hy) + 2 / (hz*hz)); // Need to comp nev } } } for (int k = 1; k < K - 1; k++) { for (int i = 1; i < local_N - 1; i++) { for (int j = 1; j < local_M - 1; j++) { ijk = local_N*local_M*k + local_M*i + j; Ax[ijk] = Bx[ijk]; } } } // Sending and recv slice in row to row + 1 if (row_rank != dim[1] - 1) MPI_Sendrecv(&Bx[local_M - 2], 1, Type_lr, row_rank + 1, 0, &Ax[local_M - 1], 1, Type_lr, row_rank + 1, 0, row_comm, &status); // Sending and recv slice in row to row - 1 if (row_rank != 0) MPI_Sendrecv(&Bx[1], 1, Type_lr, row_rank - 1, 0, Ax, 1, Type_lr, row_rank - 1, 0, row_comm, &status); // Sending and recv slice in column to column + 1 if (col_rank != dim[0] - 1) MPI_Sendrecv(&Bx[(local_N - 2)*local_M], 1, Type_ud, col_rank + 1, 0, &Ax[(local_N - 1)*local_M], 1, Type_ud, col_rank + 1, 0, col_comm, &status); // Sending and recv slice in column to column - 1 if (col_rank != 0) MPI_Sendrecv(&Bx[local_M], 1, Type_ud, col_rank - 1, 0, Ax, 1, Type_ud, col_rank - 1, 0, col_comm, &status); iter++; } #endif #ifndef EASY while (iter < N_ITER) { // recv from i-1 and j-1 if (coord[0] != 0 || coord[1] != 0) { if (col_rank > 0) { #ifdef DEBUG printf("Rank %d coordinates are %d %d ------- Waiting slice from COLUMN - 1 \n ", rank, coord[0], coord[1]); fflush(stdout); #endif // DEBUG MPI_Recv(Ax, 1, Type_ud, col_rank - 1, 0, col_comm, &status); if (row_rank > 0) { #ifdef DEBUG printf("Rank %d coordinates are %d %d ------- Waiting slice from ROW - 1 \n ", rank, coord[0], coord[1]); fflush(stdout); #endif // DEBUG MPI_Recv(Ax, 1, Type_lr, row_rank - 1, 0, row_comm, &status); } } else { #ifdef DEBUG printf("Rank %d coordinates are %d %d ------- Waiting slice from ROW - 1 \n ", rank, coord[0], coord[1]); fflush(stdout); #endif // DEBUG MPI_Recv(Ax, 1, Type_lr, row_rank - 1, 0, row_comm, &status); } } #ifdef DEBUG printf("Rank %d coordinates are %d %d ------- COMPUTING \n ", rank, coord[0], coord[1]); fflush(stdout); #endif // DEBUG // computing for (int k = 1; k < K - 1; k++) { for (int i = 1; i < local_N - 1; i++) { for (int j = 1; j < local_M - 1; j++) { fx = (Ax[local_N*local_M*k + local_M*(i + 1) + j] + Ax[local_N*local_M*k + local_M*(i - 1) + j]) / (hx*hx); fy = (Ax[local_N*local_M*k + local_M*i + j + 1] + Ax[local_N*local_M*k + local_M*i + j - 1]) / (hy*hy); fz = (Ax[local_N*local_M*(k + 1) + local_M*i + j] + Ax[local_N*local_M*(k - 1) + local_M*i + j]) / (hz*hz); Ax[local_N*local_M*k + local_M*i + j] = (fx + fy + fz) / (2 / (hx*hx) + 2 / (hy*hy) + 2 / (hz*hz)); // Need to comp nev } } } // send to i-1 j-1 if (coord[0] != 0 || coord[1] != 0) { if (col_rank > 0) { #ifdef DEBUG printf("Rank %d coordinates are %d %d ------- Sending slice to COLUMN - 1 \n ", rank, coord[0], coord[1]); fflush(stdout); #endif // DEBUG MPI_Send(&Ax[local_M], 1, Type_ud, col_rank - 1, 0, col_comm); if (row_rank > 0) { #ifdef DEBUG printf("Rank %d coordinates are %d %d ------- Sending slice to ROW - 1 \n ", rank, coord[0], coord[1]); fflush(stdout); #endif // DEBUG MPI_Send(&Ax[1], 1, Type_lr, row_rank - 1, 0, row_comm); } } else { #ifdef DEBUG printf("Rank %d coordinates are %d %d ------- Sending slice to ROW - 1 \n ", 
rank, coord[0], coord[1]); fflush(stdout); #endif // DEBUG MPI_Send(&Ax[1], 1, Type_lr, row_rank - 1, 0, row_comm); } } //send and recv from i+1 j+1 if (coord[0] != dim[0] - 1 || coord[1] != dim[1] - 1) { // Send if (col_rank < dim[0] - 1) { #ifdef DEBUG printf("Rank %d coordinates are %d %d ------- Sending slice to COLUMN + 1 \n ", rank, coord[0], coord[1]); fflush(stdout); #endif // DEBUG MPI_Send(&Ax[(local_N - 2)*local_M], 1, Type_ud, col_rank + 1, 0, col_comm); if (row_rank < dim[1] - 1) { #ifdef DEBUG printf("Rank %d coordinates are %d %d ------- Sending slice to ROW + 1 \n ", rank, coord[0], coord[1]); fflush(stdout); #endif // DEBUG MPI_Send(&Ax[local_M - 2], 1, Type_lr, row_rank + 1, 0, row_comm); } } else { #ifdef DEBUG printf("Rank %d coordinates are %d %d ------- Sending slice to ROW + 1 \n ", rank, coord[0], coord[1]); fflush(stdout); #endif // DEBUG MPI_Send(&Ax[local_M - 2], 1, Type_lr, row_rank + 1, 0, row_comm); } // Recv if (col_rank < dim[0] - 1) { #ifdef DEBUG printf("Rank %d coordinates are %d %d ------- Waiting slice from COLUMN + 1 \n ", rank, coord[0], coord[1]); fflush(stdout); #endif // DEBUG MPI_Recv(&Ax[(local_N - 1)*local_M], 1, Type_ud, col_rank + 1, 0, col_comm, &status); if (row_rank < dim[1] - 1) { #ifdef DEBUG printf("Rank %d coordinates are %d %d ------- Waiting slice from ROW + 1 \n ", rank, coord[0], coord[1]); fflush(stdout); #endif // DEBUG MPI_Recv(&Ax[local_M - 1], 1, Type_lr, row_rank + 1, 0, row_comm, &status); } } else { #ifdef DEBUG printf("Rank %d coordinates are %d %d ------- Waiting slice from ROW + 1 \n ", rank, coord[0], coord[1]); fflush(stdout); #endif // DEBUG MPI_Recv(&Ax[local_M - 1], 1, Type_lr, row_rank + 1, 0, row_comm, &status); } } //if (rank == 0) //printf("Iter %d done \n ",iter); fflush(stdout); iter++; } #endif finish = MPI_Wtime(); double loc_comp_time = finish - start; double max_time; MPI_Allreduce(&loc_comp_time, &max_time, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); if (rank == 0) printf("NxMxK = %dx%dx%d \nGrid - %dx%d \nLongest time %g\n ",N,M,K,dim[0],dim[1], max_time); fflush(stdout); MPI_Finalize(); return 0; }
GrayScott::GrayScott(int N, double rmin, double rmax, double dt, double Du, double Dv, double F, double k, int nSteps, std::string pngname, world_info w, bool localtranspose, unsigned int nthreads) : N_(N) , Ntot_(N*N) // , L_(L) , dx_((double) (rmax-rmin) / (double) (N-1)) , dt_(dt)//(dx_*dx_ / (2.*std::max(Du,Dv))) , nSteps_(nSteps) , currStep_(0) , Du_(Du) , Dv_(Dv) , uCoeff(Du_*dt_/(2.*dx_*dx_)) , vCoeff(Dv_*dt_/(2.*dx_*dx_)) , F_(F) , k_(k) , matU1_(N, -Du*dt/(2.*dx_*dx_), 1.+Du*dt/(dx_*dx_), -Du*dt/(2.*dx_*dx_)) // , matU2_(N, -Du*dt/(2.*dx_*dx_), 1.+Du*dt/(dx_*dx_), -Du*dt/(2.*dx_*dx_)) // equal to matU1_, since we have a square grid (dx==dy) , matV1_(N, -Dv*dt/(2.*dx_*dx_), 1.+Dv*dt/(dx_*dx_), -Dv*dt/(2.*dx_*dx_)) // , matV2_(N, -Dv*dt/(2.*dx_*dx_), 1.+Dv*dt/(dx_*dx_), -Dv*dt/(2.*dx_*dx_)) , pngName_(pngname) , world(w) , rmin_(rmin) , rmax_(rmax) , localtranspose_(localtranspose) , nthreads_(nthreads) { if (world.rank == 0) { // create directory to save output to time_t rawtime; struct tm * timeinfo; char buffer[80]; time (&rawtime); timeinfo = localtime(&rawtime); strftime(buffer,80,"%d-%m-%Y_%H-%M-%S",timeinfo); std::string timeString(buffer); dirPath_ = "data/" + timeString + "/"; boost::filesystem::path dir(dirPath_); // boost::filesystem::create_directory(dir); } // global grid Nx_glo = N; Ny_glo = N; NN_glo = Nx_glo * Ny_glo; // local grid Nx_loc = Nx_glo / world.dims_x; Ny_loc = Ny_glo / world.dims_y; NN_loc = Nx_loc * Ny_loc; Nb_loc = Ny_loc/Nx_loc; // build process geometry with cartesian communicator int periods[2] = {false, false}; int dims[2] = {world.dims_x, world.dims_y}; MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, true, &cart_comm); MPI_Comm_rank(cart_comm, &world.cart_rank); MPI_Cart_shift(cart_comm, 0, 1, &world.top_proc, &world.bottom_proc); int coords[2]; MPI_Cart_coords(cart_comm, world.cart_rank, 2, coords); world.coord_x = coords[0]; world.coord_y = coords[1]; // datatypes // build contiguous (rows) vectors for boundaries. // each process has multiple rows in the grid MPI_Type_contiguous(Ny_loc, MPI_DOUBLE, &bottom_boundary); MPI_Type_commit(&bottom_boundary); MPI_Type_contiguous(Ny_loc, MPI_DOUBLE, &top_boundary); MPI_Type_commit(&top_boundary); // build datatypes for transpose MPI_Datatype block_send, block_col, block_recv; // send datatype int sizes[2] = {Nx_loc, Ny_loc}; // size of global array int subsizes[2] = {Nx_loc, Nx_loc}; // size of sub-region (square) int starts[2] = {0,0}; // where does the first subarray begin (which index) MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &block_send); MPI_Type_commit(&block_send); // resize -> make contiguous MPI_Type_create_resized(block_send, 0, Nx_loc*sizeof(double), &block_resized_send); MPI_Type_free(&block_send); MPI_Type_commit(&block_resized_send); // receive datatype MPI_Type_vector(Nx_loc, 1, Ny_loc, MPI_DOUBLE, &block_col); MPI_Type_commit(&block_col); MPI_Type_hvector(Nx_loc, 1, sizeof(double), block_col, &block_recv); MPI_Type_free(&block_col); MPI_Type_commit(&block_recv); // resize data structure, so that it is contigious (for alltoall) MPI_Type_create_resized(block_recv, 0, 1*sizeof(double), &block_resized_recv); MPI_Type_free(&block_recv); MPI_Type_commit(&block_resized_recv); // sub-domain boundaries xmin_loc = rmin + world.coord_x * Nx_loc * dx_; xmax_loc = xmin_loc + (Nx_loc - 1) * dx_; ymin_loc = rmin + world.coord_y * Ny_loc * dx_; ymax_loc = ymin_loc + (Ny_loc - 1) * dx_; initialize_fields(); MPI_Barrier(MPI_COMM_WORLD); }
void convolution(int my_id, int p){ int i, j, k; int chunkSize; double start, end; double time[14]; /* Input data */ float input_1[N][N], input_2[N][N]; /* Output data */ float output[N][N]; /* Set the chunk size for each processor */ chunkSize = N/p; /* Two arrays storing the local data distributed by rank 0 */ float local_data1[N][N], local_data2[N][N]; /* Local matrix for matrix multiplication */ float local_data3[chunkSize][N]; /* A complex array storing the temp row to operate FFT */ complex temp_data[N]; /* Initialization of the original Matrix and distribution of data */ if(my_id == 0){ printf("2D convolution using SPMD model and MPI Collective operations\n"); start = MPI_Wtime(); /*Read data from the files*/ readFile(input_1, input_2); time[0] = MPI_Wtime(); printf("Reading file takes %f s.\n", time[0] - start); } /* Scatter all the data to local data */ MPI_Scatter(input_1, chunkSize*N, MPI_FLOAT, local_data1, chunkSize*N, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Scatter(input_2, chunkSize*N, MPI_FLOAT, local_data2, chunkSize*N, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); /* Compute time for distributing data */ if(my_id == 0){ time[1] = MPI_Wtime(); printf("Scattering data of rows to each processor takes %f s.\n", time[1] - time[0]); } /* Row FFT */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each row for im1 */ temp_data[j].r = local_data1[i][j]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data1[i][j] = temp_data[j].r; for(j = 0; j < N; j++){ /* FFT each row for im2 */ temp_data[j].r = local_data2[i][j]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data2[i][j] = temp_data[j].r; } /* Gather all the data and distribute in columns */ if(my_id == 0){ time[2] = MPI_Wtime(); printf("FFT each row for input im1 and im2 takes %f s.\n", time[2] - time[1]); } MPI_Gather(local_data1, chunkSize*N, MPI_FLOAT, input_1, chunkSize*N, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Gather(local_data2, chunkSize*N, MPI_FLOAT, input_2, chunkSize*N, MPI_FLOAT, 0, MPI_COMM_WORLD); if(my_id == 0){ time[3] = MPI_Wtime(); printf("Gathering all the data from different rows takes %f s.\n", time[3] - time[2]); } /* Initialize a new vector for distributing columns */ MPI_Datatype column, col; /* Column vector */ MPI_Type_vector(N, 1, N, MPI_FLOAT, &col); MPI_Type_commit(&col); MPI_Type_create_resized(col, 0, 1*sizeof(float), &column); MPI_Type_commit(&column); /* Scatter all the data to column local data */ MPI_Scatter(input_1, chunkSize, column, local_data1, chunkSize, column, 0, MPI_COMM_WORLD); MPI_Scatter(input_2, chunkSize, column, local_data2, chunkSize, column, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); if(my_id == 0){ time[4] = MPI_Wtime(); printf("Scattering data of columns to each processor takes %f s.\n", time[4] - time[3]); } /* Column FFT */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each column for im1 */ temp_data[j].r = local_data1[j][i]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data1[j][i] = temp_data[j].r; for(j = 0; j < N; j++){ /* FFT each column for im2 */ temp_data[j].r = local_data2[j][i]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data2[j][i] = temp_data[j].r; } /* Gather all the columns from each rank */ if(my_id == 0){ time[5] = MPI_Wtime(); printf("FFT each column for input im1 and im2 takes %f s.\n", time[5] - time[4]); } MPI_Gather(local_data1, chunkSize, column, input_1, chunkSize, column, 0, 
MPI_COMM_WORLD); MPI_Gather(local_data2, chunkSize, column, input_2, chunkSize, column, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); /* Compute time and distribute data to do matrix multiplication */ if(my_id == 0){ time[6] = MPI_Wtime(); printf("Gathering all the data from different columns takes %f s.\n", time[6] - time[5]); } MPI_Scatter(input_1, chunkSize*N, MPI_FLOAT, local_data1, chunkSize*N, MPI_FLOAT, 0, MPI_COMM_WORLD); /* Broadcast data2 to all the ranks */ MPI_Bcast(input_2, N*N, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); if(my_id == 0){ time[7] = MPI_Wtime(); printf("Scattering data for multiplication takes %f s.\n", time[7] - time[6]); } /* Matrix multiplication */ for(i = 0; i < chunkSize; i++) for(j = 0; j < N; j++) for(k = 0; k < N; k++) local_data3[i][j] += local_data1[i][k]*input_2[k][j]; /* Collect multiplication results from each rank */ if(my_id == 0){ time[8] = MPI_Wtime(); printf("Matrix multiplication takes %f s.\n", time[8] - time[7]); } /* Inverse-2DFFT(row) for the output file */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each row for im1 */ temp_data[j].r = local_data3[i][j]; temp_data[j].i = 0; } c_fft1d(temp_data, N, 1); for(j = 0; j < N; j++) local_data3[i][j] = temp_data[j].r; } if(my_id == 0){ time[9] = MPI_Wtime(); printf("Inverse-2DFFT for out_1(row) takes %f s.\n", time[9] - time[8]); } MPI_Gather(local_data3, chunkSize*N, MPI_FLOAT, output, chunkSize*N, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); if(my_id == 0){ time[10] = MPI_Wtime(); printf("Gathering all the data of Inverse-2DFFT for out_1(row) takes %f s.\n", time[10] - time[9]); } MPI_Scatter(output, chunkSize, column, local_data1, chunkSize, column, 0, MPI_COMM_WORLD); if(my_id == 0){ time[11] = MPI_Wtime(); printf("Scattering out_1(column) to each processor takes %f s.\n", time[11] - time[10]); } /* Inverse-2DFFT(column) for the output file */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each column for im1 */ temp_data[j].r = local_data1[j][i]; temp_data[j].i = 0; } c_fft1d(temp_data, N, 1); for(j = 0; j < N; j++) local_data1[j][i] = temp_data[j].r; } /* Gathering all the columns of the output file from each rank */ if(my_id == 0){ time[12] = MPI_Wtime(); printf("Inverse-2DFFT out_1(column) takes %f s.\n", time[12] - time[11]); } MPI_Gather(local_data1, chunkSize, column, output, chunkSize, column, 0, MPI_COMM_WORLD); if(my_id == 0){ time[13] = MPI_Wtime(); printf("Gathering all the data of the output file(column) takes %f s.\n", time[13] - time[12]); writeFile(output); end = MPI_Wtime(); printf("Writing the output file to file takes %f s.\n", end - time[13]); printf("Total communication time of 2D convolution using MPI_Scatter&MPI_Gather takes %f s.\n", time[13] - time[12] + time[11] - time[10] + time[7] - time[5] + time[4] - time[2] + time[1] - time[0]); printf("Total computing time of 2D convolution using MPI_Scatter&MPI_Gather takes %f s.\n", time[12] - time[11] + time[10] - time[7] + time[5] - time[4] + time[2] - time[1]); printf("Total running time without loading/writing of 2D convolution using MPI_Scatter&MPI_Gather takes %f s.\n", time[13] - time[0]); printf("Total running time of 2D convolution using MPI_Scatter&MPI_Gather takes %f s.\n", end - start); } /* Free vector column */ MPI_Type_free(&column); MPI_Type_free(&col); }
int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int p, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int a[ROWS_A*COLS_A];
    int B[ROWS_B*COLS_B];
    int C[ROWS_C*COLS_C];

    const int BLOCKROWS = ROWS_C;    /* number of rows in _block_ */
    const int BLOCKCOLS = COLS_C/p;  /* number of cols in _block_ */
    /* This can leave some of the computation undone. There are two ways to
       approach it: have one process compute more, or recompute with one
       process fewer so that the split comes out even. */
    const int NPROWS = 1;  /* number of rows in _decomposition_;
                              with our approach, each process returns one row */
    const int NPCOLS = p;  /* number of cols in _decomposition_;
                              with our approach, each process returns as many
                              columns as there are processes */

    if (rank == 0) {
        /* Fill the input matrices */
        for (int ii=0; ii<ROWS_A*COLS_A; ii++) { a[ii] = ii; }
        for (int ii=0; ii<ROWS_B*COLS_B; ii++) { B[ii] = ii; }
    }

    /* Check that the matrix multiplication dimensions are compatible */
    if (COLS_A != ROWS_B) {
        fprintf(stderr,"Error: number of array dimension %d != %d ", COLS_A, ROWS_B);
        MPI_Finalize();
        exit(-1);
    }

    /* Check that the number of processes matches the data to be processed */
    if (p != NPROWS*NPCOLS) {
        fprintf(stderr,"Error: number of PEs %d != %d x %d\n", p, NPROWS, NPCOLS);
        MPI_Finalize();
        exit(-1);
    }

    /* For each process, where the received data are stored */
    int r[BLOCKROWS*BLOCKCOLS];  // 3 x ( 3 / p )
    for (int ii=0; ii<BLOCKROWS*BLOCKCOLS; ii++)
        r[ii] = 0;  // initialize the buffer used by each process

    MPI_Datatype blocktype;
    MPI_Datatype blocktype2;
    // first vector, for A
    MPI_Type_vector(COLS_A,     // number of elements, matching the row size
                    ROWS_A/p,   // size of each one
                    COLS_A,     // offset, displacement to the next
                    MPI_INT,
                    &blocktype2);
    MPI_Type_create_resized(    // no explanation for now, Daniel will cover it:
                                // the resize makes successive blocks start one int apart
        blocktype2, 0, sizeof(int), &blocktype);
    MPI_Type_commit(&blocktype);

    int disps[NPROWS*NPCOLS];   // laid out as a single row
    int counts[NPROWS*NPCOLS];
    for (int ii=0; ii<NPROWS; ii++) {
        for (int jj=0; jj<NPCOLS; jj++) {
            // same as in the slides, all blocks the same size
            disps[ii*NPCOLS+jj] = ii*COLS_A*BLOCKROWS + jj*BLOCKCOLS;
            // this can get more involved if boundary conditions apply to some processes
            counts[ii*NPCOLS+jj] = 1;
        }
    }

    MPI_Scatterv(a,          // source matrix
                 counts,     // number of elements for each process
                 disps,      // displacements
                 blocktype,
                 r,          // where the received data are accumulated
                 BLOCKROWS*BLOCKCOLS,  // size of the receive buffer
                 MPI_INT,
                 0,          // root process
                 MPI_COMM_WORLD);

    /* Each proc prints its block out, in order. This is done so the output is
       ordered: each process prints only when its turn comes; in unordered
       procedures it is not needed. */
    for (int proc=0; proc<p; proc++) {
        if (proc == rank) {
            printf("Rank = %d\n", rank);
            if (rank == 0) {
                printf("Global matrix: \n");
                for (int ii=0; ii<ROWS_A; ii++) {
                    for (int jj=0; jj<COLS_A; jj++) {
                        printf("%3d ", (int)a[ii*COLS_A+jj]);
                    }
                    printf("\n");
                }
            }
            printf("Local Matrix:\n");
            for (int ii=0; ii<BLOCKROWS; ii++) {
                for (int jj=0; jj<BLOCKCOLS; jj++) {
                    printf("%3d ", (int)r[ii*BLOCKCOLS+jj]);
                }
                printf("\n");
            }
            printf("\n");
        }
        MPI_Barrier(MPI_COMM_WORLD);  // all processes must reach this point before continuing
    }

    if (rank == 0) {
        memset(a, 0, ROWS_A*COLS_A*sizeof(int));  // clear the original matrix to 0
        printf("Global matrix again: \n");
        for (int ii=0; ii<ROWS_A; ii++) {
            for (int jj=0; jj<COLS_A; jj++) {
                printf("%3d ", (int)a[ii*COLS_A+jj]);
            }
            printf("\n");
        }
    }

    MPI_Gatherv(r, BLOCKROWS*BLOCKCOLS, MPI_INT,
                a, counts, disps, blocktype, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("Global matrix again: \n");
        for (int ii=0; ii<ROWS_A; ii++) {
            for (int jj=0; jj<COLS_A; jj++) {
                printf("%3d ", (int)a[ii*COLS_A+jj]);
            }
            printf("\n");
        }
    }

    MPI_Finalize();
    return 0;
}
int main(int argc, char *argv[]) { //initialize int i, j, k, l, mpi_rank, mpi_size, mpi_rowsize, mpi_colsize, subN, sqrtP; int row_rank, col_rank, row, col, destR, destC, src, srcR, srcC; // declare variables to store time of parallelism double execTime, execStart, execEnd; MPI_Comm rowComm, colComm; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); if (argc < 2) { printf("error: need one argument for filename"); exit(-1); } // declare variables of type vector to read matrices vector * mat1; vector * mat2; float * vector1; float * vector2; //create data on root // read matrices if (mpi_rank == ROOT) { mat1 = readfile(argv[1], N, N); mat2 = readfile(argv[2], N, N); vector1 = mat1->data; vector2 = mat2->data; } // find the sub matrix dimention sqrtP = (int) sqrt(mpi_size); subN = N/sqrtP; //allocate memory for N/4xN rows, and NxN/4 columns float * row_mat, *col_mat, *row_mat_rec, *col_mat_rec, *col_matT; double *can_res, *can_out; //allocate memory for the buffers row_mat = allocateFloatMatrix(subN, subN); row_mat_rec = allocateFloatMatrix(subN, subN); col_mat = allocateFloatMatrix(subN, subN); col_matT = allocateFloatMatrix(subN, subN); col_mat_rec = allocateFloatMatrix(subN, subN); can_res = allocateDoubleMatrix(subN, subN); can_out = allocateDoubleMatrix(N, N); //create and commit datatypes MPI_Datatype arrtype, resized_arrtype, arrtypeD, resized_arrtypeD; int sizes[2] = { N,N }; int subsizes[2] = { subN,subN }; int starts[2] = { 0,0 }; MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_FLOAT, &arrtype); MPI_Type_create_resized(arrtype, 0, subN * sizeof(float), &resized_arrtype); MPI_Type_commit(&resized_arrtype); MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &arrtypeD); MPI_Type_create_resized(arrtypeD, 0, subN * sizeof(double), &resized_arrtypeD); MPI_Type_commit(&resized_arrtypeD); //calculate send counts and displacements int * counts, * displs; counts = (int *) malloc(mpi_size * sizeof(int)); displs = (int *) malloc(mpi_size * sizeof(int)); for(i = 0; i < mpi_size; i++) { counts[i] = 1; displs[i] = N*(i/sqrtP) + (i%sqrtP); } //start timer, compute dot product and record execution time execStart = MPI_Wtime(); //scatterv subarrays MPI_Scatterv(vector1, counts, displs, resized_arrtype, row_mat, subN*subN, MPI_FLOAT, ROOT, MPI_COMM_WORLD); MPI_Scatterv(vector2, counts, displs, resized_arrtype, col_mat, subN*subN, MPI_FLOAT, ROOT, MPI_COMM_WORLD); //get row comm and rank row = mpi_rank/sqrtP; MPI_Comm_split(MPI_COMM_WORLD,row,mpi_rank,&rowComm); MPI_Comm_rank(rowComm,&row_rank); MPI_Comm_size(rowComm, &mpi_rowsize); //get col comm and rank col = mpi_rank%sqrtP; MPI_Comm_split(MPI_COMM_WORLD,col,mpi_rank,&colComm); MPI_Comm_rank(colComm,&col_rank); MPI_Comm_size(colComm, &mpi_colsize); //MPI_Barrier(MPI_COMM_WORLD); // find the source and destination in row communicator - to left shift rows by row number destR = row_rank-row; if (destR < 0) { destR = destR + mpi_rowsize; } srcR = row_rank+row; if (srcR > (mpi_rowsize-1)) { srcR = srcR - mpi_rowsize; } // find the source and destination in column communicator - to north shift columns by column number destC = col_rank - col; if (destC < 0) { destC = destC + mpi_colsize; } srcC = col_rank+col; if (srcC > (mpi_colsize-1)) { srcC = srcC - mpi_colsize; } // left shift rows by row number MPI_Sendrecv(row_mat, subN*subN, MPI_FLOAT, destR, 0, row_mat_rec, subN*subN, MPI_FLOAT, srcR, MPI_ANY_TAG, rowComm, MPI_STATUS_IGNORE); //north shift 
columns by column number MPI_Sendrecv(col_mat, subN*subN, MPI_FLOAT, destC, 1, col_mat_rec, subN*subN, MPI_FLOAT, srcC, MPI_ANY_TAG, colComm, MPI_STATUS_IGNORE); for (l=0; l<sqrtP; l++) { memcpy(row_mat, row_mat_rec, sizeof(float)*subN*subN); memcpy(col_mat, col_mat_rec, sizeof(float)*subN*subN); // Finding the transpose of matrix B matrixTranspose(subN, col_mat, col_matT); //perform a partial matrix-vector multiplication on each process matrixMultiplyT(subN, row_mat, col_matT, can_res); // find the source and destination in row communicator - to left shift all rows once if (row_rank != 0) { destR = row_rank - 1; } else { destR = mpi_rowsize - 1; } srcR = row_rank + 1; if (srcR == mpi_rowsize) { srcR = 0; } // find the source and destination in column communicator - to north shift all columns once if (col_rank != 0) { destC = col_rank - 1; } else { destC = mpi_colsize - 1; } srcC = col_rank + 1; if (srcC == mpi_colsize) { srcC = 0; } //left shift all rows once MPI_Sendrecv(row_mat, subN*subN, MPI_FLOAT, destR, 2, row_mat_rec, subN*subN, MPI_FLOAT, srcR, MPI_ANY_TAG, rowComm, MPI_STATUS_IGNORE); //north shift all columns once MPI_Sendrecv(col_mat, subN*subN, MPI_FLOAT, destC, 3, col_mat_rec, subN*subN, MPI_FLOAT, srcC, MPI_ANY_TAG, colComm, MPI_STATUS_IGNORE); } // gather the matrix multiplication results from all procs MPI_Gatherv(can_res, subN*subN, MPI_DOUBLE, can_out, counts, displs, resized_arrtypeD, ROOT, MPI_COMM_WORLD); //stop timer execEnd = MPI_Wtime(); execTime = execEnd - execStart; //free datatypes MPI_Type_free(&resized_arrtype); MPI_Type_free(&resized_arrtypeD); if (mpi_rank == ROOT) { printf("Execution time for dot product: %f seconds\n", execTime); printf("Result: %f, %f, %f \n ", can_out[0], can_out[2047*N + 2047], can_out[4095*N + 4095]); free(vector1); free(vector2); } free(row_mat); free(col_mat); free(row_mat_rec); free(col_mat_rec); free(col_matT); free(can_res); free(can_out); //shut down MPI MPI_Finalize(); return 0; }
void step4(inst i, int r, int s) { inst instance = i; int rank = r; int size = s; // Creation of the 2D torus we will then use MPI_Comm comm; int dim[2] = {instance.p, instance.q}; int period[2] = {1, 1}; int reorder = 0; int coord[2]; MPI_Cart_create(MPI_COMM_WORLD, 2, dim, period, reorder, &comm); MPI_Cart_coords(comm, rank, 2, coord); grid global_grid; char type = 0; MPI_File input_file; // We start by reading the header of the file MPI_File_open(comm, instance.input_path, MPI_MODE_RDONLY, MPI_INFO_NULL, &input_file); MPI_File_read_all(input_file, &type, 1, MPI_CHAR, MPI_STATUS_IGNORE); if(type == 1) { if (rank == 0) fprintf(stderr, "Error: type 1 files are not supported in step 4\n"); MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); exit(EXIT_FAILURE); } // we needed to swap the next 2 lines MPI_File_read_all(input_file, &(global_grid.n), 1, MPI_UINT64_T, MPI_STATUS_IGNORE); MPI_File_read_all(input_file, &(global_grid.m), 1, MPI_UINT64_T, MPI_STATUS_IGNORE); #ifdef DEBUG if(rank == 0) printf("n, m = %zu %zu\n", global_grid.n, global_grid.m); #endif if(!(global_grid.n % instance.p == 0 && global_grid.m % instance.q == 0)) { if(rank == 0) fprintf(stderr, "Error: please choose the grid parameters so they divide the grid of the cellular automaton. For example %zu %zu, but you need to move from %d procs to %zu\n", instance.p + (global_grid.n % instance.p), instance.q + (global_grid.m % instance.q), size, (instance.p + (global_grid.n % instance.p))*(instance.q + (global_grid.m % instance.q))); MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); exit(EXIT_FAILURE); } size_t local_nrows = global_grid.n/instance.p; size_t local_ncols = global_grid.m/instance.q; // Now we create the data structures. int blocks[2] = {1, 2}; MPI_Datatype types[2] = {MPI_BYTE, MPI_DOUBLE}; MPI_Aint a_size = sizeof(cell2); MPI_Aint a_disp[3] = {offsetof(cell2, type), offsetof(cell2, u), offsetof(cell2, s)}; MPI_Aint p_size = 17; MPI_Aint p_disp[3] = {0, 1, 9}; MPI_Datatype p_tmp, a_tmp, p_cell, a_cell; // Aligned struct, memory representation MPI_Type_create_struct(2, blocks, a_disp, types, &a_tmp); MPI_Type_create_resized(a_tmp, 0, a_size, &a_cell); MPI_Type_commit(&a_cell); // Packed struct, file-based representation MPI_Type_create_struct(2, blocks, p_disp, types, &p_tmp); MPI_Type_create_resized(p_tmp, 0, p_size, &p_cell); MPI_Type_commit(&p_cell); // Now, we create our matrix MPI_Datatype matrix; int sizes[2] = {global_grid.n, global_grid.m}; int subsizes[2] = {local_nrows, local_ncols}; int starts[2] = {0, 0}; MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, p_cell, &matrix); MPI_Type_commit(&matrix); // We extend this matrix MPI_Datatype ematrix; int e_subsizes[2] = {2 + subsizes[0], 2 + subsizes[1]}; int e_start[2] = {1, 1}; MPI_Type_create_subarray(2, e_subsizes, subsizes, e_start, MPI_ORDER_C, a_cell, &ematrix); MPI_Type_commit(&ematrix); // The next 3 types are for the export of the grid MPI_Datatype d_type; MPI_Type_create_resized(MPI_DOUBLE, 0, sizeof(cell2), &d_type); MPI_Type_commit(&d_type); MPI_Datatype d_matrix; MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &d_matrix); MPI_Type_commit(&d_matrix); MPI_Datatype d_rmatrix; // to go from the extended matrix with ghost zones to the other one MPI_Type_create_subarray(2, e_subsizes, subsizes, e_start, MPI_ORDER_C, d_type, &d_rmatrix); MPI_Type_commit(&d_rmatrix); // Set file view for each element MPI_Offset grid_start; MPI_File_get_position(input_file, &grid_start); MPI_File_set_view(input_file, grid_start + 
global_grid.m*local_nrows*p_size*coord[0] + local_ncols*p_size*coord[1], p_cell, matrix, "native", MPI_INFO_NULL); // allocate the cell array we will use cell2 **cells; cells = malloc(2*sizeof(cell2 *)); double *sensors; cells[1] = calloc((2+local_nrows)*(2+local_ncols),sizeof(cell2)); cells[0] = calloc((2+local_nrows)*(2+local_ncols),sizeof(cell2)); sensors = calloc(local_nrows*local_ncols, sizeof(double)); MPI_File_read_all(input_file, cells[0], 1, ematrix, MPI_STATUS_IGNORE); MPI_File_close(&input_file); #ifdef DEBUG for(size_t i = 1; i < 1+local_nrows; i++) for(size_t j = 1; j < 1+local_ncols; j++) fprintf(stderr, "%d - %d %f\n", rank, cells[0][i*(2+local_ncols)+j].type, cells[0][i*(2+local_ncols)+j].u); #endif MPI_Datatype l_row; // local row MPI_Type_contiguous(local_ncols, d_type, &l_row); MPI_Type_commit(&l_row); MPI_Datatype l_col; // local column. A bit trickier, we need a type_vector. MPI_Type_vector(local_nrows, 1, local_ncols+2, d_type, &l_col); MPI_Type_commit(&l_col); int top, bot, left, right; double sqspeed = 0; int curr = 0, next = 0; char *alldump = malloc(256); for(int s = 0; s < instance.iteration; s++) { // We will update cell[next], and use the data of cell[curr] curr = s % 2; next = (s+1) % 2; // We copy the edges of the grid. // We first need the ranks of the neighbours MPI_Cart_shift(comm, 0, 1, &top, &bot); MPI_Cart_shift(comm, 1, 1, &left, &right); // Then we need to update the edges of our local grid // Update top and bottom rows MPI_Sendrecv(&(cells[curr][1*(local_ncols+2)+1].u), 1, l_row, top, 0, &(cells[curr][(local_ncols+2)*(local_nrows+1)+1].u), 1, l_row, bot, 0, comm, MPI_STATUS_IGNORE); MPI_Sendrecv(&(cells[curr][(local_ncols+2)*(local_nrows)+1].u), 1, l_row, bot, 0, &(cells[curr][1].u), 1, l_row, top, 0, comm, MPI_STATUS_IGNORE); // Update left and right MPI_Sendrecv(&(cells[curr][1*(local_ncols+2)+1].u), 1, l_col, left, 0, &(cells[curr][1*(local_ncols+2)+local_ncols+1].u), 1, l_col, right, 0, comm, MPI_STATUS_IGNORE); MPI_Sendrecv(&(cells[curr][1*(local_ncols+2)+local_ncols].u), 1, l_col, right, 0, &(cells[curr][1*(local_ncols+2)].u), 1, l_col, left, 0, comm, MPI_STATUS_IGNORE); // We compute the update of the grid for(size_t i = 1; i < 1+local_nrows; i++) { for(size_t j = 1; j < 1+local_ncols; j++) { if(instance.step < 2 || cells[next][j+i*(2+local_ncols)].type != 1) { // If walls we do not do anything sqspeed = cells[0][j+i*(2+local_ncols)].s * cells[0][j+i*(2+local_ncols)].s; cells[next][j+i*(2+local_ncols)].u = cells[curr][j+i*(2+local_ncols)].u + (cells[curr][j+i*(2+local_ncols)].v * instance.dt); cells[next][j+i*(2+local_ncols)].v = cells[curr][j+i*(2+local_ncols)].v + sqspeed * (cells[curr][j+(i+1)*(2+local_ncols)].u + cells[curr][j+(i-1)*(2+local_ncols)].u + cells[curr][(j+1) + i*(2+local_ncols)].u + cells[curr][(j-1) + i*(2+local_ncols)].u - (4 * cells[curr][j+i*(2+local_ncols)].u)) * instance.dt; if(instance.step == 3 && cells[next][j+i*(2+local_ncols)].type == 2) { // Case of sensors sensors[(j-1)+(i-1)*local_ncols] += cells[next][j+i*(2+local_ncols)].u * cells[next][j+i*(2+local_ncols)].u; } } } } if(instance.alldump != NULL && s % instance.frequency == 0) { MPI_File dump_file; sprintf(alldump, instance.alldump, (s / instance.frequency)); MPI_File_open(comm, alldump, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &dump_file); MPI_File_set_view(dump_file, global_grid.m*local_nrows*sizeof(double)*coord[0] + local_ncols*sizeof(double)*coord[1], MPI_DOUBLE, d_matrix, "native", MPI_INFO_NULL); MPI_File_write_all(dump_file, 
&(cells[curr][0].u), 1, d_rmatrix, MPI_STATUS_IGNORE); MPI_File_close(&dump_file); } } if(instance.lastdump != NULL) { // how should we do this? maybe it works if we resize the type MPI_File last_file; MPI_File_open(comm, instance.lastdump, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &last_file); MPI_File_set_view(last_file, global_grid.m*local_nrows*sizeof(double)*coord[0] + local_ncols*sizeof(double)*coord[1], MPI_DOUBLE, d_matrix, "native", MPI_INFO_NULL); // note: is there one grid_start offset too many here, and should the etype be d_type or MPI_DOUBLE? MPI_File_write_all(last_file, &(cells[next][0].u), 1, d_rmatrix, MPI_STATUS_IGNORE); MPI_File_close(&last_file); } if(instance.step == 3 && instance.sensors != NULL) { MPI_File sensor_file; MPI_File_open(comm, instance.sensors, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &sensor_file); MPI_Datatype string; MPI_Type_contiguous(1024, MPI_CHAR, &string); MPI_Type_commit(&string); char text[1024]; for(size_t i = 1; i < 1+local_nrows; i++) { for(size_t j = 1; j < 1+local_ncols; j++) { if(instance.step == 3 && cells[next][j+i*(2+local_ncols)].type == 2) { memset(text,0,sizeof(text)); sprintf(text, "%zu %zu %f\r\n", (i-1)+coord[0]*local_nrows, (j-1)+coord[1]*local_ncols, sensors[(j-1)+(i-1)*local_ncols]); MPI_File_write(sensor_file, text, 1, string, MPI_STATUS_IGNORE); } } } MPI_Type_free(&string); MPI_File_close(&sensor_file); } // Some cleaning free(cells[0]); free(cells[1]); free(cells); free(sensors); free(alldump); MPI_Type_free(&a_cell); MPI_Type_free(&p_cell); MPI_Type_free(&matrix); MPI_Type_free(&ematrix); MPI_Type_free(&d_type); MPI_Type_free(&d_matrix); MPI_Type_free(&d_rmatrix); MPI_Type_free(&l_row); MPI_Type_free(&l_col); }
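/* Not part of step4 above: a minimal sketch of its central datatype idea, pairing a packed file-layout type with an aligned memory-layout type for the same logical record (as done for cell2 with p_cell and a_cell). The record here, one byte flag followed by two doubles, is an assumption for illustration; 17 bytes matches 1 + 2*8. */
#include <mpi.h>
#include <stddef.h>

typedef struct { char type; double u; double v; } rec;

static void make_record_types(MPI_Datatype *file_t, MPI_Datatype *mem_t)
{
    int          blocks[2] = {1, 2};
    MPI_Datatype types[2]  = {MPI_BYTE, MPI_DOUBLE};
    MPI_Datatype tmp;

    /* packed representation: 1 byte immediately followed by 2 doubles */
    MPI_Aint p_disp[2] = {0, 1};
    MPI_Type_create_struct(2, blocks, p_disp, types, &tmp);
    MPI_Type_create_resized(tmp, 0, 17, file_t);      /* 1 + 2*8 bytes per record */
    MPI_Type_commit(file_t);
    MPI_Type_free(&tmp);

    /* aligned representation: displacements taken from the C struct, extent
       forced to sizeof(rec) so arrays of rec can be read or written directly */
    MPI_Aint a_disp[2] = {offsetof(rec, type), offsetof(rec, u)};
    MPI_Type_create_struct(2, blocks, a_disp, types, &tmp);
    MPI_Type_create_resized(tmp, 0, sizeof(rec), mem_t);
    MPI_Type_commit(mem_t);
    MPI_Type_free(&tmp);
}
/* The packed type then serves as the etype of the MPI_File_set_view (with a
   subarray of packed records as the filetype), while the aligned type
   describes the in-memory buffer passed to MPI_File_read_all/write_all. */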
int main(int argc, char* argv[]){ MPI_Init(NULL, NULL); int rank, size; int loop, num_alive, loop_iterations; int ldboard, ldnbngb, ldglobalboard; double t1, time, final_time; int periods[2] = {1, 1}; int *globboard= NULL; int *globboard2= NULL; int *board; int *nbngb; /* Initialization of MPI */ MPI_Comm_rank( MPI_COMM_WORLD, &rank ); MPI_Comm_size( MPI_COMM_WORLD, &size); if(argc >= 2){ if(!strcmp("-h",argv[1])){ if(!rank) helper(); MPI_Finalize(); return EXIT_SUCCESS; } } int i, j; int process_per_row = sqrt(size); int process_per_column = sqrt(size); int dims[2] = {process_per_row, process_per_column}; // It only works if the number of process in the input is a perfect square if(size != process_per_column*process_per_row){ fprintf(stderr, "Square Perfect needed as input size.\nExiting Program."); MPI_Finalize(); return EXIT_FAILURE; } MPI_Comm grid; // Initialize cartesian grid MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods,0, &grid); MPI_Comm_rank(grid, &rank); /* User input */ if (argc < 2) { loop_iterations = 10; BS = 30; } else if (argc >= 2){ loop_iterations = atoi(argv[1]); if(argc > 2) BS = atoi(argv[2]); else BS = 30; } num_alive = 0; /*Leading dimension of global board array*/ ldglobalboard = BS + 2; // +2 because of upper and above added (+ X +) /* Leading dimension of board array */ ldboard = BS/process_per_row + 2; // +2 because of upper and above added (+ X +) /* Leading dimension of neigbour array */ ldnbngb = BS/sqrt(size); // Same number of element in each process which is equal to this formula // Initialization of cells board board = (int *)malloc( ldboard * ldboard * sizeof(int) ); nbngb = (int *)malloc( ldnbngb * ldnbngb * sizeof(int) ); // Initialization of global cell board (which is common between all processes) if(!rank){ globboard = (int *)malloc(ldglobalboard*ldglobalboard * sizeof(int)); globboard2 = (int *)malloc(ldglobalboard*ldglobalboard * sizeof(int)); num_alive = generate_initial_board( BS, &globboard[1+ldglobalboard] , ldglobalboard ); output_board( BS, &globboard[1+ldglobalboard], ldglobalboard, 0 ); fprintf(stderr, "Starting number of living cells = %d\n", num_alive); } // Matrix block type used by each processes MPI_Datatype block2, block; MPI_Type_vector(ldboard-2, ldboard-2, ldglobalboard, MPI_INT, &block2); MPI_Type_create_resized(block2, 0, sizeof(int), &block); MPI_Type_commit(&block); // Matrix sub block type used by each processes MPI_Datatype sub_block2, sub_block; MPI_Type_vector(ldboard-2, ldboard-2, ldboard, MPI_INT, &sub_block2); MPI_Type_create_resized(sub_block2, 0, sizeof(int), &sub_block); MPI_Type_commit(&sub_block); int *process_count = (int*)malloc(size*sizeof(int)); // number of cells per processes int *cell_per_processes = (int*)malloc(size*sizeof(int)); // Prototyping moves for each processes (preparing matrix's scatter) for (i = 0; i < process_per_row; ++i){ for (j = 0; j < process_per_column; ++j){ process_count[i+j*process_per_column]= 1; cell_per_processes[i+j*process_per_column]= i*ldglobalboard*(ldboard-2)+j*(ldboard-2); } } /* Explodes matrix into sub_blocks elements */ MPI_Scatterv(&globboard[1+ldglobalboard], process_count, cell_per_processes, block, &board[ldboard+1], 1, sub_block,0, grid); // Initialize for each processes, a table of the neighbours. int neighbours[8]; neighbour_table(neighbours, grid, rank); /* Time to begin */ t1 = mytimer(); int blocksize = ldboard-2; MPI_Datatype row_blocks; MPI_Type_vector(blocksize, 1, ldboard, MPI_INT, &row_blocks); MPI_Type_commit(&row_blocks); // status for waiting time... 
MPI_Status mpi_status; // Create as many MPI requests as there can be neighbours (8 in the worst case) MPI_Request cart_request[8]; for (loop = 1; loop <= loop_iterations; ++loop) { /* Start communications to send and receive information from the neighbours */ inter_proc_communications(cart_request, neighbours, grid, blocksize, board, ldboard, row_blocks); /* Compute inside process cells */ for (j = 2; j <= blocksize-1; ++j) { for (i = 2; i <= blocksize-1; ++i) { ngb( i, j ) = cell( i-1, j-1 ) + cell( i, j-1 ) + cell( i+1, j-1 ) + cell( i-1, j ) + cell( i+1, j ) + cell( i-1, j+1 ) + cell( i, j+1 ) + cell( i+1, j+1 ); } } /* Computes cells on the border */ // Cell neighbour's composition // 4 2 5 4 4 2 5 4 2 5 4 2 5 // // 0 X 1 --> 0 --> 0 --> 0 1 --> 0 1 // // 6 3 7 6 6 6 7 6 3 7 // /* Column on the left needs data from the left process --> 4, 0, 6*/ MPI_Wait(&cart_request[0], &mpi_status); MPI_Wait(&cart_request[4], &mpi_status); MPI_Wait(&cart_request[6], &mpi_status); process_frontier(1, blocksize, board, COLUMN, ldboard, nbngb, ldnbngb); /* Line above needs data from the above process --> 2, 5 */ MPI_Wait(&cart_request[2], &mpi_status); MPI_Wait(&cart_request[5], &mpi_status); process_frontier(1, blocksize, board, ROW, ldboard, nbngb, ldnbngb); /* Column on the right needs data from the right process --> 1, 7 */ MPI_Wait(&cart_request[1], &mpi_status); MPI_Wait(&cart_request[7], &mpi_status); process_frontier(blocksize, blocksize, board, COLUMN, ldboard, nbngb, ldnbngb); /* Line under needs data from under process --> 3 */ MPI_Wait(&cart_request[3], &mpi_status); process_frontier(blocksize, blocksize, board, ROW, ldboard, nbngb, ldnbngb); /* Update the cell */ num_alive = 0; for (j = 1; j <= blocksize; ++j) { for (i = 1; i <= blocksize; ++i) { if ( (ngb( i, j ) < 2) || (ngb( i, j ) > 3) ) { cell(i, j) = 0; } else { if ((ngb( i, j )) == 3) cell(i, j) = 1; } if (cell(i, j) == 1) { num_alive+=1; } } } printf("%d \n", num_alive); } /* Reassemble the global matrix from the sub-blocks of each process */ MPI_Gatherv(&board[ldboard+1], 1, sub_block, &globboard2[1+ldglobalboard], process_count, cell_per_processes, block, 0, grid); /* Reduction to determine the maximum execution time */ time = mytimer() - t1; MPI_Allreduce(&time, &final_time, 1,MPI_DOUBLE, MPI_MAX, grid); /* Reduction to determine number of cells still alive in all processes */ MPI_Allreduce(MPI_IN_PLACE, &num_alive, 1, MPI_INT, MPI_SUM, grid); /* The END */ if(!rank){ // How many cells are left standing at the end of the party? printf("Final number of living cells = %d\n", num_alive); printf("time=%.2lf ms\n",(double)final_time * 1.e3); char str [100]; // create debug file sprintf(str, "mpi_debug_%d.dat", size); FILE *fd = NULL; fd=fopen(str, "w"); // JUST TELL ME IF IT WORKS !! if (fd != NULL) fprintf(fd,"%.2lf", final_time*1.e3); else exit(EXIT_FAILURE); fclose(fd); output_board( BS, &globboard2[1+ldglobalboard], ldglobalboard, loop_iterations); } // FREE ALL free(process_count); free(cell_per_processes); free(board); free(nbngb); free(globboard); free(globboard2); MPI_Type_free(&block); MPI_Type_free(&sub_block); MPI_Type_free(&row_blocks); MPI_Finalize(); // The final end return EXIT_SUCCESS; }
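/* Not part of the program above: neighbour_table() is not shown, so here is one plausible implementation, assuming the index layout of the comment (4 2 5 / 0 X 1 / 6 3 7) and that dimension 0 of the Cartesian grid is the row. Because the grid was created with periods {1,1}, MPI_Cart_rank wraps out-of-range coordinates around. */
#include <mpi.h>

void neighbour_table(int neighbours[8], MPI_Comm grid, int rank)
{
    int coord[2];
    MPI_Cart_coords(grid, rank, 2, coord);

    /* row/column offsets indexed by the position codes used in the diagram */
    const int off[8][2] = {
        { 0, -1},  /* 0: left        */
        { 0,  1},  /* 1: right       */
        {-1,  0},  /* 2: above       */
        { 1,  0},  /* 3: below       */
        {-1, -1},  /* 4: above-left  */
        {-1,  1},  /* 5: above-right */
        { 1, -1},  /* 6: below-left  */
        { 1,  1}   /* 7: below-right */
    };
    for (int k = 0; k < 8; k++) {
        int c[2] = { coord[0] + off[k][0], coord[1] + off[k][1] };
        MPI_Cart_rank(grid, c, &neighbours[k]);  /* periodic wrap-around */
    }
}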
int main(int argc, char* argv[]) { int i, j, loop, num_alive, maxloop; int ldgboard,ldboard, ldnbngb; double t1, t2; double temps; int *gboard = NULL; int *board; int *nbngb; int size; int coord[2], id; int procs_per_lines_col; MPI_Init(NULL,NULL); MPI_Comm_size(MPI_COMM_WORLD, &size); procs_per_lines_col = sqrt(size); if(procs_per_lines_col * procs_per_lines_col != size) { fprintf(stderr, "Please run with a perfect-square number of processes!\n"); MPI_Finalize(); exit(EXIT_FAILURE); } int dims[2]; dims[0] = procs_per_lines_col; dims[1] = procs_per_lines_col; int periods[2]; periods[0] = 1; periods[1] = 1; MPI_Comm comm_cart; MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm_cart); MPI_Comm_rank(comm_cart, &id); MPI_Cart_coords(comm_cart, id, 2, coord); if (argc < 3) { printf("Usage: %s nb_iterations size\n", argv[0]); MPI_Finalize(); return EXIT_SUCCESS; } else { maxloop = atoi(argv[1]); BS = atoi(argv[2]); //printf("Running sequential version, grid of size %d, %d iterations\n", BS, maxloop); } num_alive = 0; //Generate the neighbours table /* Leading dimension of the global board array */ ldgboard = BS + 2; /* Leading dimension of the board array */ ldboard = BS/procs_per_lines_col + 2; /* Leading dimension of the neighbour counters array */ ldnbngb = BS/procs_per_lines_col; board = malloc( ldboard * ldboard * sizeof(int) ); nbngb = malloc( ldnbngb * ldnbngb * sizeof(int) ); if(id == 0) { gboard = malloc(ldgboard * ldgboard * sizeof(int)); num_alive = generate_initial_board( BS, &gboard[1+ldgboard], ldgboard ); //fprintf(stderr,"Starting number of living cells = %d\n", num_alive); } MPI_Datatype block; MPI_Type_vector(ldboard-2, ldboard-2, ldgboard, MPI_INT, &block); MPI_Type_create_resized(block, 0, sizeof(int), &block); MPI_Type_commit(&block); MPI_Datatype subblock; MPI_Type_vector(ldboard-2, ldboard-2, ldboard, MPI_INT, &subblock); MPI_Type_create_resized(subblock, 0, sizeof(int), &subblock); MPI_Type_commit(&subblock); int * counts = (int*) malloc(size*sizeof(int)); int * displs = (int*) malloc(size*sizeof(int)); // Define the counts and displacements for each process for (int i = 0; i < procs_per_lines_col; ++i) { for (int j = 0; j < procs_per_lines_col; ++j) { counts[i+j*procs_per_lines_col]= 1; displs[i+j*procs_per_lines_col]= i*ldgboard*(ldboard-2)+j*(ldboard-2); } } MPI_Scatterv(&gboard[1+ldgboard], counts, displs, block, &board[ldboard+1], 1, subblock,0, comm_cart); int neighbours[8]; make_neighbours_table(neighbours, comm_cart); MPI_Request req[8]; int block_size = ldboard - 2; MPI_Datatype block_line; MPI_Type_vector(block_size+2, 1, ldboard,MPI_INT, &block_line); MPI_Type_commit(&block_line); t1 = mytimer(); for (loop = 1; loop <= maxloop; loop++) { make_communications(req, comm_cart, neighbours, block_size, board, ldboard, block_line); /* cell( 0, 0 ) = cell(BS, BS); cell( 0, BS+1) = cell(BS, 1); cell(BS+1, 0 ) = cell( 1, BS); cell(BS+1, BS+1) = cell( 1, 1); for (i = 1; i <= BS; i++) { cell( i, 0) = cell( i, BS); cell( i, BS+1) = cell( i, 1); cell( 0, i) = cell(BS, i); cell(BS+1, i) = cell( 1, i); } */ //Inner cells for (j = 2; j <= block_size; j++) { for (i = 2; i <= block_size; i++) { ngb( i, j ) = cell( i-1, j-1 ) + cell( i, j-1 ) + cell( i+1, j-1 ) + cell( i-1, j ) + cell( i+1, j ) + cell( i-1, j+1 ) + cell( i, j+1 ) + cell( i+1, j+1 ); } } //On LEFT MPI_Wait(&req[0], MPI_STATUS_IGNORE); MPI_Wait(&req[4], MPI_STATUS_IGNORE); MPI_Wait(&req[6], MPI_STATUS_IGNORE); // compute the left border column for(j = 1; j <= block_size; j++) { ngb( 1, j ) = cell( 0, j-1 ) + cell( 1, j-1 ) + cell( 2, j-1 ) + cell( 0, j ) + cell( 2, j ) + cell( 0, j+1 ) + cell( 1, j+1 ) + cell( 2, j+1 ); }
//On TOP MPI_Wait(&req[1], MPI_STATUS_IGNORE); MPI_Wait(&req[5], MPI_STATUS_IGNORE); // compute the top border row for(i = 1; i <= block_size; i++) { ngb( i, 1 ) = cell( i - 1, 0) + cell( i, 0 ) + cell( i + 1, 0 ) + cell( i - 1, 1) + cell( i + 1, 1 ) + cell( i - 1, 2) + cell( i, 2 ) + cell( i + 1, 2 ); } //On RIGHT MPI_Wait(&req[2], MPI_STATUS_IGNORE); MPI_Wait(&req[7], MPI_STATUS_IGNORE); // compute the right border column for(j = 1; j <= block_size; j++) { ngb( block_size, j ) = cell( block_size - 1, j-1 ) + cell( block_size , j-1 ) + cell( block_size + 1, j-1 ) + cell( block_size - 1, j ) + cell( block_size + 1, j ) + cell( block_size - 1, j+1 ) + cell( block_size, j+1 ) + cell( block_size + 1, j+1 ); } //ON BOT MPI_Wait(&req[3], MPI_STATUS_IGNORE); // compute the bottom border row for(i = 1; i <= block_size; i++) { ngb( i, block_size ) = cell( i - 1, block_size - 1) + cell( i, block_size - 1 ) + cell( i + 1, block_size - 1 ) + cell( i - 1, block_size ) + cell( i + 1, block_size ) + cell( i - 1, block_size + 1 ) + cell( i, block_size + 1 ) + cell( i + 1, block_size + 1 ); } num_alive = 0; for (j = 1; j <= block_size; j++) { for (i = 1; i <= block_size; i++) { if ( (ngb( i, j ) < 2) || (ngb( i, j ) > 3) ) { cell(i, j) = 0; } else { if ((ngb( i, j )) == 3) cell(i, j) = 1; } if (cell(i, j) == 1) { num_alive ++; } } } /* With the border cells included (useful to check the MPI communications) */ /* output_board( BS+2, &(cell(0, 0)), ldboard, loop ); */ /* With only the "real" cells: start at element (1,1) */ //output_board( BS, &(cell(1, 1)), ldboard, loop); //printf("%d cells are alive\n", num_alive); } MPI_Gatherv(&board[ldboard+1], 1, subblock,&gboard[ldgboard+1], counts,displs, block, 0, comm_cart); t2 = mytimer(); temps = t2 - t1; MPI_Allreduce(MPI_IN_PLACE,&temps, 1, MPI_DOUBLE, MPI_MAX, comm_cart); MPI_Allreduce(MPI_IN_PLACE,&num_alive, 1, MPI_INT, MPI_SUM, comm_cart); if(id == 0) { //printf("Final number of living cells = %d\n", num_alive); printf("%.2lf\n",(double)temps * 1.e3); } free(counts); free(displs); free(gboard); free(board); free(nbngb); MPI_Type_free(&block); MPI_Type_free(&subblock); MPI_Type_free(&block_line); MPI_Finalize(); return EXIT_SUCCESS; }
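/* Not part of the program above: MPI_Type_create_resized(block, 0, sizeof(int), &block) reuses one handle for both the input and the output type, so the intermediate vector type can no longer be freed. A sketch of the same construction with a temporary handle, using the layout of the program (ldgboard x ldgboard global board, ldboard x ldboard local board with ghost cells). */
#include <mpi.h>

static void make_board_types(int ldgboard, int ldboard,
                             MPI_Datatype *block, MPI_Datatype *subblock)
{
    MPI_Datatype tmp;
    int bs = ldboard - 2;                 /* interior block size */

    /* bs x bs block of ints inside the global board, extent shrunk to one int */
    MPI_Type_vector(bs, bs, ldgboard, MPI_INT, &tmp);
    MPI_Type_create_resized(tmp, 0, sizeof(int), block);
    MPI_Type_commit(block);
    MPI_Type_free(&tmp);                  /* safe: *block keeps its own reference */

    /* same block inside the local board with ghost cells */
    MPI_Type_vector(bs, bs, ldboard, MPI_INT, &tmp);
    MPI_Type_create_resized(tmp, 0, sizeof(int), subblock);
    MPI_Type_commit(subblock);
    MPI_Type_free(&tmp);
}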
int main(int argc, char ** argv) { int ncid, dimid, varid; MPI_Init(&argc, &argv); MPI_Datatype vtype, rtype, usertype; MPI_Aint lb, extent; int userbufsz, *userbuf, *cmpbuf, i, errs=0; int count = 25; double pi = 3.14159; MPI_Offset start, acount; ncmpi_create(MPI_COMM_WORLD, "vectors.nc", NC_CLOBBER, MPI_INFO_NULL, &ncid); ncmpi_def_dim(ncid, "50k", 1024*50, &dimid); ncmpi_def_var(ncid, "vector", NC_DOUBLE, 1, &dimid, &varid); ncmpi_enddef(ncid); MPI_Type_vector(VECCOUNT, BLOCKLEN, STRIDE, MPI_INT, &vtype); MPI_Type_create_resized(vtype, 0, STRIDE*VECCOUNT*sizeof(int), &rtype); MPI_Type_contiguous(count, rtype, &usertype); MPI_Type_commit(&usertype); MPI_Type_free(&vtype); MPI_Type_free(&rtype); MPI_Type_get_extent(usertype, &lb, &extent); userbufsz = extent; userbuf = malloc(userbufsz); cmpbuf = calloc(userbufsz, 1); for (i=0; i< userbufsz/sizeof(int); i++) { userbuf[i] = pi*i; } start = 10; acount = count*12; ncmpi_begin_indep_data(ncid); ncmpi_put_vara(ncid, varid, &start, &acount, userbuf, 1, usertype); ncmpi_close(ncid); NC_CHECK(ncmpi_open(MPI_COMM_WORLD, "vectors.nc", NC_NOWRITE, MPI_INFO_NULL, &ncid)); ncmpi_begin_indep_data(ncid); NC_CHECK(ncmpi_inq_varid(ncid, "vector", &varid)); NC_CHECK(ncmpi_get_vara(ncid, varid, &start, &acount, cmpbuf, 1, usertype)); ncmpi_close(ncid); for (i=0; errs < 10 && i < acount; i++) { /* vector of 4,3,5, so skip 4th and 5th items of every block */ if (i%STRIDE >= BLOCKLEN) continue; if (userbuf[i] != cmpbuf[i]) { errs++; fprintf(stderr, "%d: expected 0x%x got 0x%x\n", i, userbuf[i], cmpbuf[i]); } } free(userbuf); free(cmpbuf); MPI_Type_free(&usertype); MPI_Finalize(); return 0; }
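/* Not part of the test above: a small probe of the size vs. extent of the resized vector type it builds, assuming VECCOUNT=4, BLOCKLEN=3, STRIDE=5 as suggested by the "vector of 4,3,5" comment. The size counts only the significant ints; the resized extent covers the full strided footprint so that consecutive repetitions in the contiguous type tile correctly. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    enum { VECCOUNT = 4, BLOCKLEN = 3, STRIDE = 5 };
    MPI_Datatype vtype, rtype;
    MPI_Aint lb, extent;
    int size;

    MPI_Init(&argc, &argv);
    MPI_Type_vector(VECCOUNT, BLOCKLEN, STRIDE, MPI_INT, &vtype);
    MPI_Type_create_resized(vtype, 0, STRIDE * VECCOUNT * sizeof(int), &rtype);
    MPI_Type_commit(&rtype);

    MPI_Type_size(rtype, &size);               /* 4*3*sizeof(int) = 48 bytes */
    MPI_Type_get_extent(rtype, &lb, &extent);  /* 5*4*sizeof(int) = 80 bytes */
    printf("size = %d bytes, extent = %ld bytes\n", size, (long)extent);

    MPI_Type_free(&vtype);
    MPI_Type_free(&rtype);
    MPI_Finalize();
    return 0;
}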
int main(int argc, char **argv) { int i, j, k; double start, end; /* Time array */ double time[9]; double comm_time = 0; double comp_time = 0; int chunkSize; MPI_Status status; /* Being used in FFT */ float data[N][N]; /* Being used in mm */ float input_1[N][N], input_2[N][N]; /* Local matrix for FFT */ float local_data[N][N]; /* World rank and processor, related to MPI_COMM_WORLD */ int world_id; int world_processor; /* Divided rank and processors for communication, related to taskcomm */ int task_id; int task_processor; /* A complex array storing the temp row to operate FFT */ complex temp_data[N]; /* Initialize rank and the number of processor for the MPI */ MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &world_id); MPI_Comm_size(MPI_COMM_WORLD, &world_processor); /* Initialize a new vector for distributing columns */ MPI_Datatype column, col; /* Column vector */ MPI_Type_vector(N, 1, N, MPI_FLOAT, &col); MPI_Type_commit(&col); MPI_Type_create_resized(col, 0, 1*sizeof(float), &column); MPI_Type_commit(&column); int task = world_id%4; MPI_Comm taskcomm; /* Split the MPI_COMM_WORLD */ MPI_Comm_split(MPI_COMM_WORLD, task, world_id, &taskcomm); MPI_Comm_rank(taskcomm, &task_id); MPI_Comm_size(taskcomm, &task_processor); /* Initialize inter communicators */ MPI_Comm t1_t3_comm, t2_t3_comm, t3_t4_comm; /* Calculate chunkSize */ chunkSize = N/task_processor; /* Get the start time of all program */ if(world_id == 0){ printf("2D convolution using MPI task and data parallelism\n"); start = MPI_Wtime(); } /* Each group completes work and send results by inter communicators */ if(task == 0){ // task 1 /* Create an inter communicator for task 1 and task 3 */ MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 2, 1, &t1_t3_comm); if(task_id == 0){ time[0] = MPI_Wtime(); /* Read file */ readIm1File(data); time[1] = MPI_Wtime(); printf("Group 1: Reading file 1_im1 takes %f s.\n", time[1] - time[0]); } /* Scatter data to local ranks */ MPI_Scatter(data, chunkSize*N, MPI_FLOAT, local_data, chunkSize*N, MPI_FLOAT, 0, taskcomm); /* Compute time for distributing data */ if(task_id == 0){ time[2] = MPI_Wtime(); printf("Group 1: Scattering 1_im1(row) to each processor takes %f s.\n", time[2] - time[1]); } /* Do 1_im1 2d FFT */ /* Row FFT */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each row for im1 */ temp_data[j].r = local_data[i][j]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data[i][j] = temp_data[j].r; } /* Gather all the data and distribute in columns */ if(task_id == 0){ time[3] = MPI_Wtime(); printf("Group 1: FFT each row for 1_im1 takes %f s.\n", time[3] - time[2]); } /* Gather all the data of 1_im1 */ MPI_Gather(local_data, chunkSize*N, MPI_FLOAT, data, chunkSize*N, MPI_FLOAT, 0, taskcomm); if(task_id == 0){ time[4] = MPI_Wtime(); printf("Group 1: Gathering all the data of 1_im1(row) takes %f s.\n", time[4] - time[3]); } /* Scatter all the data to column local data */ MPI_Scatter(data, chunkSize, column, local_data, chunkSize, column, 0, taskcomm); if(task_id == 0){ time[5] = MPI_Wtime(); printf("Group 1: Scattering 1_im1(column) to each processor takes %f s.\n", time[5] - time[4]); } /* Column FFT */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each column for im1 */ temp_data[j].r = local_data[j][i]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data[j][i] = temp_data[j].r; } /* Gather all the columns from each rank */ if(task_id == 0){ time[6] = MPI_Wtime(); printf("Group 1: FFT each 
column for 1_im1 takes %f s.\n", time[6] - time[5]); } MPI_Gather(local_data, chunkSize, column, data, chunkSize, column, 0, taskcomm); /* Compute time and distribute data to do matrix multiplication */ if(task_id == 0){ time[7] = MPI_Wtime(); printf("Group 1: Gathering all the data of 1_im1(column) takes %f s.\n", time[7] - time[6]); /* Total time */ printf("Group 1: Total time for task 1 in group 1 takes %f s.\n", time[7] - time[0]); comm_time += time[7] - time[6] + time[5] - time[3] + time[2] - time[1]; comp_time += time[6] - time[5] + time[3] - time[2]; /* Send data to group 3 via the inter communicator */ MPI_Send(data, N*N, MPI_FLOAT, task_id, 13, t1_t3_comm); } } else if(task == 1){ // Task 2 /* Create an inter communicator for task 2 and task 3 */ MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 2, 2, &t2_t3_comm); if(task_id == 0){ time[0] = MPI_Wtime(); /* Read file */ readIm2File(data); time[1] = MPI_Wtime(); printf("Group 2: Reading file 1_im2 takes %f s.\n", time[1] - time[0]); } /* Scatter data to local ranks */ MPI_Scatter(data, chunkSize*N, MPI_FLOAT, local_data, chunkSize*N, MPI_FLOAT, 0, taskcomm); /* Compute time for distributing data */ if(task_id == 0){ time[2] = MPI_Wtime(); printf("Group 2: Scatter 1_im2(row) to each processor takes %f s.\n", time[2] - time[1]); } /* Do 1_im1 2d FFT */ /* Row FFT */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each row for im1 */ temp_data[j].r = local_data[i][j]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data[i][j] = temp_data[j].r; } /* Gather all the data and distribute in columns */ if(task_id == 0){ time[3] = MPI_Wtime(); printf("Group 2: FFT each row for 1_im2 takes %f s.\n", time[3] - time[2]); } /* Gather all the data of 1_im1 */ MPI_Gather(local_data, chunkSize*N, MPI_FLOAT, data, chunkSize*N, MPI_FLOAT, 0, taskcomm); if(task_id == 0){ time[4] = MPI_Wtime(); printf("Group 2: Gather all the data of 1_im2(row) takes %f s.\n", time[4] - time[3]); } /* Scatter all the data to column local data */ MPI_Scatter(data, chunkSize, column, local_data, chunkSize, column, 0, taskcomm); if(task_id == 0){ time[5] = MPI_Wtime(); printf("Group 2: Scatter 1_im2(column) to each processor takes %f s.\n", time[5] - time[4]); } /* Column FFT */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each column for im1 */ temp_data[j].r = local_data[j][i]; temp_data[j].i = 0; } c_fft1d(temp_data, N, -1); for(j = 0; j < N; j++) local_data[j][i] = temp_data[j].r; } /* Gather all the columns from each rank */ if(task_id == 0){ time[6] = MPI_Wtime(); printf("Group 2: FFT each column for 1_im2 takes %f s.\n", time[6] - time[5]); } MPI_Gather(local_data, chunkSize, column, data, chunkSize, column, 0, taskcomm); /* Compute time and distribute data to do matrix multiplication */ if(task_id == 0){ time[7] = MPI_Wtime(); printf("Group 2: Gather all the data of 1_im2(column) takes %f s.\n", time[7] - time[6]); /* Total time */ printf("Group 2: Total time for task 2 in group 2 takes %f s.\n", time[7] - time[0]); comm_time += time[7] - time[6] + time[5] - time[3] + time[2] - time[1]; comp_time += time[6] - time[5] + time[3] - time[2]; /* Send data to group 3 via the inter communicator */ MPI_Send(data, N*N, MPI_FLOAT, task_id, 23, t2_t3_comm); } } else if(task == 2){ // Task 3 /* Local matrix for matrix multiplication */ float local_data2[chunkSize][N]; /* Create inter communicators for task 1 and task3, task 2 and task 3, task 3 and task 4 */ MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 0, 1, 
&t1_t3_comm); MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 1, 2, &t2_t3_comm); MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 3, 3, &t3_t4_comm); /* Receive data from group 1 and group 2 */ if(task_id == 0){ time[0] = MPI_Wtime(); MPI_Recv(input_1, N*N, MPI_FLOAT, task_id, 13, t1_t3_comm, &status); MPI_Recv(input_2, N*N, MPI_FLOAT, task_id, 23, t2_t3_comm, &status); time[1] = MPI_Wtime(); /* Time of receiving data from group 1 and group 2 */ printf("Group 3: Receiving data from group 1 and group 2 takes %f s.\n", time[1] - time[0]); } /* Do matrix multiplication */ MPI_Scatter(input_1, chunkSize*N, MPI_FLOAT, local_data, chunkSize*N, MPI_FLOAT, 0, taskcomm); /* Broadcast data2 to all the ranks */ MPI_Bcast(input_2, N*N, MPI_FLOAT, 0, taskcomm); if(task_id == 0){ time[2] = MPI_Wtime(); printf("Group 3: Scattering data for multiplication takes %f s.\n", time[2] - time[1]); } /* Matrix multiplication */ for(i = 0; i < chunkSize; i++) for(j = 0; j < N; j++){ local_data2[i][j] = 0; for(k = 0; k < N; k++) local_data2[i][j] += local_data[i][k]*input_2[k][j]; } /* Collect multiplication result from each rank */ if(task_id == 0){ time[3] = MPI_Wtime(); printf("Group 3: Matrix multiplication takes %f s.\n", time[3] - time[2]); } /* Gather data */ MPI_Gather(local_data2, chunkSize*N, MPI_FLOAT, data, chunkSize*N, MPI_FLOAT, 0, taskcomm); if(task_id == 0){ time[4] = MPI_Wtime(); printf("Group 3: Gathering data after Matrix multiplication takes %f s.\n", time[4] - time[3]); /* total time */ printf("Group 3: Total time for task 3 in group 3 takes %f s.\n", time[4] - time[0]); /* send result of matrix multiplication to group 4 */ MPI_Send(data, N*N, MPI_FLOAT, task_id, 34, t3_t4_comm); } comm_time += time[4] - time[3] + time[2] - time[0]; comp_time += time[3] - time[2]; MPI_Comm_free(&t1_t3_comm); MPI_Comm_free(&t2_t3_comm); } else{ // Task 4 /* Create an inter communicator for task 3 and task 4 */ MPI_Intercomm_create(taskcomm, 0, MPI_COMM_WORLD, 2, 3, &t3_t4_comm); /* Receive data from group 3 */ if(task_id == 0){ time[0] = MPI_Wtime(); MPI_Recv(data, N*N, MPI_FLOAT, task_id, 34, t3_t4_comm, &status); time[1] = MPI_Wtime(); printf("Group 4: Receiving data from group 3 takes %f s.\n", time[1] - time[0]); } /* Scatter data to each processor */ MPI_Scatter(data, chunkSize*N, MPI_FLOAT, local_data, chunkSize*N, MPI_FLOAT, 0, taskcomm); if(task_id == 0){ time[2] = MPI_Wtime(); printf("Group 4: Scattering data of rows to each processor takes %f s.\n", time[2] - time[1]); } /* Inverse-2DFFT(row) */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each row for im1 */ temp_data[j].r = local_data[i][j]; temp_data[j].i = 0; } c_fft1d(temp_data, N, 1); for(j = 0; j < N; j++) local_data[i][j] = temp_data[j].r; } if(task_id == 0){ time[3] = MPI_Wtime(); printf("Group 4: Inverse-2DFFT(row) takes %f s.\n", time[3] - time[2]); } /* Gather all the data */ MPI_Gather(local_data, chunkSize*N, MPI_FLOAT, data, chunkSize*N, MPI_FLOAT, 0, taskcomm); if(task_id == 0){ time[4] = MPI_Wtime(); printf("Group 4: Gathering data of Inverse-2DFFT(row) takes %f s.\n", time[4] - time[3]); } MPI_Scatter(data, chunkSize, column, local_data, chunkSize, column, 0, taskcomm); if(task_id == 0){ time[5] = MPI_Wtime(); printf("Group 4: Scattering data of columns to each processor takes %f s.\n", time[5] - time[4]); } /* Inverse-2DFFT(column) for output file */ for(i = 0; i < chunkSize; i++){ for(j = 0; j < N; j++){ /* FFT each column for im1 */ temp_data[j].r = local_data[j][i]; temp_data[j].i = 0; } c_fft1d(temp_data, N, 
1); for(j = 0; j < N; j++) local_data[j][i] = temp_data[j].r; } if(task_id == 0){ time[6] = MPI_Wtime(); printf("Group 4: Inverse-2DFFT(column) takes %f s.\n", time[6] - time[5]); } /* Gather all the columns of output file from each rank */ MPI_Gather(local_data, chunkSize, column, data, chunkSize, column, 0, taskcomm); if(task_id == 0){ time[7] = MPI_Wtime(); printf("Group 4: Gathering data of Inverse-2DFFT(column) takes %f s.\n", time[7] - time[6]); writeFile(data); time[8] = MPI_Wtime(); printf("Group 4: Writing file to out_1 takes %f s.\n", time[8] - time[7]); comm_time += time[7] - time[6] + time[5] - time[3] + time[2] - time[0]; comp_time += time[6] - time[5] + time[3] - time[2]; } MPI_Comm_free(&t3_t4_comm); } MPI_Barrier(MPI_COMM_WORLD); if(world_id == 0){ end = MPI_Wtime(); printf("Total communication time of 2D convolution using MPI task parallel takes %f s.\n", comm_time); printf("Total computing time of 2D convolution using MPI task parallel takes %f s.\n", comp_time); printf("Total running time without loading/writing of 2D convolution using MPI task parallel takes %f s.\n", comm_time + comp_time); printf("Total running time of 2D convolution using MPI task parallel takes %f s.\n", end - start); } /* Free vector and task comm */ MPI_Type_free(&column); MPI_Type_free(&col); MPI_Comm_free(&taskcomm); MPI_Finalize(); return 0; }
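/* Not part of the program above: the column-scatter pattern it relies on, in isolation. One column of a row-major N x N float matrix is one float every N floats; resizing the column vector's extent to sizeof(float) makes "the next column" start one float later, so MPI_Scatter can deal out chunk consecutive columns per rank. N = 8 is only for this demo, and N is assumed to be divisible by the number of ranks. */
#include <mpi.h>
#include <stdio.h>

#define N 8

int main(int argc, char **argv)
{
    int rank, nprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    int chunk = N / nprocs;                    /* columns handed to each rank */
    MPI_Datatype col, column;
    MPI_Type_vector(N, 1, N, MPI_FLOAT, &col); /* one column, stride N floats */
    MPI_Type_create_resized(col, 0, sizeof(float), &column);
    MPI_Type_commit(&column);

    float data[N][N], local[N][N];
    if (rank == 0)
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                data[i][j] = (float)j;         /* the value identifies the column */

    /* each rank receives chunk columns, stored as columns 0..chunk-1 of local */
    MPI_Scatter(data, chunk, column, local, chunk, column, 0, MPI_COMM_WORLD);

    printf("rank %d received columns starting at %g\n", rank, (double)local[0][0]);

    MPI_Type_free(&col);
    MPI_Type_free(&column);
    MPI_Finalize();
    return 0;
}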