void mmm2d_comm_init(mmm2d_comm_struct *comm, MPI_Comm communicator) {
  /* store the original communicator */
  comm->mpicomm_orig = communicator;
  MPI_Comm_size(communicator, &comm->size);

  /* Test whether the communicator is cartesian and has the correct dimensionality */
  int comm_is_cart = 0;
  int status;    /* MPI topology queries take plain int out-parameters */

  MPI_Topo_test(communicator, &status);
  if (status == MPI_CART) {
    /* Communicator is cartesian, so test dimensionality */
    int ndims;
    MPI_Cartdim_get(communicator, &ndims);
    if (ndims == 3) {
      /* Correct dimensionality, so get grid and test periodicity */
      int periodicity[3];
      MPI_Cart_get(communicator, 3, comm->node_grid, periodicity, comm->node_pos);
      if (periodicity[0] && periodicity[1] && periodicity[2]) {
        /* If periodicity is correct, we can just use this communicator */
        comm->mpicomm = communicator;
        /* get the rank */
        MPI_Comm_rank(communicator, &comm->rank);
        comm_is_cart = 1;
      }
    }
  }

  /* otherwise, we have to set up the cartesian communicator */
  if (!comm_is_cart) {
    comm->node_grid[0] = 0;
    comm->node_grid[1] = 0;
    comm->node_grid[2] = 0;

    /* compute node grid */
    MPI_Dims_create(comm->size, 3, comm->node_grid);
    /* create communicator; MMM2D is periodic in x and y only */
    int periodicity[3] = {1, 1, 0};
    MPI_Cart_create(comm->mpicomm_orig, 3, comm->node_grid, periodicity, 1, &comm->mpicomm);
    /* get the rank in the new communicator (reordering was allowed) */
    MPI_Comm_rank(comm->mpicomm, &comm->rank);
    /* get node pos */
    MPI_Cart_coords(comm->mpicomm, comm->rank, 3, comm->node_pos);
  }
}
void initialize(field *temperature1, field *temperature2, parallel_data *parallel)
{
    int i, j;
    int dims[2], coords[2], periods[2];

    // Allocate also ghost layers
    temperature1->data = malloc_2d(temperature1->nx + 2, temperature1->ny + 2);
    temperature2->data = malloc_2d(temperature2->nx + 2, temperature2->ny + 2);

    // Initialize to zero (memset takes an int fill value, not 0.0)
    memset(temperature1->data[0], 0,
           (temperature1->nx + 2) * (temperature1->ny + 2) * sizeof(double));

    MPI_Cart_get(parallel->comm, 2, dims, periods, coords);

    // Left boundary
    if (coords[1] == 0)
        for (i = 0; i < temperature1->nx + 2; i++)
            temperature1->data[i][0] = 30.0;
    // Upper boundary
    if (coords[0] == 0)
        for (j = 0; j < temperature1->ny + 2; j++)
            temperature1->data[0][j] = 15.0;
    // Right boundary
    if (coords[1] == dims[1] - 1)
        for (i = 0; i < temperature1->nx + 2; i++)
            temperature1->data[i][temperature1->ny + 1] = -10.0;
    // Lower boundary
    if (coords[0] == dims[0] - 1)
        for (j = 0; j < temperature1->ny + 2; j++)
            temperature1->data[temperature1->nx + 1][j] = -25.0;

    copy_field(temperature1, temperature2);
}
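/* Note: the memset over temperature1->data[0] above only works if malloc_2d
 * returns one contiguous block behind the row-pointer array. malloc_2d is not
 * part of this excerpt; a minimal sketch of an allocator with that layout
 * (hypothetical, for illustration) could be: */
double **malloc_2d(int nx, int ny)
{
    /* one pointer per row, all rows backed by a single contiguous block */
    double **array = malloc(nx * sizeof(double *));
    array[0] = malloc(nx * ny * sizeof(double));
    for (int i = 1; i < nx; i++)
        array[i] = array[0] + i * ny;
    return array;
}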
static void comm_get_periodicity(MPI_Comm comm, fcs_int *periodicity)
{
  int dims[3], periods[3], coords[3];

  /* default: no periodicity given */
  for (int t = 0; t < 3; t++)
    periodicity[t] = -1;

  if (!comm_is_cart_3d(comm))
    return;

  /* for a 3d cartesian comm, use the periodicity of the comm */
  MPI_Cart_get(comm, 3, dims, periods, coords);
  for (int t = 0; t < 3; t++)
    periodicity[t] = periods[t];
}
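/* comm_is_cart_3d is referenced above but not defined in this excerpt. A
 * plausible sketch, following the MPI_Topo_test / MPI_Cartdim_get pattern
 * used by the other routines in this section: */
static int comm_is_cart_3d(MPI_Comm comm)
{
  int status, ndims;
  MPI_Topo_test(comm, &status);
  if (status != MPI_CART)
    return 0;
  MPI_Cartdim_get(comm, &ndims);
  return ndims == 3;
}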
template <typename T>  /* sizeof(T) below implies this is a function template */
int dfft_get_local_size_t(int N0, int N1, int tuple, int *isize, int *istart,
                          MPI_Comm c_comm) {
  int procid;
  MPI_Comm_rank(c_comm, &procid);

  int coords[2], np[2], periods[2];
  MPI_Cart_get(c_comm, 2, np, periods, coords);

  isize[2] = tuple;
  isize[0] = ceil(N0 / (double) np[0]);
  isize[1] = ceil(N1 / (double) np[1]);

  istart[0] = isize[0] * coords[0];
  istart[1] = isize[1] * coords[1];
  istart[2] = 0;

  /* The last process row/column gets whatever remains; clamp the local
   * size to zero if this process lies entirely beyond the data. */
  if ((N0 - isize[0] * coords[0]) < isize[0]) {
    isize[0] = N0 - isize[0] * coords[0];
    isize[0] *= (int) isize[0] > 0;
    istart[0] = N0 - isize[0];
  }
  if ((N1 - isize[1] * coords[1]) < isize[1]) {
    isize[1] = N1 - isize[1] * coords[1];
    isize[1] *= (int) isize[1] > 0;
    istart[1] = N1 - isize[1];
  }

#ifdef VERBOSE2
  if (VERBOSE >= 2) {
    for (int r = 0; r < np[0]; r++)
      for (int c = 0; c < np[1]; c++) {
        if ((coords[0] == r) && (coords[1] == c))
          std::cout << coords[0] << "," << coords[1]
                    << " isize[0]= " << isize[0] << " isize[1]= " << isize[1]
                    << " isize[2]= " << isize[2]
                    << " istart[0]= " << istart[0]
                    << " istart[1]= " << istart[1]
                    << " istart[2]= " << istart[2] << std::endl;
      }
  }
#endif

  int alloc_local = isize[0] * isize[1] * isize[2] * sizeof(T);
  return alloc_local;
}
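/* A hypothetical call site for the routine above, assuming a 2D cartesian
 * communicator `cart_comm` and T = double; the sizes are illustrative only: */
void example_local_size(MPI_Comm cart_comm)
{
  int isize[3], istart[3];
  /* local pencil of a 128 x 128 x 64 grid: isize is the local extent per
   * dimension, istart the global offset, and the return value the local
   * allocation size in bytes */
  int bytes = dfft_get_local_size_t<double>(128, 128, 64, isize, istart, cart_comm);
  (void) bytes;
}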
static void set_data_2D(double **data, int rank, int *dim, int hwidth,
                        MPI_Comm cart_comm)
{
    int i, j;
    int coords[2];
    int dims[2];    /* size of each dimension */
    int period[2];

    /* Get the coordinates of this process */
    MPI_Cart_get(cart_comm, 2, dims, period, coords);

    /* Mark the halo region with -1 */
    for (i = 0; i < dim[0]; i++) {
        for (j = 0; j < hwidth; j++) {
            data[i][j] = -1;
        }
        for (j = dim[1] - hwidth; j < dim[1]; j++) {
            data[i][j] = -1;
        }
    }
    for (j = 0; j < dim[1]; j++) {
        for (i = 0; i < hwidth; i++) {
            data[i][j] = -1;
        }
        for (i = dim[0] - hwidth; i < dim[0]; i++) {
            data[i][j] = -1;
        }
    }

    /* Fill the interior with a globally unique value derived from the
       process coordinates and the local indices */
    for (i = hwidth; i < dim[0] - hwidth; i++) {
        for (j = hwidth; j < dim[1] - hwidth; j++) {
            data[i][j] = (coords[0] * (dim[0] - hwidth * 2) + (i - hwidth))
                + (dims[0] * (dim[0] - hwidth * 2))
                * ((dim[1] - hwidth * 2) * coords[1] + (j - hwidth));
        }
    }
}
static int check_data_2D(double **data, int rank, int *dim, int hwidth,
                         int *neighbors, MPI_Comm cart_comm)
{
    int i, j, lres = 1, gres, cumres = 1;
    double should_be;
    int coords[2], n_coords[2], c_coords[2];
    int cart_dims[2];   /* size of each dimension */
    int period[2];

    MPI_Cart_get(cart_comm, 2, cart_dims, period, coords);

    /* Up halo cells */
    if (neighbors[0] != MPI_PROC_NULL) {
        MPI_Cart_coords(cart_comm, neighbors[0], 2, n_coords);
    }
    for (j = hwidth; j < dim[1] - hwidth; j++) {
        for (i = dim[0] - hwidth * 2; i < dim[0] - hwidth; i++) {
            should_be = calc_entry(i, j, neighbors[0] != MPI_PROC_NULL,
                                   dim, cart_dims, n_coords, hwidth);
            check_entry2D(data, i - (dim[0] - hwidth * 2), j, should_be, "out1", &lres);
            if (lres == 0) cumres = 0;
        }
    }

    /* Down halo cells */
    if (neighbors[1] != MPI_PROC_NULL) {
        MPI_Cart_coords(cart_comm, neighbors[1], 2, n_coords);
    }
    for (j = hwidth; j < dim[1] - hwidth; j++) {
        for (i = hwidth; i < hwidth * 2; i++) {
            should_be = calc_entry(i, j, neighbors[1] != MPI_PROC_NULL,
                                   dim, cart_dims, n_coords, hwidth);
            check_entry2D(data, i + (dim[0] - hwidth * 2), j, should_be, "out1", &lres);
            if (lres == 0) cumres = 0;
        }
    }

    /* Left halo cells */
    if (neighbors[2] != MPI_PROC_NULL) {
        MPI_Cart_coords(cart_comm, neighbors[2], 2, n_coords);
    }
    for (i = hwidth; i < dim[0] - hwidth; i++) {
        for (j = dim[1] - hwidth * 2; j < dim[1] - hwidth; j++) {
            should_be = calc_entry(i, j, neighbors[2] != MPI_PROC_NULL,
                                   dim, cart_dims, n_coords, hwidth);
            check_entry2D(data, i, j - (dim[1] - hwidth * 2), should_be, "out2", &lres);
            if (lres == 0) cumres = 0;
        }
    }

    /* Right halo cells */
    if (neighbors[3] != MPI_PROC_NULL) {
        MPI_Cart_coords(cart_comm, neighbors[3], 2, n_coords);
    }
    for (i = hwidth; i < dim[0] - hwidth; i++) {
        for (j = hwidth; j < hwidth * 2; j++) {
            should_be = calc_entry(i, j, neighbors[3] != MPI_PROC_NULL,
                                   dim, cart_dims, n_coords, hwidth);
            check_entry2D(data, i, j + (dim[1] - hwidth * 2), should_be, "out2", &lres);
            if (lres == 0) cumres = 0;
        }
    }

    /* Interior */
    for (i = hwidth; i < dim[0] - hwidth; i++) {
        for (j = hwidth; j < dim[1] - hwidth; j++) {
            should_be = calc_entry(i, j, 1, dim, cart_dims, coords, hwidth);
            check_entry2D(data, i, j, should_be, "inside", &lres);
            if (lres == 0) cumres = 0;
        }
    }

    /* Up-left corner (c_coords is only meaningful when both neighbors
       exist; otherwise calc_entry's validity flag makes it irrelevant) */
    if ((neighbors[0] != MPI_PROC_NULL) && (neighbors[2] != MPI_PROC_NULL)) {
        MPI_Cart_coords(cart_comm, neighbors[0], 2, n_coords);
        c_coords[0] = n_coords[0];
        MPI_Cart_coords(cart_comm, neighbors[2], 2, n_coords);
        c_coords[1] = n_coords[1];
    }
    for (i = dim[0] - hwidth * 2; i < dim[0] - hwidth; i++) {
        for (j = dim[1] - hwidth * 2; j < dim[1] - hwidth; j++) {
            should_be = calc_entry(i, j,
                                   (neighbors[0] != MPI_PROC_NULL) && (neighbors[2] != MPI_PROC_NULL),
                                   dim, cart_dims, c_coords, hwidth);
            check_entry2D(data, i - (dim[0] - hwidth * 2), j - (dim[1] - hwidth * 2),
                          should_be, "corner1", &lres);
            if (lres == 0) cumres = 0;
        }
    }

    /* Up-right corner */
    if ((neighbors[0] != MPI_PROC_NULL) && (neighbors[3] != MPI_PROC_NULL)) {
        MPI_Cart_coords(cart_comm, neighbors[0], 2, n_coords);
        c_coords[0] = n_coords[0];
        MPI_Cart_coords(cart_comm, neighbors[3], 2, n_coords);
        c_coords[1] = n_coords[1];
    }
    for (i = dim[0] - hwidth * 2; i < dim[0] - hwidth; i++) {
        for (j = hwidth; j < hwidth * 2; j++) {
            should_be = calc_entry(i, j,
                                   (neighbors[0] != MPI_PROC_NULL) && (neighbors[3] != MPI_PROC_NULL),
                                   dim, cart_dims, c_coords, hwidth);
            check_entry2D(data, i - (dim[0] - hwidth * 2), j + (dim[1] - hwidth * 2),
                          should_be, "corner2", &lres);
            if (lres == 0) cumres = 0;
        }
    }

    /* Down-left corner */
    if ((neighbors[1] != MPI_PROC_NULL) && (neighbors[2] != MPI_PROC_NULL)) {
        MPI_Cart_coords(cart_comm, neighbors[1], 2, n_coords);
        c_coords[0] = n_coords[0];
        MPI_Cart_coords(cart_comm, neighbors[2], 2, n_coords);
        c_coords[1] = n_coords[1];
    }
    for (i = hwidth; i < hwidth * 2; i++) {
        for (j = dim[1] - hwidth * 2; j < dim[1] - hwidth; j++) {
            should_be = calc_entry(i, j,
                                   (neighbors[1] != MPI_PROC_NULL) && (neighbors[2] != MPI_PROC_NULL),
                                   dim, cart_dims, c_coords, hwidth);
            check_entry2D(data, i + (dim[0] - hwidth * 2), j - (dim[1] - hwidth * 2),
                          should_be, "corner3", &lres);
            if (lres == 0) cumres = 0;
        }
    }

    /* Down-right corner */
    if ((neighbors[1] != MPI_PROC_NULL) && (neighbors[3] != MPI_PROC_NULL)) {
        MPI_Cart_coords(cart_comm, neighbors[1], 2, n_coords);
        c_coords[0] = n_coords[0];
        MPI_Cart_coords(cart_comm, neighbors[3], 2, n_coords);
        c_coords[1] = n_coords[1];
    }
    for (i = hwidth; i < hwidth * 2; i++) {
        for (j = hwidth; j < hwidth * 2; j++) {
            should_be = calc_entry(i, j,
                                   (neighbors[1] != MPI_PROC_NULL) && (neighbors[3] != MPI_PROC_NULL),
                                   dim, cart_dims, c_coords, hwidth);
            check_entry2D(data, i + (dim[0] - hwidth * 2), j + (dim[1] - hwidth * 2),
                          should_be, "corner4", &lres);
            if (lres == 0) cumres = 0;
        }
    }

    MPI_Allreduce(&cumres, &gres, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
    if (gres != 1) {
        return 0;
    }
    return 1;
}
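/* calc_entry and check_entry2D are not part of this excerpt. A hypothetical
 * reconstruction of calc_entry consistent with the fill pattern of
 * set_data_2D above: valid cells reproduce the owning process's interior
 * formula, invalid ones (missing neighbor) keep the -1 halo marker: */
static double calc_entry(int i, int j, int valid, int *dim, int *cart_dims,
                         int *coords, int hwidth)
{
    if (!valid)
        return -1;
    return (coords[0] * (dim[0] - hwidth * 2) + (i - hwidth))
        + (cart_dims[0] * (dim[0] - hwidth * 2))
        * ((dim[1] - hwidth * 2) * coords[1] + (j - hwidth));
}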
int main(int argc, char **argv)
{
    int rank, size, i;
    int errors = 0;
    int dims[NUM_DIMS];
    int periods[NUM_DIMS];
    int coords[NUM_DIMS];
    int new_coords[NUM_DIMS];
    int reorder = 1;
    MPI_Comm comm_temp, comm_cart, new_comm;
    int topo_status;
    int ndims;
    int new_rank;
    int remain_dims[NUM_DIMS];
    int newnewrank;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Clear dims array and get dims for topology */
    for (i = 0; i < NUM_DIMS; i++) {
        dims[i] = 0;
        periods[i] = 0;
    }
    MPI_Dims_create(size, NUM_DIMS, dims);

    /* Make a new communicator with a topology */
    MPI_Cart_create(MPI_COMM_WORLD, NUM_DIMS, dims, periods, reorder, &comm_temp);
    MPI_Comm_dup(comm_temp, &comm_cart);

    /* Determine the status of the new communicator */
    MPI_Topo_test(comm_cart, &topo_status);
    if (topo_status != MPI_CART) {
        printf("topo_status of duped comm is not MPI_CART\n");
        errors++;
    }

    /* How many dims do we have? */
    MPI_Cartdim_get(comm_cart, &ndims);
    if (ndims != NUM_DIMS) {
        printf("Number of dims of duped comm (%d) should be %d\n", ndims, NUM_DIMS);
        errors++;
    }

    /* Get the topology; does it agree with what we put in? */
    for (i = 0; i < NUM_DIMS; i++) {
        dims[i] = 0;
        periods[i] = 0;
    }
    MPI_Cart_get(comm_cart, NUM_DIMS, dims, periods, coords);

    /* Does the mapping from coords to rank work? */
    MPI_Cart_rank(comm_cart, coords, &new_rank);
    if (new_rank != rank) {
        printf("New rank of duped comm (%d) != old rank (%d)\n", new_rank, rank);
        errors++;
    }

    /* Does the mapping from rank to coords work? */
    MPI_Cart_coords(comm_cart, rank, NUM_DIMS, new_coords);
    for (i = 0; i < NUM_DIMS; i++)
        if (coords[i] != new_coords[i]) {
            printf("Old coords[%d] of duped comm (%d) != new_coords (%d)\n",
                   i, coords[i], new_coords[i]);
            errors++;
        }

    /* Let's shift in each dimension and see how it works!   */
    /* Because it's late and I'm tired, I'm not making this  */
    /* automatically test itself.                            */
    for (i = 0; i < NUM_DIMS; i++) {
        int source, dest;
        MPI_Cart_shift(comm_cart, i, 1, &source, &dest);
#ifdef VERBOSE
        printf("[%d] Shifting %d in the %d dimension\n", rank, 1, i);
        printf("[%d] source = %d  dest = %d\n", rank, source, dest);
#endif
    }

    /* Subdivide */
    remain_dims[0] = 0;
    for (i = 1; i < NUM_DIMS; i++)
        remain_dims[i] = 1;
    MPI_Cart_sub(comm_cart, remain_dims, &new_comm);

    /* Determine the status of the new communicator */
    MPI_Topo_test(new_comm, &topo_status);
    if (topo_status != MPI_CART) {
        printf("topo_status of cartsub comm is not MPI_CART\n");
        errors++;
    }

    /* How many dims do we have? */
    MPI_Cartdim_get(new_comm, &ndims);
    if (ndims != NUM_DIMS - 1) {
        printf("Number of dims of cartsub comm (%d) should be %d\n",
               ndims, NUM_DIMS - 1);
        errors++;
    }

    /* Get the topology; does it agree with what we put in? */
    for (i = 0; i < NUM_DIMS - 1; i++) {
        dims[i] = 0;
        periods[i] = 0;
    }
    MPI_Cart_get(new_comm, ndims, dims, periods, coords);

    /* Does the mapping from coords to rank work? */
    MPI_Comm_rank(new_comm, &newnewrank);
    MPI_Cart_rank(new_comm, coords, &new_rank);
    if (new_rank != newnewrank) {
        printf("New rank of cartsub comm (%d) != old rank (%d)\n",
               new_rank, newnewrank);
        errors++;
    }

    /* Does the mapping from rank to coords work? */
    MPI_Cart_coords(new_comm, new_rank, NUM_DIMS - 1, new_coords);
    for (i = 0; i < NUM_DIMS - 1; i++)
        if (coords[i] != new_coords[i]) {
            printf("Old coords[%d] of cartsub comm (%d) != new_coords (%d)\n",
                   i, coords[i], new_coords[i]);
            errors++;
        }

    /* We're at the end */
    MPI_Comm_free(&new_comm);
    MPI_Comm_free(&comm_temp);
    MPI_Comm_free(&comm_cart);
    Test_Waitforall();
    if (errors)
        printf("[%d] done with %d ERRORS!\n", rank, errors);
    MPI_Finalize();
    return 0;
}
FC_FUNC(mpi_cart_get, MPI_CART_GET)
     (int *comm, int *maxdims, int *dims, int *periods, int *coords, int *ierr)
{
    *ierr = MPI_Cart_get(*comm, *maxdims, dims, periods, coords);
}
int main(int argc, char **argv)
{
    int rank, size, myrow, mycol, nx, ny, stride, cnt, i, j, errs, errs_in_place, tot_errs;
    double *sendbuf, *recvbuf;
    MPI_Datatype vec, block, types[2];
    MPI_Aint displs[2];
    int *scdispls;
    int blens[2];
    MPI_Comm comm2d;
    int dims[2], periods[2], coords[2], lcoords[2];
    int *sendcounts;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Get a 2-d decomposition of the processes */
    dims[0] = 0;
    dims[1] = 0;
    MPI_Dims_create(size, 2, dims);
    periods[0] = 0;
    periods[1] = 0;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm2d);
    MPI_Cart_get(comm2d, 2, dims, periods, coords);
    myrow = coords[0];
    mycol = coords[1];
    /* if (rank == 0) printf("Decomposition is [%d x %d]\n", dims[0], dims[1]); */

    /* Get the size of the matrix */
    nx = 10;
    ny = 8;
    stride = nx * dims[0];

    recvbuf = (double *) malloc(nx * ny * sizeof(double));
    if (!recvbuf) {
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    sendbuf = 0;
    if (myrow == 0 && mycol == 0) {
        sendbuf = (double *) malloc(nx * ny * size * sizeof(double));
        if (!sendbuf) {
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
    }
    sendcounts = (int *) malloc(size * sizeof(int));
    scdispls = (int *) malloc(size * sizeof(int));

    /* MPI_Type_struct with an MPI_UB marker is the (deprecated) MPI-1 way of
       shrinking the extent of the vector type to one block of nx doubles */
    MPI_Type_vector(ny, nx, stride, MPI_DOUBLE, &vec);
    blens[0] = 1;
    blens[1] = 1;
    types[0] = vec;
    types[1] = MPI_UB;
    displs[0] = 0;
    displs[1] = nx * sizeof(double);
    MPI_Type_struct(2, blens, displs, types, &block);
    MPI_Type_free(&vec);
    MPI_Type_commit(&block);

    /* Set up the transfer */
    cnt = 0;
    for (i = 0; i < dims[1]; i++) {
        for (j = 0; j < dims[0]; j++) {
            sendcounts[cnt] = 1;
            /* Using Cart_coords makes sure that ranks (used by
             * sendrecv) match the cartesian coordinates (used to
             * set data in the matrix) */
            MPI_Cart_coords(comm2d, cnt, 2, lcoords);
            scdispls[cnt++] = lcoords[0] + lcoords[1] * (dims[0] * ny);
        }
    }

    SetData(sendbuf, recvbuf, nx, ny, myrow, mycol, dims[0], dims[1]);
    MPI_Scatterv(sendbuf, sendcounts, scdispls, block,
                 recvbuf, nx * ny, MPI_DOUBLE, 0, comm2d);
    if ((errs = CheckData(recvbuf, nx, ny, myrow, mycol, dims[0], 0))) {
        fprintf(stdout, "Failed to transfer data\n");
    }

    /* once more, but this time passing MPI_IN_PLACE for the root */
    SetData(sendbuf, recvbuf, nx, ny, myrow, mycol, dims[0], dims[1]);
    MPI_Scatterv(sendbuf, sendcounts, scdispls, block,
                 (rank == 0 ? MPI_IN_PLACE : recvbuf), nx * ny, MPI_DOUBLE, 0, comm2d);
    errs_in_place = CheckData(recvbuf, nx, ny, myrow, mycol, dims[0], (rank == 0));
    if (errs_in_place) {
        fprintf(stdout, "Failed to transfer data (MPI_IN_PLACE)\n");
    }
    errs += errs_in_place;

    MPI_Allreduce(&errs, &tot_errs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    if (rank == 0) {
        if (tot_errs == 0)
            printf(" No Errors\n");
        else
            printf("%d errors in use of MPI_SCATTERV\n", tot_errs);
    }

    if (sendbuf)
        free(sendbuf);
    free(recvbuf);
    free(sendcounts);
    free(scdispls);
    MPI_Type_free(&block);
    MPI_Comm_free(&comm2d);
    MPI_Finalize();
    return errs;
}
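/* For reference: MPI_Type_struct and MPI_UB as used above are deprecated.
 * A sketch of the equivalent extent adjustment with the MPI-2 replacement
 * MPI_Type_create_resized (same vec/nx/ny/stride roles as above; the helper
 * name is hypothetical): */
void make_block_type(int nx, int ny, int stride, MPI_Datatype *block)
{
    MPI_Datatype vec;
    MPI_Type_vector(ny, nx, stride, MPI_DOUBLE, &vec);
    /* resize so consecutive blocks in the scatter are nx doubles apart */
    MPI_Type_create_resized(vec, 0, nx * sizeof(double), block);
    MPI_Type_free(&vec);
    MPI_Type_commit(block);
}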
int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  setup_globals();

  /* Parse arguments: ./main filename SCALE edgefactor */
  int SCALE = 16;
  int edgefactor = 16; /* nedges / nvertices, i.e., 2*avg. degree */
  char* name = argv[1];
  if (argc >= 3) SCALE = atoi(argv[2]);
  if (argc >= 4) edgefactor = atoi(argv[3]);
  if (argc <= 2 || argc >= 5 || SCALE == 0 || edgefactor == 0) {
    if (rank == 0) {
      fprintf(stderr, "Usage: %s filename SCALE edgefactor\n"
              "  SCALE = log_2(# vertices) [integer, required]\n"
              "  edgefactor = (# edges) / (# vertices) = .5 * (average vertex degree) [integer, defaults to 16]\n"
              "(Random number seed and Kronecker initiator are in main.c)\n",
              argv[0]);
    }
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  uint64_t seed1 = 2, seed2 = 3;

  const char* filename = name;
  /* If filename is NULL, store data in memory */

  tuple_graph tg;
  tg.nglobaledges = (int64_t)(edgefactor) << SCALE;
  int64_t nglobalverts = (int64_t)(1) << SCALE;

  tg.data_in_file = (filename != NULL);

  if (tg.data_in_file) {
    printf("data in file\n");
    MPI_File_set_errhandler(MPI_FILE_NULL, MPI_ERRORS_ARE_FATAL);
    MPI_File_open(MPI_COMM_WORLD, (char*)filename,
                  MPI_MODE_RDWR | MPI_MODE_CREATE | MPI_MODE_EXCL | MPI_MODE_UNIQUE_OPEN,
                  MPI_INFO_NULL, &tg.edgefile);
    MPI_File_set_size(tg.edgefile, tg.nglobaledges * sizeof(packed_edge));
    MPI_File_set_view(tg.edgefile, 0, packed_edge_mpi_type, packed_edge_mpi_type,
                      "native", MPI_INFO_NULL);
    MPI_File_set_atomicity(tg.edgefile, 0);
  }

  /* Make the raw graph edges. */
  /* Get roots for BFS runs, plus maximum vertex with non-zero degree (used by
   * validator). */
  int num_bfs_roots = 64;
  int64_t* bfs_roots = (int64_t*)xmalloc(num_bfs_roots * sizeof(int64_t));
  int64_t max_used_vertex = 0;

  double make_graph_start = MPI_Wtime();
  {
    /* Spread the two 64-bit numbers into five nonzero values in the correct
     * range. */
    uint_fast32_t seed[5];
    make_mrg_seed(seed1, seed2, seed);

    /* As the graph is being generated, also keep a bitmap of vertices with
     * incident edges.  We keep a grid of processes, each row of which has a
     * separate copy of the bitmap (distributed among the processes in the
     * row), and then do an allreduce at the end.  This scheme is used to
     * avoid non-local communication and reading the file separately just to
     * find BFS roots. */
    MPI_Offset nchunks_in_file = (tg.nglobaledges + FILE_CHUNKSIZE - 1) / FILE_CHUNKSIZE;
    int64_t bitmap_size_in_bytes = int64_min(BITMAPSIZE, (nglobalverts + CHAR_BIT - 1) / CHAR_BIT);
    if (bitmap_size_in_bytes * size * CHAR_BIT < nglobalverts) {
      bitmap_size_in_bytes = (nglobalverts + size * CHAR_BIT - 1) / (size * CHAR_BIT);
    }
    int ranks_per_row = ((nglobalverts + CHAR_BIT - 1) / CHAR_BIT + bitmap_size_in_bytes - 1) / bitmap_size_in_bytes;
    int nrows = size / ranks_per_row;
    int my_row = -1, my_col = -1;
    unsigned char* restrict has_edge = NULL;
    MPI_Comm cart_comm;
    {
      int dims[2] = {size / ranks_per_row, ranks_per_row};
      int periods[2] = {0, 0};
      MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &cart_comm);
    }
    int in_generating_rectangle = 0;
    if (cart_comm != MPI_COMM_NULL) {
      in_generating_rectangle = 1;
      {
        int dims[2], periods[2], coords[2];
        MPI_Cart_get(cart_comm, 2, dims, periods, coords);
        my_row = coords[0];
        my_col = coords[1];
      }
      MPI_Comm this_col;
      MPI_Comm_split(cart_comm, my_col, my_row, &this_col);
      MPI_Comm_free(&cart_comm);
      has_edge = (unsigned char*)xMPI_Alloc_mem(bitmap_size_in_bytes);
      memset(has_edge, 0, bitmap_size_in_bytes);
      /* Every rank in a given row creates the same vertices (for updating
       * the bitmap); only one writes them to the file (or final memory
       * buffer). */
      packed_edge* buf = (packed_edge*)xmalloc(FILE_CHUNKSIZE * sizeof(packed_edge));
      MPI_Offset block_limit = (nchunks_in_file + nrows - 1) / nrows;
      if (tg.data_in_file) {
        tg.edgememory_size = 0;
        tg.edgememory = NULL;
      } else {
        int my_pos = my_row + my_col * nrows;
        int last_pos = (tg.nglobaledges % ((int64_t)FILE_CHUNKSIZE * nrows * ranks_per_row) != 0) ?
                       (tg.nglobaledges / FILE_CHUNKSIZE) % (nrows * ranks_per_row) : -1;
        int64_t edges_left = tg.nglobaledges % FILE_CHUNKSIZE;
        int64_t nedges = FILE_CHUNKSIZE * (tg.nglobaledges / ((int64_t)FILE_CHUNKSIZE * nrows * ranks_per_row)) +
                         FILE_CHUNKSIZE * (my_pos < (tg.nglobaledges / FILE_CHUNKSIZE) % (nrows * ranks_per_row)) +
                         (my_pos == last_pos ? edges_left : 0);
        tg.edgememory_size = nedges;
        tg.edgememory = (packed_edge*)xmalloc(nedges * sizeof(packed_edge));
      }
      MPI_Offset block_idx;
      for (block_idx = 0; block_idx < block_limit; ++block_idx) {
        MPI_Offset start_edge_index = int64_min(FILE_CHUNKSIZE * (block_idx * nrows + my_row), tg.nglobaledges);
        MPI_Offset edge_count = int64_min(tg.nglobaledges - start_edge_index, FILE_CHUNKSIZE);
        packed_edge* actual_buf = (!tg.data_in_file && block_idx % ranks_per_row == my_col) ?
                                  tg.edgememory + FILE_CHUNKSIZE * (block_idx / ranks_per_row) :
                                  buf;
        if (!tg.data_in_file && block_idx % ranks_per_row == my_col) {
          assert(FILE_CHUNKSIZE * (block_idx / ranks_per_row) + edge_count <= tg.edgememory_size);
        }
        generate_kronecker_range(seed, SCALE, start_edge_index, start_edge_index + edge_count, actual_buf);
        if (tg.data_in_file && my_col == (block_idx % ranks_per_row)) {
          /* Try to spread writes among ranks */
          /* debug (MPI_Offset is printed via PRId64, not %d) */
          printf("%d: %" PRId64 ", %" PRId64 "\n", rank,
                 (int64_t)start_edge_index, (int64_t)edge_count);
          MPI_File_write_at(tg.edgefile, start_edge_index, actual_buf, edge_count,
                            packed_edge_mpi_type, MPI_STATUS_IGNORE);
        }
        ptrdiff_t i;
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (i = 0; i < edge_count; ++i) {
          int64_t src = get_v0_from_edge(&actual_buf[i]);
          int64_t tgt = get_v1_from_edge(&actual_buf[i]);
          if (src == tgt) continue;
          if (src / bitmap_size_in_bytes / CHAR_BIT == my_col) {
#ifdef _OPENMP
#pragma omp atomic
#endif
            has_edge[(src / CHAR_BIT) % bitmap_size_in_bytes] |= (1 << (src % CHAR_BIT));
          }
          if (tgt / bitmap_size_in_bytes / CHAR_BIT == my_col) {
#ifdef _OPENMP
#pragma omp atomic
#endif
            has_edge[(tgt / CHAR_BIT) % bitmap_size_in_bytes] |= (1 << (tgt % CHAR_BIT));
          }
        }
      }
      free(buf);
#if 0
      /* The allreduce for each root acts like we did this: */
      MPI_Allreduce(MPI_IN_PLACE, has_edge, bitmap_size_in_bytes, MPI_UNSIGNED_CHAR, MPI_BOR, this_col);
#endif
      MPI_Comm_free(&this_col);
    } else {
      tg.edgememory = NULL;
      tg.edgememory_size = 0;
    }
    MPI_Allreduce(&tg.edgememory_size, &tg.max_edgememory_size, 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);

#ifndef GEN_ONLY
    /* Find roots and max used vertex */
    {
      uint64_t counter = 0;
      int bfs_root_idx;
      for (bfs_root_idx = 0; bfs_root_idx < num_bfs_roots; ++bfs_root_idx) {
        int64_t root;
        while (1) {
          double d[2];
          make_random_numbers(2, seed1, seed2, counter, d);
          root = (int64_t)((d[0] + d[1]) * nglobalverts) % nglobalverts;
          counter += 2;
          if (counter > 2 * nglobalverts) break;
          int is_duplicate = 0;
          int i;
          for (i = 0; i < bfs_root_idx; ++i) {
            if (root == bfs_roots[i]) {
              is_duplicate = 1;
              break;
            }
          }
          if (is_duplicate) continue; /* Everyone takes the same path here */
          int root_ok = 0;
          if (in_generating_rectangle && (root / CHAR_BIT / bitmap_size_in_bytes) == my_col) {
            root_ok = (has_edge[(root / CHAR_BIT) % bitmap_size_in_bytes] & (1 << (root % CHAR_BIT))) != 0;
          }
          MPI_Allreduce(MPI_IN_PLACE, &root_ok, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
          if (root_ok) break;
        }
        bfs_roots[bfs_root_idx] = root;
      }
      num_bfs_roots = bfs_root_idx;

      /* Find maximum non-zero-degree vertex. */
      {
        int64_t i;
        max_used_vertex = 0;
        if (in_generating_rectangle) {
          for (i = bitmap_size_in_bytes * CHAR_BIT; i > 0; --i) {
            if (i > nglobalverts) continue;
            if (has_edge[(i - 1) / CHAR_BIT] & (1 << ((i - 1) % CHAR_BIT))) {
              max_used_vertex = (i - 1) + my_col * CHAR_BIT * bitmap_size_in_bytes;
              break;
            }
          }
        }
        MPI_Allreduce(MPI_IN_PLACE, &max_used_vertex, 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
      }
    }
#endif
    if (in_generating_rectangle) {
      MPI_Free_mem(has_edge);
    }
    if (tg.data_in_file) {
      MPI_File_sync(tg.edgefile);
    }
  }
  double make_graph_stop = MPI_Wtime();
  double make_graph_time = make_graph_stop - make_graph_start;
  if (rank == 0) { /* Not an official part of the results */
    fprintf(stderr, "graph_generation: %f s\n", make_graph_time);
  }

#ifndef GEN_ONLY
  /* Make user's graph data structure. */
  double data_struct_start = MPI_Wtime();
  make_graph_data_structure(&tg);
  double data_struct_stop = MPI_Wtime();
  double data_struct_time = data_struct_stop - data_struct_start;
  if (rank == 0) { /* Not an official part of the results */
    fprintf(stderr, "construction_time: %f s\n", data_struct_time);
  }

  /* Number of edges visited in each BFS; a double so get_statistics can be
   * used directly. */
  double* edge_counts = (double*)xmalloc(num_bfs_roots * sizeof(double));

  /* Run BFS. */
  int validation_passed = 1;
  double* bfs_times = (double*)xmalloc(num_bfs_roots * sizeof(double));
  double* validate_times = (double*)xmalloc(num_bfs_roots * sizeof(double));
  uint64_t nlocalverts = get_nlocalverts_for_pred();
  int64_t* pred = (int64_t*)xMPI_Alloc_mem(nlocalverts * sizeof(int64_t));

  int bfs_root_idx;
  for (bfs_root_idx = 0; bfs_root_idx < num_bfs_roots; ++bfs_root_idx) {
    int64_t root = bfs_roots[bfs_root_idx];

    if (rank == 0) fprintf(stderr, "Running BFS %d\n", bfs_root_idx);

    /* Clear the pred array. */
    memset(pred, 0, nlocalverts * sizeof(int64_t));

    /* Do the actual BFS. */
    double bfs_start = MPI_Wtime();
    run_bfs(root, &pred[0]);
    double bfs_stop = MPI_Wtime();
    bfs_times[bfs_root_idx] = bfs_stop - bfs_start;
    if (rank == 0) fprintf(stderr, "Time for BFS %d is %f\n", bfs_root_idx, bfs_times[bfs_root_idx]);

    /* Validate result. */
    if (rank == 0) fprintf(stderr, "Validating BFS %d\n", bfs_root_idx);
    double validate_start = MPI_Wtime();
    int64_t edge_visit_count;
    int validation_passed_one = validate_bfs_result(&tg, max_used_vertex + 1,
                                                    nlocalverts, root, pred, &edge_visit_count);
    double validate_stop = MPI_Wtime();
    validate_times[bfs_root_idx] = validate_stop - validate_start;
    if (rank == 0) fprintf(stderr, "Validate time for BFS %d is %f\n", bfs_root_idx, validate_times[bfs_root_idx]);
    edge_counts[bfs_root_idx] = (double)edge_visit_count;
    if (rank == 0) fprintf(stderr, "TEPS for BFS %d is %g\n", bfs_root_idx, edge_visit_count / bfs_times[bfs_root_idx]);

    if (!validation_passed_one) {
      validation_passed = 0;
      if (rank == 0) fprintf(stderr, "Validation failed for this BFS root; skipping rest.\n");
      break;
    }
  }

  MPI_Free_mem(pred);
  free(bfs_roots);
  free_graph_data_structure();
#endif /* !GEN_ONLY */

  if (tg.data_in_file) {
    MPI_File_close(&tg.edgefile);
  } else {
    free(tg.edgememory);
    tg.edgememory = NULL;
  }

#ifndef GEN_ONLY
  /* Print results. */
  if (rank == 0) {
    if (!validation_passed) {
      fprintf(stdout, "No results printed for invalid run.\n");
    } else {
      int i;
      fprintf(stdout, "SCALE: %d\n", SCALE);
      fprintf(stdout, "edgefactor: %d\n", edgefactor);
      fprintf(stdout, "NBFS: %d\n", num_bfs_roots);
      fprintf(stdout, "graph_generation: %g\n", make_graph_time);
      fprintf(stdout, "num_mpi_processes: %d\n", size);
      fprintf(stdout, "construction_time: %g\n", data_struct_time);
      double stats[s_LAST];
      get_statistics(bfs_times, num_bfs_roots, stats);
      fprintf(stdout, "min_time: %g\n", stats[s_minimum]);
      fprintf(stdout, "firstquartile_time: %g\n", stats[s_firstquartile]);
      fprintf(stdout, "median_time: %g\n", stats[s_median]);
      fprintf(stdout, "thirdquartile_time: %g\n", stats[s_thirdquartile]);
      fprintf(stdout, "max_time: %g\n", stats[s_maximum]);
      fprintf(stdout, "mean_time: %g\n", stats[s_mean]);
      fprintf(stdout, "stddev_time: %g\n", stats[s_std]);
      get_statistics(edge_counts, num_bfs_roots, stats);
      fprintf(stdout, "min_nedge: %.11g\n", stats[s_minimum]);
      fprintf(stdout, "firstquartile_nedge: %.11g\n", stats[s_firstquartile]);
      fprintf(stdout, "median_nedge: %.11g\n", stats[s_median]);
      fprintf(stdout, "thirdquartile_nedge: %.11g\n", stats[s_thirdquartile]);
      fprintf(stdout, "max_nedge: %.11g\n", stats[s_maximum]);
      fprintf(stdout, "mean_nedge: %.11g\n", stats[s_mean]);
      fprintf(stdout, "stddev_nedge: %.11g\n", stats[s_std]);
      double* secs_per_edge = (double*)xmalloc(num_bfs_roots * sizeof(double));
      for (i = 0; i < num_bfs_roots; ++i) secs_per_edge[i] = bfs_times[i] / edge_counts[i];
      get_statistics(secs_per_edge, num_bfs_roots, stats);
      fprintf(stdout, "min_TEPS: %g\n", 1. / stats[s_maximum]);
      fprintf(stdout, "firstquartile_TEPS: %g\n", 1. / stats[s_thirdquartile]);
      fprintf(stdout, "median_TEPS: %g\n", 1. / stats[s_median]);
      fprintf(stdout, "thirdquartile_TEPS: %g\n", 1. / stats[s_firstquartile]);
      fprintf(stdout, "max_TEPS: %g\n", 1. / stats[s_minimum]);
      fprintf(stdout, "harmonic_mean_TEPS: %g\n", 1. / stats[s_mean]);
      /* Formula from:
       * Title: The Standard Errors of the Geometric and Harmonic Means and
       *        Their Application to Index Numbers
       * Author(s): Nilan Norris
       * Source: The Annals of Mathematical Statistics, Vol. 11, No. 4 (Dec., 1940), pp. 445-448
       * Publisher(s): Institute of Mathematical Statistics
       * Stable URL: http://www.jstor.org/stable/2235723
       * (same source as in specification). */
      fprintf(stdout, "harmonic_stddev_TEPS: %g\n",
              stats[s_std] / (stats[s_mean] * stats[s_mean] * sqrt(num_bfs_roots - 1)));
      free(secs_per_edge); secs_per_edge = NULL;
      free(edge_counts); edge_counts = NULL;
      get_statistics(validate_times, num_bfs_roots, stats);
      fprintf(stdout, "min_validate: %g\n", stats[s_minimum]);
      fprintf(stdout, "firstquartile_validate: %g\n", stats[s_firstquartile]);
      fprintf(stdout, "median_validate: %g\n", stats[s_median]);
      fprintf(stdout, "thirdquartile_validate: %g\n", stats[s_thirdquartile]);
      fprintf(stdout, "max_validate: %g\n", stats[s_maximum]);
      fprintf(stdout, "mean_validate: %g\n", stats[s_mean]);
      fprintf(stdout, "stddev_validate: %g\n", stats[s_std]);
#if 0
      for (i = 0; i < num_bfs_roots; ++i) {
        fprintf(stdout, "Run %3d: %g s, validation %g s\n", i + 1, bfs_times[i], validate_times[i]);
      }
#endif
    }
  }
  free(bfs_times);
  free(validate_times);
#endif /* !GEN_ONLY */

  cleanup_globals();
  MPI_Finalize();
  return 0;
}
FCSResult ifcs_p2nfft_init(void **rd, MPI_Comm comm)
{
  const char *fnc_name = "ifcs_p2nfft_init";
  ifcs_p2nfft_data_struct *d;

  /* return error if method context is already allocated */
  if (*rd != NULL)
    return fcs_result_create(FCS_ERROR_LOGICAL_ERROR, fnc_name,
        "Multiple init of method context without finalize.");

  /* Initialize the PNFFT library */
  FCS_PNFFT(init)();

  /* Create data structure */
  d = mkplan_p2nfft();

  /* return error if allocation failed */
  if (d == NULL)
    return fcs_result_create(FCS_ERROR_ALLOC_FAILED, fnc_name,
        "Allocation of the p2nfft data structure failed.");

#if FCS_P2NFFT_USE_3D_PROCMESH
  /* Create a three-dimensional cartesian comm
   * from the given (possibly non-cartesian) one. */
  if (!comm_is_cart_3d(comm))
    comm_create_cart_3d(comm, &d->cart_comm_3d, d->np);
  else {
    int periods[3], coords[3];
    MPI_Cart_get(comm, 3, d->np, periods, coords);
    if (periods[0] && periods[1] && periods[2])
      MPI_Comm_dup(comm, &d->cart_comm_3d);
    else {
      for (int t = 0; t < 3; t++)
        periods[t] = 1;
      MPI_Cart_create(comm, 3, d->np, periods, 0, &d->cart_comm_3d);
    }
  }
  MPI_Comm_dup(d->cart_comm_3d, &d->cart_comm_pnfft);
#else
  /* create 2d cart procmesh for PNFFT and its 3d counterpart */
  comm_create_cart_2d(comm, &d->cart_comm_pnfft, &d->cart_comm_3d, d->np);
#endif

  /* Set the default values */
  d->needs_retune = 1;
  d->tune_alpha = 1;
  d->tune_r_cut = 1;
  d->tune_epsI = 1;
  d->tune_epsB = 1;
  d->tune_k_cut = 1;
  d->tune_N = 1;
  d->tune_n = 1;
  d->tune_m = 1;
  d->tune_p = 1;
  d->tune_b = 1;
  d->tune_c = 1;

#if FCS_ENABLE_INFO
  d->flags = FCS_P2NFFT_VERBOSE_TUNING;
#else
  d->flags = 0;
#endif

  d->pnfft_flags = PNFFT_MALLOC_F_HAT | PNFFT_PRE_PHI_HAT
      | PNFFT_FFT_OUT_OF_PLACE | PNFFT_TRANSPOSED_F_HAT;
  d->pnfft_interpolation_order = 3;
  d->pnfft_window = FCS_P2NFFT_DEFAULT_PNFFT_WINDOW;
  d->pnfft_direct = 0;
  d->pfft_flags = PFFT_NO_TUNE | PFFT_DESTROY_INPUT;
  d->pfft_patience = FCS_P2NFFT_DEFAULT_PFFT_PATIENCE;

  /* We do not know the default tolerance type at this point, since periodicity
   * may be changed via fcs_set_periodicity after fcs_init. */
  d->tolerance_type = FCS_TOLERANCE_TYPE_UNDEFINED;
  d->tolerance = -1.0;

  d->N[0] = d->N[1] = d->N[2] = 16;
  d->m = 4;
  d->p = 8;
  d->c = 0.0;
  d->b[0] = d->b[1] = d->b[2] = 0.0;

  /* init to same nonsense on all processes */
  d->alpha = -1.0;
  d->r_cut = -1.0;
  d->one_over_r_cut = -1.0;
  d->epsI = -1.0;
  d->epsB = -1.0;
  d->k_cut = -1.0;

  d->num_nodes = -1;
  d->sum_qpart = -1;
  d->sum_q2 = -1.0;
  d->sum_q = 0.0;
  d->bg_charge = 0.0;
  d->box_V = 0.0;
  for (int t = 0; t < 3; t++) {
    d->box_l[t] = -1.0;
    d->box_expand[t] = 1.0;
    d->box_scales[t] = 1.0;
    d->box_a[t] = 0.0;
    d->box_b[t] = 0.0;
    d->box_c[t] = 0.0;
  }
  for (int t = 0; t < 9; t++)
    d->box_inv[t] = 0.0;

  comm_get_periodicity(comm, d->periodicity);

  d->short_range_flag = -1;
  d->reg_near = FCS_P2NFFT_REG_NEAR_DEFAULT;
  d->reg_far = FCS_P2NFFT_REG_FAR_DEFAULT;
  d->reg_kernel = FCS_P2NFFT_REG_KERNEL_DEFAULT;

  /* init local data distribution of PNFFT:
   * local_N, local_N_start, lower_border, upper_border */
  for (int t = 0; t < 3; t++) {
    d->local_N[t] = -1;
    d->local_N_start[t] = -1;
    d->lower_border[t] = -1;
    d->upper_border[t] = -1;
  }
  d->regkern_hat = NULL;

  /* init gridsort data */
  d->max_particle_move = -1;
  d->resort = d->local_num_particles = 0;
  d->gridsort_resort = FCS_GRIDSORT_RESORT_NULL;
  d->gridsort_cache = FCS_GRIDSORT_CACHE_NULL;

  *rd = d;

  /* NULL indicates success for FCSResult */
  return NULL;
}
void initcomm(int ndx, int ndy, int ndz)
{
  int ipd[3], idm[3], ir;
  MPI_Comm icomm;

  if (ndx * ndy * ndz != npe) {
    if (id == 0) {
      printf("Invalid number of PE\n");
      printf("Please check partitioning pattern or number of PE\n");
    }
    MPI_Finalize();
    exit(0);
  }

  icomm = MPI_COMM_WORLD;

  idm[0] = ndx;
  idm[1] = ndy;
  idm[2] = ndz;

  ipd[0] = 0;
  ipd[1] = 0;
  ipd[2] = 0;
  ir = 0;

  MPI_Cart_create(icomm, ndims, idm, ipd, ir, &mpi_comm_cart);
  MPI_Cart_get(mpi_comm_cart, ndims, idm, ipd, iop);

  if (ndz > 1) {
    MPI_Cart_shift(mpi_comm_cart, 2, 1, &npz[0], &npz[1]);
  }
  if (ndy > 1) {
    MPI_Cart_shift(mpi_comm_cart, 1, 1, &npy[0], &npy[1]);
  }
  if (ndx > 1) {
    MPI_Cart_shift(mpi_comm_cart, 0, 1, &npx[0], &npx[1]);
  }
}
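/* initcomm relies on file-scope state that is not shown in this excerpt. One
 * plausible set of declarations consistent with its usage (names taken from
 * the function body; everything here is an assumption): */
static int npe;                    /* total number of processes */
static int id;                     /* rank in MPI_COMM_WORLD */
static const int ndims = 3;        /* dimensionality of the process grid */
static MPI_Comm mpi_comm_cart;     /* cartesian communicator */
static int iop[3];                 /* this process's grid coordinates */
static int npx[2], npy[2], npz[2]; /* neighbor ranks in x, y, z */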
/* Check that the MPI implementation properly handles zero-dimensional
   Cartesian communicators - the original standard implies that these
   should be consistent with higher dimensional topologies and thus
   these should work with any MPI implementation.  MPI 2.1 made this
   requirement explicit. */
int main(int argc, char *argv[])
{
    int errs = 0;
    int size, rank, ndims;
    MPI_Comm comm, newcomm;

    MTest_Init(&argc, &argv);

    /* Create a new cartesian communicator in a subset of the processes */
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (size < 2) {
        fprintf(stderr, "This test needs at least 2 processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Cart_create(MPI_COMM_WORLD, 0, NULL, NULL, 0, &comm);

    if (comm != MPI_COMM_NULL) {
        int csize;
        MPI_Comm_size(comm, &csize);
        if (csize != 1) {
            errs++;
            fprintf(stderr, "Size is wrong in cart communicator.  Is %d, should be 1\n", csize);
        }

        /* This function is not meaningful, but should not fail */
        MPI_Dims_create(1, 0, NULL);

        ndims = -1;
        MPI_Cartdim_get(comm, &ndims);
        if (ndims != 0) {
            errs++;
            fprintf(stderr, "MPI_Cartdim_get: ndims is %d, should be 0\n", ndims);
        }

        /* this function should not fail */
        MPI_Cart_get(comm, 0, NULL, NULL, NULL);

        MPI_Cart_rank(comm, NULL, &rank);
        if (rank != 0) {
            errs++;
            fprintf(stderr, "MPI_Cart_rank: rank is %d, should be 0\n", rank);
        }

        /* this function should not fail */
        MPI_Cart_coords(comm, 0, 0, NULL);

        MPI_Cart_sub(comm, NULL, &newcomm);
        ndims = -1;
        MPI_Cartdim_get(newcomm, &ndims);
        if (ndims != 0) {
            errs++;
            fprintf(stderr, "MPI_Cart_sub did not return zero-dimensional communicator\n");
        }

        MPI_Barrier(comm);

        MPI_Comm_free(&comm);
        MPI_Comm_free(&newcomm);
    } else if (rank == 0) {
        errs++;
        fprintf(stderr, "Communicator returned is null!");
    }

    MTest_Finalize(errs);
    MPI_Finalize();
    return 0;
}
void Communication::prepare(p3m_float box_l[3]) {
    P3M_DEBUG(printf("  P3M::Communication::prepare() started...\n"));

    /* Test whether the communicator is cartesian and has the correct dimensionality */
    bool comm_is_cart = false;
    int status;

    MPI_Topo_test(mpicomm_orig, &status);
    if (status == MPI_CART) {
        /* Communicator is cartesian, so test dimensionality */
        int ndims;
        MPI_Cartdim_get(mpicomm_orig, &ndims);
        if (ndims == 3) {
            /* Correct dimensionality, so get grid and test periodicity */
            int periodicity[3];
            MPI_Cart_get(mpicomm_orig, 3, node_grid, periodicity, node_pos);
            if (periodicity[0] && periodicity[1] && periodicity[2]) {
                /* If periodicity is correct, we can just use this communicator */
                mpicomm = mpicomm_orig;
                /* get the rank */
                MPI_Comm_rank(mpicomm, &rank);
                comm_is_cart = true;
            }
        }
    }

    /* otherwise, we have to set up the cartesian communicator */
    if (!comm_is_cart) {
        P3M_DEBUG(printf("  Setting up cartesian communicator...\n"));

        node_grid[0] = 0;
        node_grid[1] = 0;
        node_grid[2] = 0;

        /* compute node grid */
        MPI_Dims_create(size, 3, node_grid);

#ifdef P3M_ENABLE_INFO
        if (onMaster())
            printf("    node_grid=%dx%dx%d\n", node_grid[0], node_grid[1], node_grid[2]);
#endif

        /* create communicator */
        int periodicity[3] = {1, 1, 1};
        MPI_Cart_create(mpicomm_orig, 3, node_grid, periodicity, 1, &mpicomm);

        /* get the rank */
        MPI_Comm_rank(mpicomm, &rank);
        /* get node pos */
        MPI_Cart_coords(mpicomm, rank, 3, node_pos);
    }

    /* fetch neighborhood info */
    for (int dir = 0; dir < 3; dir++) {
        MPI_Cart_shift(mpicomm, dir, 1,
                       &node_neighbors[2 * dir], &node_neighbors[2 * dir + 1]);
        P3M_DEBUG_LOCAL(printf("    %d: dir=%d: n1=%d n2=%d\n", rank, dir,
                               node_neighbors[2 * dir],
                               node_neighbors[2 * dir + 1]));
    }

    /* init local points */
    for (int i = 0; i < 3; i++) {
        local_box_l[i] = 0.0;
        my_left[i] = 0.0;
        my_right[i] = 0.0;
    }

    /* compute box limits */
    for (p3m_int i = 0; i < 3; i++) {
        local_box_l[i] = box_l[i] / (p3m_float) node_grid[i];
        my_left[i] = node_pos[i] * local_box_l[i];
        my_right[i] = (node_pos[i] + 1) * local_box_l[i];
    }

    P3M_DEBUG(printf("    local_box_l=" F3FLOAT "\n"
                     "    my_left=" F3FLOAT "\n"
                     "    my_right=" F3FLOAT "\n",
                     local_box_l[0], local_box_l[1], local_box_l[2],
                     my_left[0], my_left[1], my_left[2],
                     my_right[0], my_right[1], my_right[2]));

    P3M_DEBUG(printf("  P3M::Communication::prepare() finished.\n"));
}
/**
 * Creates a 3D single precision R2C parallel FFT plan. If data_out points to the same
 * location as the input data, an in-place plan is created; otherwise the plan is
 * out-of-place.
 * @param n Integer array of size 3, corresponding to the global data size
 * @param data Input data in spatial domain
 * @param data_out Output data in frequency domain
 * @param c_comm Cartesian communicator returned by \ref accfft_create_comm
 * @param flags AccFFT flags, See \ref flags for more details.
 * @return
 */
accfft_plan_gpuf* accfft_plan_dft_3d_r2c_gpuf(int *n, float *data_d, float *data_out_d,
    MPI_Comm c_comm, unsigned flags) {
  accfft_plan_gpuf *plan = new accfft_plan_gpuf;
  int procid;
  MPI_Comm_rank(c_comm, &procid);
  plan->procid = procid;
  MPI_Cart_get(c_comm, 2, plan->np, plan->periods, plan->coord);
  plan->c_comm = c_comm;
  int *coord = plan->coord;
  MPI_Comm_split(c_comm, coord[0], coord[1], &plan->row_comm);
  MPI_Comm_split(c_comm, coord[1], coord[0], &plan->col_comm);
  plan->N[0] = n[0]; plan->N[1] = n[1]; plan->N[2] = n[2];

  plan->data = data_d;
  plan->data_out = data_out_d;
  plan->oneD = (plan->np[1] == 1);
  plan->inplace = (data_out_d == data_d);

  int *osize_0 = plan->osize_0, *ostart_0 = plan->ostart_0;
  int *osize_1 = plan->osize_1, *ostart_1 = plan->ostart_1;
  int *osize_2 = plan->osize_2, *ostart_2 = plan->ostart_2;
  int *osize_1i = plan->osize_1i, *ostart_1i = plan->ostart_1i;
  int *osize_2i = plan->osize_2i, *ostart_2i = plan->ostart_2i;

  int alloc_max = 0;
  int n_tuples_i, n_tuples_o;
  n_tuples_i = plan->inplace ? (n[2] / 2 + 1) * 2 : n[2];
  n_tuples_o = (n[2] / 2 + 1) * 2;

  int isize[3], osize[3], istart[3], ostart[3];
  alloc_max = accfft_local_size_dft_r2c_gpuf(n, isize, istart, osize, ostart,
                                             c_comm, plan->inplace);
  plan->alloc_max = alloc_max;

  dfft_get_local_size_gpuf(n[0], n[1], n_tuples_o, osize_0, ostart_0, c_comm);
  dfft_get_local_size_gpuf(n[0], n_tuples_o / 2, n[1], osize_1, ostart_1, c_comm);
  dfft_get_local_size_gpuf(n[1], n_tuples_o / 2, n[0], osize_2, ostart_2, c_comm);

  std::swap(osize_1[1], osize_1[2]);
  std::swap(ostart_1[1], ostart_1[2]);

  std::swap(ostart_2[1], ostart_2[2]);
  std::swap(ostart_2[0], ostart_2[1]);
  std::swap(osize_2[1], osize_2[2]);
  std::swap(osize_2[0], osize_2[1]);

  for (int i = 0; i < 3; i++) {
    osize_1i[i] = osize_1[i];
    osize_2i[i] = osize_2[i];
    ostart_1i[i] = ostart_1[i];
    ostart_2i[i] = ostart_2[i];
  }

  // fplan_0
  int NX = n[0], NY = n[1], NZ = n[2];
  cufftResult_t cufft_error;
  {
    int f_inembed[1] = {n_tuples_i};
    int f_onembed[1] = {n_tuples_o / 2};
    int idist = n_tuples_i;
    int odist = n_tuples_o / 2;
    int istride = 1;
    int ostride = 1;
    int batch = osize_0[0] * osize_0[1];

    if (batch != 0) {
      cufft_error = cufftPlanMany(&plan->fplan_0, 1, &n[2],
          f_inembed, istride, idist,   // *inembed, istride, idist
          f_onembed, ostride, odist,   // *onembed, ostride, odist
          CUFFT_R2C, batch);
      if (cufft_error != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: fplan_0 creation failed %d\n", cufft_error);
        return NULL;
      }
      if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "Cuda error: Failed at fplan cuda compatibility\n");
        return NULL;  /* was a bare `return;` in a pointer-returning function */
      }
    }
    if (batch != 0) {
      cufft_error = cufftPlanMany(&plan->iplan_0, 1, &n[2],
          f_onembed, ostride, odist,   // *onembed, ostride, odist
          f_inembed, istride, idist,   // *inembed, istride, idist
          CUFFT_C2R, batch);
      if (cufft_error != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: iplan_0 creation failed %d\n", cufft_error);
        return NULL;
      }
      if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "Cuda error: Failed at fplan cuda compatibility\n");
        return NULL;
      }
    }
  }
  // fplan_1
  {
    int f_inembed[1] = {NY};
    int f_onembed[1] = {NY};
    int idist = 1;
    int odist = 1;
    int istride = osize_1[2];
    int ostride = osize_1[2];
    int batch = osize_1[2];

    if (batch != 0) {
      cufft_error = cufftPlanMany(&plan->fplan_1, 1, &n[1],
          f_inembed, istride, idist,
          f_onembed, ostride, odist,
          CUFFT_C2C, batch);
      if (cufft_error != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: fplan_1 creation failed %d\n", cufft_error);
        return NULL;
      }
      if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "Cuda error: Failed at fplan cuda compatibility\n");
        return NULL;
      }
    }
  }
  // fplan_2
  {
    int f_inembed[1] = {NX};
    int f_onembed[1] = {NX};
    int idist = 1;
    int odist = 1;
    int istride = osize_2[1] * osize_2[2];
    int ostride = osize_2[1] * osize_2[2];
    int batch = osize_2[1] * osize_2[2];

    if (batch != 0) {
      cufft_error = cufftPlanMany(&plan->fplan_2, 1, &n[0],
          f_inembed, istride, idist,
          f_onembed, ostride, odist,
          CUFFT_C2C, batch);
      if (cufft_error != CUFFT_SUCCESS) {
        fprintf(stderr, "CUFFT error: fplan_2 creation failed %d\n", cufft_error);
        return NULL;
      }
      if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "Cuda error: Failed at fplan cuda compatibility\n");
        return NULL;
      }
    }
  }

  // 1D Decomposition
  if (plan->oneD) {
    int N0 = n[0], N1 = n[1];
    plan->Mem_mgr   = new Mem_Mgr_gpu<float>(N0, N1, n_tuples_o, c_comm);
    plan->T_plan_2  = new T_Plan_gpu<float>(N0, N1, n_tuples_o, plan->Mem_mgr, c_comm);
    plan->T_plan_2i = new T_Plan_gpu<float>(N1, N0, n_tuples_o, plan->Mem_mgr, c_comm);
    plan->T_plan_1 = NULL;
    plan->T_plan_1i = NULL;

    plan->alloc_max = alloc_max;
    plan->T_plan_2->alloc_local = alloc_max;
    plan->T_plan_2i->alloc_local = alloc_max;

    if (flags == ACCFFT_MEASURE) {
      plan->T_plan_2->which_fast_method_gpu(plan->T_plan_2, data_out_d);
    } else {
      plan->T_plan_2->method = 2;
      plan->T_plan_2->kway = 2;
    }
    checkCuda_accfft(cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    /* propagate the chosen transpose method */
    plan->T_plan_2i->method = plan->T_plan_2->method;
    plan->T_plan_2i->kway = plan->T_plan_2->kway;
  }

  // 2D Decomposition
  if (!plan->oneD) {
    // the reason for n_tuples/2 is to avoid splitting the real and imaginary
    // parts of complex numbers
    plan->Mem_mgr   = new Mem_Mgr_gpu<float>(n[1], n_tuples_o / 2, 2, plan->row_comm, osize_0[0], alloc_max);
    plan->T_plan_1  = new T_Plan_gpu<float>(n[1], n_tuples_o / 2, 2, plan->Mem_mgr, plan->row_comm, osize_0[0]);
    plan->T_plan_2  = new T_Plan_gpu<float>(n[0], n[1], osize_2[2] * 2, plan->Mem_mgr, plan->col_comm);
    plan->T_plan_2i = new T_Plan_gpu<float>(n[1], n[0], osize_2i[2] * 2, plan->Mem_mgr, plan->col_comm);
    plan->T_plan_1i = new T_Plan_gpu<float>(n_tuples_o / 2, n[1], 2, plan->Mem_mgr, plan->row_comm, osize_1i[0]);

    plan->T_plan_1->alloc_local = plan->alloc_max;
    plan->T_plan_2->alloc_local = plan->alloc_max;
    plan->T_plan_2i->alloc_local = plan->alloc_max;
    plan->T_plan_1i->alloc_local = plan->alloc_max;

    if (flags == ACCFFT_MEASURE) {
      if (coord[0] == 0) {
        plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1, data_out_d, osize_0[0]);
      }
    } else {
      plan->T_plan_1->method = 2;
      plan->T_plan_1->kway = 2;
    }
    MPI_Bcast(&plan->T_plan_1->method, 1, MPI_INT, 0, c_comm);
    MPI_Bcast(&plan->T_plan_1->kway, 1, MPI_INT, 0, c_comm);
    checkCuda_accfft(cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    /* propagate the chosen transpose method */
    plan->T_plan_2->method = plan->T_plan_1->method;
    plan->T_plan_2i->method = plan->T_plan_1->method;
    plan->T_plan_1i->method = plan->T_plan_1->method;
    plan->T_plan_2->kway = plan->T_plan_1->kway;
    plan->T_plan_2i->kway = plan->T_plan_1->kway;
    plan->T_plan_1i->kway = plan->T_plan_1->kway;

    plan->iplan_1 = -1;
    plan->iplan_2 = -1;
  }

  plan->r2c_plan_baked = true;
  return plan;
}
void fcs_memd_setup_communicator(memd_struct* memd, MPI_Comm communicator)
{
    /* store given communicator */
    memd->mpiparams.original_comm = communicator;
    MPI_Comm_size(communicator, &memd->mpiparams.size);

    /* Test whether the communicator is cartesian and has the correct dimensionality */
    int comm_is_cart = 0;
    int status;

    MPI_Topo_test(communicator, &status);
    if (status == MPI_CART) {
        /* Communicator is cartesian, so test dimensionality */
        int ndims;
        MPI_Cartdim_get(communicator, &ndims);
        if (ndims == 3) {
            /* Correct dimensionality, so get grid and test periodicity */
            int periodicity[3];
            MPI_Cart_get(communicator, 3, memd->mpiparams.node_grid, periodicity,
                         memd->mpiparams.node_pos);
            if (periodicity[0] && periodicity[1] && periodicity[2]) {
                /* If periodicity is correct, we can just use this communicator */
                memd->mpiparams.communicator = communicator;
                /* get the rank */
                MPI_Comm_rank(communicator, &memd->mpiparams.this_node);
                comm_is_cart = 1;
            }
        }
    }

    /* otherwise, we have to set up the cartesian communicator */
    if (!comm_is_cart) {
        memd->mpiparams.node_grid[0] = 0;
        memd->mpiparams.node_grid[1] = 0;
        memd->mpiparams.node_grid[2] = 0;

        /* compute node grid */
        MPI_Dims_create(memd->mpiparams.size, 3, memd->mpiparams.node_grid);
        /* swap first and last dimension, as MEMD currently wants to have them increasing */
        fcs_int tmp = memd->mpiparams.node_grid[2];
        memd->mpiparams.node_grid[2] = memd->mpiparams.node_grid[0];
        memd->mpiparams.node_grid[0] = tmp;

        /* create communicator */
        int periodicity[3] = {1, 1, 1};
        MPI_Cart_create(memd->mpiparams.original_comm, 3, memd->mpiparams.node_grid,
                        periodicity, 1, &memd->mpiparams.communicator);
        /* get the rank in the new communicator (reordering was allowed) */
        MPI_Comm_rank(memd->mpiparams.communicator, &memd->mpiparams.this_node);
        /* get node pos */
        MPI_Cart_coords(memd->mpiparams.communicator, memd->mpiparams.this_node, 3,
                        memd->mpiparams.node_pos);
    }

    /* fetch neighborhood info */
    for (int dir = 0; dir < 3; dir++) {
        MPI_Cart_shift(memd->mpiparams.communicator, dir, 1,
                       &memd->mpiparams.node_neighbors[2*dir],
                       &memd->mpiparams.node_neighbors[2*dir+1]);
    }

    /* init local points */
    for (int i = 0; i < 3; i++) {
        memd->mpiparams.my_left[i] = 0.0;
        memd->mpiparams.my_right[i] = 0.0;
    }

    /* compute box limits */
    fcs_float local_box_length = 0.0;
    for (fcs_int i = 0; i < 3; i++) {
        local_box_length = memd->parameters.box_length[i] / (fcs_float)memd->mpiparams.node_grid[i];
        memd->mpiparams.my_left[i] = memd->mpiparams.node_pos[i] * local_box_length;
        memd->mpiparams.my_right[i] = (memd->mpiparams.node_pos[i] + 1) * local_box_length;
    }
}
int Comm::setup(MMD_float cutneigh, Atom &atom)
{
  int i;
  int nprocs;
  int periods[3];
  MMD_float prd[3];
  int myloc[3];
  MPI_Comm cartesian;
  MMD_float lo, hi;
  int ineed, idim, nbox;

  prd[0] = atom.box.xprd;
  prd[1] = atom.box.yprd;
  prd[2] = atom.box.zprd;

  /* setup 3-d grid of procs */

  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MMD_float area[3];
  area[0] = prd[0] * prd[1];
  area[1] = prd[0] * prd[2];
  area[2] = prd[1] * prd[2];

  MMD_float bestsurf = 2.0 * (area[0] + area[1] + area[2]);

  // loop thru all possible factorizations of nprocs
  // surf = surface area of a proc sub-domain
  // for 2d, ensure ipz = 1

  int ipx, ipy, ipz, nremain;
  MMD_float surf;

  ipx = 1;

  while (ipx <= nprocs) {
    if (nprocs % ipx == 0) {
      nremain = nprocs / ipx;
      ipy = 1;

      while (ipy <= nremain) {
        if (nremain % ipy == 0) {
          ipz = nremain / ipy;
          surf = area[0] / ipx / ipy + area[1] / ipx / ipz + area[2] / ipy / ipz;

          if (surf < bestsurf) {
            bestsurf = surf;
            procgrid[0] = ipx;
            procgrid[1] = ipy;
            procgrid[2] = ipz;
          }
        }

        ipy++;
      }
    }

    ipx++;
  }

  if (procgrid[0] * procgrid[1] * procgrid[2] != nprocs) {
    if (me == 0) printf("ERROR: Bad grid of processors\n");

    return 1;
  }

  /* determine where I am and my neighboring procs in 3d grid of procs */

  int reorder = 0;
  periods[0] = periods[1] = periods[2] = 1;

  MPI_Cart_create(MPI_COMM_WORLD, 3, procgrid, periods, reorder, &cartesian);
  MPI_Cart_get(cartesian, 3, procgrid, periods, myloc);
  MPI_Cart_shift(cartesian, 0, 1, &procneigh[0][0], &procneigh[0][1]);
  MPI_Cart_shift(cartesian, 1, 1, &procneigh[1][0], &procneigh[1][1]);
  MPI_Cart_shift(cartesian, 2, 1, &procneigh[2][0], &procneigh[2][1]);

  /* lo/hi = my local box bounds */

  atom.box.xlo = myloc[0] * prd[0] / procgrid[0];
  atom.box.xhi = (myloc[0] + 1) * prd[0] / procgrid[0];
  atom.box.ylo = myloc[1] * prd[1] / procgrid[1];
  atom.box.yhi = (myloc[1] + 1) * prd[1] / procgrid[1];
  atom.box.zlo = myloc[2] * prd[2] / procgrid[2];
  atom.box.zhi = (myloc[2] + 1) * prd[2] / procgrid[2];

  /* need = # of boxes I need atoms from in each dimension */

  need[0] = static_cast<int>(cutneigh * procgrid[0] / prd[0] + 1);
  need[1] = static_cast<int>(cutneigh * procgrid[1] / prd[1] + 1);
  need[2] = static_cast<int>(cutneigh * procgrid[2] / prd[2] + 1);

  /* alloc comm memory */

  int maxswap = 2 * (need[0] + need[1] + need[2]);

  slablo = (MMD_float*) malloc(maxswap * sizeof(MMD_float));
  slabhi = (MMD_float*) malloc(maxswap * sizeof(MMD_float));
  pbc_any = (int*) malloc(maxswap * sizeof(int));
  pbc_flagx = (int*) malloc(maxswap * sizeof(int));
  pbc_flagy = (int*) malloc(maxswap * sizeof(int));
  pbc_flagz = (int*) malloc(maxswap * sizeof(int));
  sendproc = (int*) malloc(maxswap * sizeof(int));
  recvproc = (int*) malloc(maxswap * sizeof(int));
  sendproc_exc = (int*) malloc(maxswap * sizeof(int));
  recvproc_exc = (int*) malloc(maxswap * sizeof(int));
  sendnum = (int*) malloc(maxswap * sizeof(int));
  recvnum = (int*) malloc(maxswap * sizeof(int));
  comm_send_size = (int*) malloc(maxswap * sizeof(int));
  comm_recv_size = (int*) malloc(maxswap * sizeof(int));
  reverse_send_size = (int*) malloc(maxswap * sizeof(int));
  reverse_recv_size = (int*) malloc(maxswap * sizeof(int));

  int iswap = 0;

  for (int idim = 0; idim < 3; idim++)
    for (int i = 1; i <= need[idim]; i++, iswap += 2) {
      MPI_Cart_shift(cartesian, idim, i, &sendproc_exc[iswap], &sendproc_exc[iswap + 1]);
      MPI_Cart_shift(cartesian, idim, i, &recvproc_exc[iswap + 1], &recvproc_exc[iswap]);
    }

  MPI_Comm_free(&cartesian);

  firstrecv = (int*) malloc(maxswap * sizeof(int));
  maxsendlist = (int*) malloc(maxswap * sizeof(int));

  for (i = 0; i < maxswap; i++) maxsendlist[i] = BUFMIN;

  sendlist = (int**) malloc(maxswap * sizeof(int*));

  for (i = 0; i < maxswap; i++) sendlist[i] = (int*) malloc(BUFMIN * sizeof(int));

  /* setup 4 parameters for each exchange: (spart,rpart,slablo,slabhi)
     sendproc(nswap) = proc to send to at each swap
     recvproc(nswap) = proc to recv from at each swap
     slablo/slabhi(nswap) = slab boundaries (in correct dimension) of atoms
                            to send at each swap
     1st part of if statement is sending to the west/south/down
     2nd part of if statement is sending to the east/north/up
     nbox = atoms I send originated in this box */

  /* set commflag if atoms are being exchanged across a box boundary
     commflag(idim,nswap) =  0 -> not across a boundary
                          =  1 -> add box-length to position when sending
                          = -1 -> subtract box-length from pos when sending */

  nswap = 0;

  for (idim = 0; idim < 3; idim++) {
    for (ineed = 0; ineed < 2 * need[idim]; ineed++) {
      pbc_any[nswap] = 0;
      pbc_flagx[nswap] = 0;
      pbc_flagy[nswap] = 0;
      pbc_flagz[nswap] = 0;

      if (ineed % 2 == 0) {
        sendproc[nswap] = procneigh[idim][0];
        recvproc[nswap] = procneigh[idim][1];
        nbox = myloc[idim] + ineed / 2;
        lo = nbox * prd[idim] / procgrid[idim];

        if (idim == 0) hi = atom.box.xlo + cutneigh;
        if (idim == 1) hi = atom.box.ylo + cutneigh;
        if (idim == 2) hi = atom.box.zlo + cutneigh;

        hi = MIN(hi, (nbox + 1) * prd[idim] / procgrid[idim]);

        if (myloc[idim] == 0) {
          pbc_any[nswap] = 1;

          if (idim == 0) pbc_flagx[nswap] = 1;
          if (idim == 1) pbc_flagy[nswap] = 1;
          if (idim == 2) pbc_flagz[nswap] = 1;
        }
      } else {
        sendproc[nswap] = procneigh[idim][1];
        recvproc[nswap] = procneigh[idim][0];
        nbox = myloc[idim] - ineed / 2;
        hi = (nbox + 1) * prd[idim] / procgrid[idim];

        if (idim == 0) lo = atom.box.xhi - cutneigh;
        if (idim == 1) lo = atom.box.yhi - cutneigh;
        if (idim == 2) lo = atom.box.zhi - cutneigh;

        lo = MAX(lo, nbox * prd[idim] / procgrid[idim]);

        if (myloc[idim] == procgrid[idim] - 1) {
          pbc_any[nswap] = 1;

          if (idim == 0) pbc_flagx[nswap] = -1;
          if (idim == 1) pbc_flagy[nswap] = -1;
          if (idim == 2) pbc_flagz[nswap] = -1;
        }
      }

      slablo[nswap] = lo;
      slabhi[nswap] = hi;
      nswap++;
    }
  }

  return 0;
}
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    //get command line arguments
    //CHANGE TO MATCH ./life in.file out.file eventually
    if (argc != 3) {
        printf("USAGE: lifegrid m n\n");
        MPI_Finalize();
        exit(EXIT_FAILURE);
    }
    sscanf(argv[1], "%d", &m);
    sscanf(argv[2], "%d", &n);

    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    comm_grid_dims[0] = comm_grid_dims[1] = (int) sqrt(world_size);
    int periods[] = {0, 0};
    int reorder = 0;
    MPI_Cart_create(MPI_COMM_WORLD, DIMENSIONS, comm_grid_dims, periods, reorder, &grid_comm);

    int my_coords[2];
    MPI_Cart_get(grid_comm, DIMENSIONS, comm_grid_dims, periods, my_coords);

    //initialize a grid with some crap in it
    //over commit memory, provide a padding of one row/column extra
    //around the outside of the array
    current_generation_grid = (int*) calloc((m + 2) * (n + 2), sizeof(int));
    int i;
    int write_loc = n + 1;    //skip to end of first row
    for (i = 0; i < m * n; i++) {
        if (i % n == 0) {
            write_loc += 2;
        }
        current_generation_grid[write_loc++] = i + world_rank * 100;
    }
    //REPLACE WITH FILE READ EVENTUALLY

    next_generation_grid = (int*) calloc((m + 2) * (n + 2), sizeof(int));

    create_up_communication();
    create_down_communication();
    init_column_t();
    create_left_communication();
    create_right_communication();

    int current_generation;
    for (current_generation = 0; current_generation < generations; current_generation++) {
        simulate_generation();
    }

    //the game.. is over
    MPI_Datatype final_grid_t;
    MPI_Type_vector(m, n, n + 2, MPI_INT, &final_grid_t);
    MPI_Type_commit(&final_grid_t);

    if (world_rank == 0) {
        int **result_grid = (int**) malloc(world_size * sizeof(int*));
        MPI_Request result_reqs[world_size - 1];
        int z;
        for (z = 1; z < world_size; z++) {
            result_grid[z] = (int*) malloc(m * n * sizeof(int));
            MPI_Irecv(result_grid[z], m * n, MPI_INT, z, TAG, grid_comm, &result_reqs[z - 1]);
        }
        result_grid[0] = (int*) malloc(m * n * sizeof(int));
        for (z = 0; z < m; z++) {
            /* copy this rank's own interior rows (the original referenced an
               undeclared `grid`; the live board is current_generation_grid) */
            memcpy(&result_grid[0][n * z],
                   &current_generation_grid[(n + 3) + z * (n + 2)],
                   n * sizeof(int));
        }
        MPI_Waitall(world_size - 1, result_reqs, MPI_STATUSES_IGNORE);

        printf("FINAL BOARD:\n\n");
        for (z = 0; z < comm_grid_dims[0]; z++) {
            int y;
            int row;
            for (row = 0; row < m; row++) {
                for (y = 0; y < comm_grid_dims[1]; y++) {
                    int x;
                    for (x = 0; x < n; x++) {
                        printf("%4d ", result_grid[z * comm_grid_dims[1] + y][row * n + x]);
                    }
                }
                printf("\n");
            }
        }
    } else {
        MPI_Request result_req;
        MPI_Isend(&current_generation_grid[n + 3], 1, final_grid_t, 0, TAG, grid_comm, &result_req);
        MPI_Wait(&result_req, MPI_STATUS_IGNORE);
    }

    MPI_Finalize();
    return 0;
}
/**
 * Creates a 3D C2C parallel FFT plan. If data_out_d points to the same location as the
 * input data_d, an in-place plan is created; otherwise the plan is out-of-place.
 * @param n Integer array of size 3, corresponding to the global data size.
 * @param data_d Input data in the spatial domain.
 * @param data_out_d Output data in the frequency domain.
 * @param c_comm Cartesian communicator returned by \ref accfft_create_comm.
 * @param flags AccFFT flags, see \ref flags for more details.
 * @return The plan on success, NULL on failure.
 */
accfft_plan_gpu* accfft_plan_dft_3d_c2c_gpu(int * n, Complex * data_d, Complex * data_out_d,
    MPI_Comm c_comm, unsigned flags){

  accfft_plan_gpu *plan = new accfft_plan_gpu;
  int procid;
  MPI_Comm_rank(c_comm, &procid);
  plan->procid = procid;
  MPI_Cart_get(c_comm, 2, plan->np, plan->periods, plan->coord);
  plan->c_comm = c_comm;
  int *coord = plan->coord;
  MPI_Comm_split(c_comm, coord[0], coord[1], &plan->row_comm);
  MPI_Comm_split(c_comm, coord[1], coord[0], &plan->col_comm);
  plan->N[0] = n[0]; plan->N[1] = n[1]; plan->N[2] = n[2];

  int NX = n[0], NY = n[1], NZ = n[2];
  cufftResult_t cufft_error;

  plan->data_c = data_d;
  plan->data_out_c = data_out_d;

  if(data_out_d == data_d){
    plan->inplace = true;
  } else {
    plan->inplace = false;
  }

  if(plan->np[1] == 1)
    plan->oneD = true;
  else
    plan->oneD = false;

  int *osize_0  = plan->osize_0,  *ostart_0  = plan->ostart_0;
  int *osize_1  = plan->osize_1,  *ostart_1  = plan->ostart_1;
  int *osize_2  = plan->osize_2,  *ostart_2  = plan->ostart_2;
  int *osize_1i = plan->osize_1i, *ostart_1i = plan->ostart_1i;
  int *osize_2i = plan->osize_2i, *ostart_2i = plan->ostart_2i;

  int alloc_max = 0;
  alloc_max = accfft_local_size_dft_c2c_gpu(n, plan->isize, plan->istart, plan->osize,
      plan->ostart, c_comm);
  plan->alloc_max = alloc_max;

  dfft_get_local_size_gpu(n[0], n[1], n[2], osize_0, ostart_0, c_comm);
  dfft_get_local_size_gpu(n[0], n[2], n[1], osize_1, ostart_1, c_comm);
  dfft_get_local_size_gpu(n[1], n[2], n[0], osize_2, ostart_2, c_comm);

  std::swap(osize_1[1], osize_1[2]);
  std::swap(ostart_1[1], ostart_1[2]);

  std::swap(ostart_2[1], ostart_2[2]);
  std::swap(ostart_2[0], ostart_2[1]);
  std::swap(osize_2[1], osize_2[2]);
  std::swap(osize_2[0], osize_2[1]);

  for(int i = 0; i < 3; i++){
    osize_1i[i] = osize_1[i];
    osize_2i[i] = osize_2[i];
    ostart_1i[i] = ostart_1[i];
    ostart_2i[i] = ostart_2[i];
  }

  // fplan_0: batched 1D FFTs of length NZ along the contiguous z dimension
  {
    int f_inembed[1] = {NZ};
    int f_onembed[1] = {NZ};
    int idist = NZ;
    int odist = NZ;
    int istride = 1;
    int ostride = 1;
    int batch = osize_0[0] * osize_0[1];

    if(batch != 0){
      cufft_error = cufftPlanMany(&plan->fplan_0, 1, &n[2],
          f_inembed, istride, idist,   // *inembed, istride, idist
          f_onembed, ostride, odist,   // *onembed, ostride, odist
          CUFFT_Z2Z, batch);

      if(cufft_error != CUFFT_SUCCESS){
        fprintf(stderr, "CUFFT error: fplan_0 creation failed %d\n", cufft_error);
        return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING);
      if(cudaGetLastError() != cudaSuccess){
        fprintf(stderr, "Cuda error: failed at fplan cuda compatibility\n");
        return NULL;
      }
    }
  }

  // fplan_1: strided 1D FFTs of length NY after the first transpose
  {
    int f_inembed[1] = {NY};
    int f_onembed[1] = {NY};
    int idist = 1;
    int odist = 1;
    int istride = osize_1[2];
    int ostride = osize_1[2];
    int batch = osize_1[2];

    if(batch != 0){
      cufft_error = cufftPlanMany(&plan->fplan_1, 1, &n[1],
          f_inembed, istride, idist,   // *inembed, istride, idist
          f_onembed, ostride, odist,   // *onembed, ostride, odist
          CUFFT_Z2Z, batch);

      if(cufft_error != CUFFT_SUCCESS){
        fprintf(stderr, "CUFFT error: fplan_1 creation failed %d\n", cufft_error);
        return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING);
      if(cudaGetLastError() != cudaSuccess){
        fprintf(stderr, "Cuda error: failed at fplan cuda compatibility\n");
        return NULL;
      }
    }
  }
  // fplan_2: strided 1D FFTs of length NX after the second transpose
  {
    int f_inembed[1] = {NX};
    int f_onembed[1] = {NX};
    int idist = 1;
    int odist = 1;
    int istride = osize_2[1] * osize_2[2];
    int ostride = osize_2[1] * osize_2[2];
    int batch = osize_2[1] * osize_2[2];

    if(batch != 0){
      cufft_error = cufftPlanMany(&plan->fplan_2, 1, &n[0],
          f_inembed, istride, idist,   // *inembed, istride, idist
          f_onembed, ostride, odist,   // *onembed, ostride, odist
          CUFFT_Z2Z, batch);

      if(cufft_error != CUFFT_SUCCESS){
        fprintf(stderr, "CUFFT error: fplan_2 creation failed %d\n", cufft_error);
        return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING);
      if(cudaGetLastError() != cudaSuccess){
        fprintf(stderr, "Cuda error: failed at fplan cuda compatibility\n");
        return NULL;
      }
    }
  }

  // 1D Decomposition
  if(plan->oneD){
    plan->alloc_max = alloc_max;

    plan->Mem_mgr   = new Mem_Mgr_gpu<double>(NX, NY, NZ * 2, c_comm);
    plan->T_plan_2  = new T_Plan_gpu<double>(NX, NY, NZ * 2, plan->Mem_mgr, c_comm);
    plan->T_plan_2i = new T_Plan_gpu<double>(NY, NX, NZ * 2, plan->Mem_mgr, c_comm);

    plan->T_plan_2->alloc_local  = alloc_max;
    plan->T_plan_2i->alloc_local = alloc_max;
    plan->T_plan_1  = NULL;
    plan->T_plan_1i = NULL;

    if(flags == ACCFFT_MEASURE){
      plan->T_plan_2->which_fast_method_gpu(plan->T_plan_2, (double*)data_out_d);
    } else {
      plan->T_plan_2->method = 2;
      plan->T_plan_2->kway = 2;
    }

    checkCuda_accfft(cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    plan->T_plan_2i->method     = -plan->T_plan_2->method;
    plan->T_plan_2i->kway       = plan->T_plan_2->kway;
    plan->T_plan_2i->kway_async = plan->T_plan_2->kway_async;
  } // end 1d c2c

  // 2D Decomposition
  if(!plan->oneD){
    // the reason for the tuple size 2 is to avoid splitting the real and
    // imaginary parts of complex numbers across processes
    plan->Mem_mgr   = new Mem_Mgr_gpu<double>(n[1], n[2], 2, plan->row_comm, osize_0[0], alloc_max);
    plan->T_plan_1  = new T_Plan_gpu<double>(n[1], n[2], 2, plan->Mem_mgr, plan->row_comm, osize_0[0]);
    plan->T_plan_2  = new T_Plan_gpu<double>(n[0], n[1], 2 * osize_2[2], plan->Mem_mgr, plan->col_comm);
    plan->T_plan_2i = new T_Plan_gpu<double>(n[1], n[0], 2 * osize_2i[2], plan->Mem_mgr, plan->col_comm);
    plan->T_plan_1i = new T_Plan_gpu<double>(n[2], n[1], 2, plan->Mem_mgr, plan->row_comm, osize_1i[0]);

    plan->T_plan_1->alloc_local  = plan->alloc_max;
    plan->T_plan_2->alloc_local  = plan->alloc_max;
    plan->T_plan_2i->alloc_local = plan->alloc_max;
    plan->T_plan_1i->alloc_local = plan->alloc_max;

    plan->iplan_0 = NULL;
    plan->iplan_1 = NULL;
    plan->iplan_2 = NULL;

    // plan->coord was filled by MPI_Cart_get above, so no second query is needed
    if(flags == ACCFFT_MEASURE){
      if(plan->coord[0] == 0){
        plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1, (double*)data_out_d, osize_0[0]);
      }
    } else {
      plan->T_plan_1->method = 2;
      plan->T_plan_1->kway = 2;
    }

    MPI_Bcast(&plan->T_plan_1->method, 1, MPI_INT, 0, c_comm);
    MPI_Bcast(&plan->T_plan_1->kway, 1, MPI_INT, 0, c_comm);
    MPI_Bcast(&plan->T_plan_1->kway_async, 1, MPI_CXX_BOOL, 0, c_comm);

    checkCuda_accfft(cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    // propagate the method chosen for T_plan_1 to the remaining transposes;
    // the inverse transposes use the negated method
    plan->T_plan_2->method  = plan->T_plan_1->method;
    plan->T_plan_2i->method = -plan->T_plan_1->method;
    plan->T_plan_1i->method = -plan->T_plan_1->method;

    plan->T_plan_2->kway  = plan->T_plan_1->kway;
    plan->T_plan_2i->kway = plan->T_plan_1->kway;
    plan->T_plan_1i->kway = plan->T_plan_1->kway;

    plan->T_plan_2->kway_async  = plan->T_plan_1->kway_async;
    plan->T_plan_2i->kway_async = plan->T_plan_1->kway_async;
    plan->T_plan_1i->kway_async = plan->T_plan_1->kway_async;
  } // end 2d c2c
  plan->c2c_plan_baked = true;

  return plan;
} // end accfft_plan_dft_3d_c2c_gpu
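/* Hedged usage sketch for the planner above: build the 2D process grid with
   accfft_create_comm, query the local sizes, allocate device memory, create
   an in-place plan, and run a forward/backward pair. The helper names
   (accfft_create_comm, accfft_execute_c2c_gpu, accfft_destroy_plan,
   accfft_cleanup_gpu) follow AccFFT's examples, but treat their exact
   signatures here as assumptions, not a verified interface. */
#include <mpi.h>
#include <cuda_runtime.h>
#include <accfft_gpu.h>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);

    /* build the 2D Cartesian communicator consumed by the planner;
       {0,0} lets AccFFT choose the process grid */
    int c_dims[2] = {0, 0};
    MPI_Comm c_comm;
    accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);

    int n[3] = {128, 128, 128};   /* illustrative global size */
    int isize[3], istart[3], osize[3], ostart[3];
    int alloc_max = accfft_local_size_dft_c2c_gpu(n, isize, istart, osize, ostart, c_comm);

    Complex *data_d;
    cudaMalloc((void**)&data_d, alloc_max);

    /* passing the same pointer twice requests an in-place plan */
    accfft_plan_gpu *plan = accfft_plan_dft_3d_c2c_gpu(n, data_d, data_d, c_comm, ACCFFT_MEASURE);

    /* ... fill data_d with the local slab of the input ... */
    accfft_execute_c2c_gpu(plan, ACCFFT_FORWARD, data_d, data_d);
    accfft_execute_c2c_gpu(plan, ACCFFT_BACKWARD, data_d, data_d);

    accfft_destroy_plan(plan);
    cudaFree(data_d);
    accfft_cleanup_gpu();
    MPI_Finalize();
    return 0;
}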