int main(int argc, char* argv[]) {
    MPI_Init(&argc,&argv);

    /* per-node (shared memory) communicator; the MPI_-style variable name is kept from the original source */
    MPI_Comm MPI_COMM_NODE = MPI_COMM_NULL;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &MPI_COMM_NODE);

    int n = (argc>1) ? atoi(argv[1]) : 1000;

    int wrank, wsize;
    MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
    MPI_Comm_size(MPI_COMM_WORLD, &wsize);

    /* node rank/size must come from the node communicator, not MPI_COMM_WORLD */
    int nrank, nsize;
    MPI_Comm_rank(MPI_COMM_NODE, &nrank);
    MPI_Comm_size(MPI_COMM_NODE, &nsize);

    char * buf1 = NULL;
    char * buf2 = NULL;
    MPI_Alloc_mem(n, MPI_INFO_NULL, &buf1);
    MPI_Alloc_mem(n, MPI_INFO_NULL, &buf2);
    memset(buf1, nrank==0 ? 'Z' : 'A', n);
    memset(buf2, nrank==0 ? 'Z' : 'A', n);

    double t0, t1, dt;
    for (int r=0; r<20; r++) {
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        MPI_Bcast(buf1, n, MPI_CHAR, 0, MPI_COMM_NODE);
        t1 = MPI_Wtime();
        dt = t1-t0;
        printf("%d: MPI_Bcast: %lf seconds, %lf MB/s \n", wrank, dt, n*(1.e-6/dt));
        fflush(stdout);

        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        SMP_Bcast(buf2, n, MPI_CHAR, 0, MPI_COMM_NODE); /* user-provided shared-memory broadcast */
        t1 = MPI_Wtime();
        dt = t1-t0;
        printf("%d: SMP_Bcast: %lf seconds, %lf MB/s \n", wrank, dt, n*(1.e-6/dt));
        fflush(stdout);

        if (r==0) {
            char * tmp = malloc(n);
            memset(tmp, 'Z', n);
            int err1 = memcmp(tmp, buf1, n);
            int err2 = memcmp(tmp, buf2, n);
            /* memcmp may return a negative value, so test for any nonzero result */
            if (err1!=0 || err2!=0) {
                printf("%d: errors: MPI (%d), SMP (%d) \n", wrank, err1, err2);
            }
            free(tmp);
        }
    }

    MPI_Free_mem(buf1);
    MPI_Free_mem(buf2);
    MPI_Comm_free(&MPI_COMM_NODE);
    MPI_Finalize();
    return 0;
}
int MashmIntraNodeCommInit(MashmIntraNodeComm* intraComm, MPI_Comm in_comm) {
  int ierr;
  /* Store the parent communicator */
  intraComm->parentComm = in_comm;
  /* Create the shared memory (per-node) subcommunicator */
  ierr = MPI_Comm_split_type(intraComm->parentComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(intraComm->comm));
  /* Determine the size of the nodal communicator and this rank's place within it */
  ierr = MPI_Comm_size(intraComm->comm, &(intraComm->size));
  ierr = MPI_Comm_rank(intraComm->comm, &(intraComm->rank));
  ierr = MPI_Comm_rank(intraComm->parentComm, &(intraComm->parentRank));
  /* Subcommunicator rank == 0 is the master process */
  if (intraComm->rank == 0) {
    intraComm->isMasterProc = true;
  } else {
    intraComm->isMasterProc = false;
  }
  intraComm->parentRanksOnNode = (int *) malloc(sizeof(int)*intraComm->size);
  ierr = MPI_Allgather(&(intraComm->parentRank), 1, MPI_INT, intraComm->parentRanksOnNode, 1, MPI_INT, intraComm->comm);
  /* propagate the last collected MPI error code (MPI_SUCCESS == 0) */
  return ierr;
}
void MemoryUsageReporter::sharedMemoryRanksBySplitCommunicator() { // figure out which ranks share memory processor_id_type world_rank = 0; #ifdef LIBMESH_HAVE_MPI // create a split communicator among shared memory ranks MPI_Comm shmem_raw_comm; MPI_Comm_split_type( _mur_communicator.get(), MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmem_raw_comm); Parallel::Communicator shmem_comm(shmem_raw_comm); // broadcast the world rank of the sub group root world_rank = _my_rank; shmem_comm.broadcast(world_rank, 0); #endif std::vector<processor_id_type> world_ranks(_nrank); _mur_communicator.gather(0, world_rank, world_ranks); // assign a contiguous unique numerical id to each shared memory group on processor zero unsigned int id = 0; processor_id_type last = world_ranks[0]; if (_my_rank == 0) for (std::size_t i = 0; i < _nrank; ++i) { if (world_ranks[i] != last) { last = world_ranks[i]; id++; } _hardware_id[i] = id; } }
int main(int argc, char* argv[]) {
    MPI_Init(&argc,&argv);

    MPI_Aint bytes = (argc>1) ? atol(argv[1]) : 128*1024*1024;
    printf("bytes = %zu\n", (size_t)bytes); /* MPI_Aint is not size_t, so cast for %zu */

    MPI_Comm comm_shared = MPI_COMM_NULL;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0 /* key */, MPI_INFO_NULL, &comm_shared);

    MPI_Info info_win = MPI_INFO_NULL;
    MPI_Info_create(&info_win);
    MPI_Info_set(info_win, "alloc_shared_noncontig", "true");

    MPI_Win win_shared = MPI_WIN_NULL;
    void * base_ptr = NULL;
    int rc = MPI_Win_allocate_shared(bytes, 1 /* disp_unit */, info_win, comm_shared, &base_ptr, &win_shared);
    if (rc != MPI_SUCCESS || base_ptr == NULL) MPI_Abort(MPI_COMM_WORLD, 1);

    memset(base_ptr, 255, bytes);

    MPI_Info_free(&info_win);
    MPI_Win_free(&win_shared); /* the window must be freed before MPI_Finalize */
    MPI_Comm_free(&comm_shared);
    MPI_Finalize();
    return 0;
}
int main(int argc, char* argv[]) {
    MPI_Comm comm, newcomm, scomm;
    MPI_Group group;
    int rank, size, color;
    int errs = 0, errclass, mpi_errno;

    MTest_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    MPI_Comm_dup(MPI_COMM_WORLD, &comm);
    MPI_Comm_group(comm, &group);
    MPI_Comm_create(comm, group, &newcomm);
    color = rank % 2;
    MPI_Comm_split(MPI_COMM_WORLD, color, rank, &scomm);

    /* MPI_Errhandler_set is deprecated/removed; use MPI_Comm_set_errhandler */
    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

    /* test MPI_Comm_split_type with a NULL newcomm argument
       (pass MPI_INFO_NULL rather than an uninitialized info handle) */
    mpi_errno = MPI_Comm_split_type(scomm, 2, 4, MPI_INFO_NULL, NULL);
    MPI_Error_class(mpi_errno, &errclass);
    if (errclass != MPI_ERR_ARG)
        ++errs;

    MPI_Comm_free(&comm);
    MPI_Comm_free(&newcomm);
    MPI_Comm_free(&scomm);
    MPI_Group_free(&group);
    MTest_Finalize(errs);
    MPI_Finalize();
    return 0;
}
void selectDeviceBasedOnMpiRank()
{
#ifdef WALBERLA_BUILD_WITH_MPI
   int deviceCount;
   WALBERLA_CUDA_CHECK( cudaGetDeviceCount( &deviceCount ));
   WALBERLA_LOG_INFO_ON_ROOT( "Selecting CUDA device depending on MPI Rank" );

   /* split the world communicator into per-node (shared memory) communicators */
   MPI_Info info;
   MPI_Info_create( &info );
   MPI_Comm newCommunicator;
   MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, info, &newCommunicator );

   int processesOnNode;
   int rankOnNode;
   MPI_Comm_size( newCommunicator, &processesOnNode );
   MPI_Comm_rank( newCommunicator, &rankOnNode );

   if ( deviceCount == processesOnNode )
   {
      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ));
   }
   else if ( deviceCount > processesOnNode )
   {
      WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node " << processesOnNode <<
                            " available GPUs on node " << deviceCount );
      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ));
   }
   else
   {
      WALBERLA_LOG_WARNING( "Too many processes started per node - should be one per GPU. Number of processes per node "
                            << processesOnNode << ", available GPUs on node " << deviceCount );
      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode % deviceCount ));
   }

   /* release the temporary node communicator and info object */
   MPI_Comm_free( &newCommunicator );
   MPI_Info_free( &info );
#endif
}
int main(int argc, char *argv[]) {
    int requested=MPI_THREAD_MULTIPLE, provided;
    MPI_Init_thread(&argc, &argv, requested, &provided);
    /* abort if the library provides less than the requested thread level */
    if (provided<requested) MPI_Abort(MPI_COMM_WORLD, 1);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int sum = 0;
    MPI_Allreduce(&rank, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    if (sum != (size*(size-1)/2) ) MPI_Abort(MPI_COMM_WORLD,2);

    MPI_Comm nodecomm = MPI_COMM_NULL;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &nodecomm);

    int bytes = 1024;
    MPI_Win nodewin = MPI_WIN_NULL;
    void * nodeptr = NULL;
    MPI_Win_allocate_shared((MPI_Aint)bytes, 1, MPI_INFO_NULL, nodecomm, &nodeptr, &nodewin);

    int noderank;
    MPI_Comm_rank(nodecomm, &noderank);
    if (noderank==0) memset(nodeptr,1,(size_t)bytes);

    MPI_Win_free(&nodewin);
    MPI_Comm_free(&nodecomm);

    if (rank==0) printf("Success\n");
    MPI_Finalize();
    return 0;
}
mpi_shmem ( MPI_Comm comm=MPI_COMM_WORLD ) : comm_( comm ) { // Split the provided communicator into groups that share // memory (are on the same node). if ( MPI_Comm_split_type( comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm_ ) ) throw std::runtime_error( "Failed to split communicator by node." ); if ( MPI_Comm_size( shmcomm_, &ntasks_ ) ) throw std::runtime_error( "Failed to get node communicator size" ); if ( MPI_Comm_rank( shmcomm_, &rank_ ) ) throw std::runtime_error( "Failed to get node communicator rank" ); }
int main(int argc, char * argv[]) { MPI_Init(&argc, &argv); int wrank, wsize; MPI_Comm_rank(MPI_COMM_WORLD, &wrank); MPI_Comm_size(MPI_COMM_WORLD, &wsize); int nrank, nsize; MPI_Comm MPI_COMM_NODE; MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0 /* key */, MPI_INFO_NULL, &MPI_COMM_NODE); MPI_Comm_rank(MPI_COMM_NODE, &nrank); MPI_Comm_size(MPI_COMM_NODE, &nsize); int * shptr = NULL; MPI_Win shwin; MPI_Info win_info; MPI_Info_create(&win_info); MPI_Info_set(win_info, "alloc_shared_noncontig", "true"); MPI_Win_allocate_shared(sizeof(int), sizeof(int), win_info, MPI_COMM_NODE, &shptr, &shwin); MPI_Info_free(&win_info); MPI_Win_lock_all(0 /* assertion */, shwin); MPI_Win_sync(shwin); MPI_Barrier(MPI_COMM_NODE); MPI_Aint rsize[nsize]; int rdisp[nsize]; int * rptr[nsize]; for (int i=0; i<nsize; i++) { MPI_Win_shared_query(shwin, i, &(rsize[i]), &(rdisp[i]), &(rptr[i])); printf("rank=%d target=%d rptr=%p rsize=%zu rdisp=%d \n", nrank, i, rptr[i], (size_t)rsize[i], rdisp[i]); } MPI_Win_unlock_all(shwin); MPI_Win_free(&shwin); MPI_Comm_free(&MPI_COMM_NODE); MPI_Finalize(); return 0; }
int main(int argc, char **argv) { int r,p; int n, energy, niters, px, py; int rx, ry; int north, south, west, east; int bx, by, offx, offy; /* three heat sources */ const int nsources = 3; int sources[nsources][2]; int locnsources; /* number of sources in my area */ int locsources[nsources][2]; /* sources local to my rank */ double t1, t2; int iter, i, j; double heat, rheat; int final_flag; /* initialize MPI envrionment */ MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &r); MPI_Comm_size(MPI_COMM_WORLD, &p); /* create shared memory communicator */ MPI_Comm shmcomm; MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm); int sr, sp; // rank and size in shmem comm MPI_Comm_size(shmcomm, &sp); MPI_Comm_rank(shmcomm, &sr); // this code works only on comm world! if(sp != p) MPI_Abort(MPI_COMM_WORLD, 1); /* argument checking and setting */ setup(r, p, argc, argv, &n, &energy, &niters, &px, &py, &final_flag); if (final_flag == 1) { MPI_Finalize(); exit(0); } /* determine my coordinates (x,y) -- r=x*a+y in the 2d processor array */ rx = r % px; ry = r / px; /* determine my four neighbors */ north = (ry - 1) * px + rx; if (ry-1 < 0) north = MPI_PROC_NULL; south = (ry + 1) * px + rx; if (ry+1 >= py) south = MPI_PROC_NULL; west = ry * px + rx - 1; if (rx-1 < 0) west = MPI_PROC_NULL; east = ry * px + rx + 1; if (rx+1 >= px) east = MPI_PROC_NULL; /* decompose the domain */ bx = n / px; /* block size in x */ by = n / py; /* block size in y */ offx = rx * bx; /* offset in x */ offy = ry * by; /* offset in y */ /* printf("%i (%i,%i) - w: %i, e: %i, n: %i, s: %i\n", r, ry,rx,west,east,north,south); */ int size = (bx+2)*(by+2); /* process-local grid (including halos (thus +2)) */ double *mem; MPI_Win win; MPI_Win_allocate_shared(2*size*sizeof(double), 1, MPI_INFO_NULL, shmcomm, &mem, &win); double *tmp; double *anew=mem; /* each rank's offset */ double *aold=mem+size; /* second half is aold! 
*/ double *northptr, *southptr, *eastptr, *westptr; double *northptr2, *southptr2, *eastptr2, *westptr2; MPI_Aint sz; int dsp_unit; /* locate the shared memory region for each neighbor */ MPI_Win_shared_query(win, north, &sz, &dsp_unit, &northptr); MPI_Win_shared_query(win, south, &sz, &dsp_unit, &southptr); MPI_Win_shared_query(win, east, &sz, &dsp_unit, &eastptr); MPI_Win_shared_query(win, west, &sz, &dsp_unit, &westptr); northptr2 = northptr+size; southptr2 = southptr+size; eastptr2 = eastptr+size; westptr2 = westptr+size; /* initialize three heat sources */ init_sources(bx, by, offx, offy, n, nsources, sources, &locnsources, locsources); t1 = MPI_Wtime(); /* take time */ MPI_Win_lock_all(0, win); for (iter = 0; iter < niters; ++iter) { /* refresh heat sources */ for (i = 0; i < locnsources; ++i) { aold[ind(locsources[i][0],locsources[i][1])] += energy; /* heat source */ } MPI_Win_sync(win); MPI_Barrier(shmcomm); /* exchange data with neighbors */ if(north != MPI_PROC_NULL) { for(i=0; i<bx; ++i) aold[ind(i+1,0)] = northptr2[ind(i+1,by)]; /* pack loop - last valid region */ } if(south != MPI_PROC_NULL) { for(i=0; i<bx; ++i) aold[ind(i+1,by+1)] = southptr2[ind(i+1,1)]; /* pack loop */ } if(east != MPI_PROC_NULL) { for(i=0; i<by; ++i) aold[ind(bx+1,i+1)] = eastptr2[ind(1,i+1)]; /* pack loop */ } if(west != MPI_PROC_NULL) { for(i=0; i<by; ++i) aold[ind(0,i+1)] = westptr2[ind(bx,i+1)]; /* pack loop */ } /* update grid points */ update_grid(bx, by, aold, anew, &heat); /* swap working arrays */ tmp = anew; anew = aold; aold = tmp; /* optional - print image */ if (iter == niters-1) printarr_par(iter, anew, n, px, py, rx, ry, bx, by, offx, offy, shmcomm); } MPI_Win_unlock_all(win); t2 = MPI_Wtime(); /* get final heat in the system */ MPI_Allreduce(&heat, &rheat, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); if (!r) printf("[%i] last heat: %f time: %f\n", r, rheat, t2-t1); /* free working arrays and communication buffers */ MPI_Win_free(&win); MPI_Comm_free(&shmcomm); MPI_Finalize(); }
FORT_DLL_SPEC void FORT_CALL mpi_comm_split_type_ ( MPI_Fint *v1, MPI_Fint *v2, MPI_Fint *v3, MPI_Fint *v4, MPI_Fint *v5, MPI_Fint *ierr ){ *ierr = MPI_Comm_split_type( (MPI_Comm)(*v1), (int)*v2, (int)*v3, (MPI_Info)(*v4), (MPI_Comm *)(v5) ); }
int main(int argc, char *argv[]) { int rank, nproc, i, x; int errors = 0, all_errors = 0; MPI_Win win = MPI_WIN_NULL; MPI_Comm shm_comm = MPI_COMM_NULL; int shm_nproc, shm_rank; double **shm_bases = NULL, *my_base; MPI_Win shm_win = MPI_WIN_NULL; MPI_Group shm_group = MPI_GROUP_NULL, world_group = MPI_GROUP_NULL; int *shm_ranks = NULL, *shm_ranks_in_world = NULL; MPI_Aint get_target_base_offsets = 0; int win_size = sizeof(double) * BUF_CNT; int new_win_size = win_size; int win_unit = sizeof(double); int shm_root_rank_in_world; int origin = -1, put_target, get_target; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); MPI_Comm_group(MPI_COMM_WORLD, &world_group); if (nproc != 4) { if (rank == 0) printf("Error: must be run with four processes\n"); MPI_Abort(MPI_COMM_WORLD, 1); } MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shm_comm); MPI_Comm_rank(shm_comm, &shm_rank); MPI_Comm_size(shm_comm, &shm_nproc); MPI_Comm_group(shm_comm, &shm_group); /* Platform does not support shared memory or wrong host file, just return. */ if (shm_nproc != 2) { goto exit; } shm_bases = (double **) calloc(shm_nproc, sizeof(double *)); shm_ranks = (int *) calloc(shm_nproc, sizeof(int)); shm_ranks_in_world = (int *) calloc(shm_nproc, sizeof(int)); if (shm_rank == 0) shm_root_rank_in_world = rank; MPI_Bcast(&shm_root_rank_in_world, 1, MPI_INT, 0, shm_comm); /* Identify ranks of target processes which are located on node 0 */ if (rank == 0) { for (i = 0; i < shm_nproc; i++) { shm_ranks[i] = i; } MPI_Group_translate_ranks(shm_group, shm_nproc, shm_ranks, world_group, shm_ranks_in_world); } MPI_Bcast(shm_ranks_in_world, shm_nproc, MPI_INT, 0, MPI_COMM_WORLD); put_target = shm_ranks_in_world[shm_nproc - 1]; get_target = shm_ranks_in_world[0]; /* Identify the rank of origin process which are located on node 1 */ if (shm_root_rank_in_world == 1 && shm_rank == 0) { origin = rank; if (verbose) { printf("---- I am origin = %d, get_target = %d, put_target = %d\n", origin, get_target, put_target); } } /* Allocate shared memory among local processes */ MPI_Win_allocate_shared(win_size, win_unit, MPI_INFO_NULL, shm_comm, &my_base, &shm_win); if (shm_root_rank_in_world == 0 && verbose) { MPI_Aint size; int disp_unit; for (i = 0; i < shm_nproc; i++) { MPI_Win_shared_query(shm_win, i, &size, &disp_unit, &shm_bases[i]); printf("%d -- shared query: base[%d]=%p, size %zd, " "unit %d\n", rank, i, shm_bases[i], size, disp_unit); } } /* Get offset of put target(1) on get target(0) */ get_target_base_offsets = (shm_nproc - 1) * win_size / win_unit; if (origin == rank && verbose) printf("%d -- base_offset of put_target %d on get_target %d: %zd\n", rank, put_target, get_target, get_target_base_offsets); /* Create using MPI_Win_create(). Note that new window size of get_target(0) * is equal to the total size of shm segments on this node, thus get_target * process can read the byte located on put_target process.*/ for (i = 0; i < BUF_CNT; i++) { local_buf[i] = (i + 1) * 1.0; my_base[i] = 0.0; } if (get_target == rank) new_win_size = win_size * shm_nproc; MPI_Win_create(my_base, new_win_size, win_unit, MPI_INFO_NULL, MPI_COMM_WORLD, &win); if (verbose) printf("%d -- new window my_base %p, size %d\n", rank, my_base, new_win_size); MPI_Barrier(MPI_COMM_WORLD); /* Check if flush guarantees the completion of put operations on target side. * * P exclusively locks 2 processes whose windows are shared with each other. 
* P first put and flush to a process, then get the updated data from another process. * If flush returns before operations are done on the target side, the data may be * incorrect.*/ for (x = 0; x < ITER; x++) { for (i = 0; i < BUF_CNT; i++) { local_buf[i] += x; check_buf[i] = 0; } if (rank == origin) { MPI_Win_lock(MPI_LOCK_EXCLUSIVE, put_target, 0, win); MPI_Win_lock(MPI_LOCK_EXCLUSIVE, get_target, 0, win); for (i = 0; i < BUF_CNT; i++) { MPI_Put(&local_buf[i], 1, MPI_DOUBLE, put_target, i, 1, MPI_DOUBLE, win); } MPI_Win_flush(put_target, win); MPI_Get(check_buf, BUF_CNT, MPI_DOUBLE, get_target, get_target_base_offsets, BUF_CNT, MPI_DOUBLE, win); MPI_Win_flush(get_target, win); for (i = 0; i < BUF_CNT; i++) { if (check_buf[i] != local_buf[i]) { printf("%d(iter %d) - Got check_buf[%d] = %.1lf, expected %.1lf\n", rank, x, i, check_buf[i], local_buf[i]); errors++; } } MPI_Win_unlock(put_target, win); MPI_Win_unlock(get_target, win); } } MPI_Barrier(MPI_COMM_WORLD); MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); exit: if (rank == 0 && all_errors == 0) printf(" No Errors\n"); if (shm_bases) free(shm_bases); if (shm_ranks) free(shm_ranks); if (shm_ranks_in_world) free(shm_ranks_in_world); if (shm_win != MPI_WIN_NULL) MPI_Win_free(&shm_win); if (win != MPI_WIN_NULL) MPI_Win_free(&win); if (shm_comm != MPI_COMM_NULL) MPI_Comm_free(&shm_comm); if (shm_group != MPI_GROUP_NULL) MPI_Group_free(&shm_group); if (world_group != MPI_GROUP_NULL) MPI_Group_free(&world_group); MPI_Finalize(); return 0; }
int main(int argc, char **argv) { MPI_Info info_in, info_out; int errors = 0, all_errors = 0; MPI_Win win; void *base; char invalid_key[] = "invalid_test_key"; char buf[MPI_MAX_INFO_VAL]; int flag; MPI_Comm shm_comm = MPI_COMM_NULL; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); /* Test#1: setting a valid key at window-create time */ MPI_Info_create(&info_in); MPI_Info_set(info_in, "no_locks", "true"); MPI_Win_allocate(sizeof(int), sizeof(int), info_in, MPI_COMM_WORLD, &base, &win); errors += check_win_info_get(win, "no_locks", "true"); MPI_Info_free(&info_in); /* We create a new window with no info argument for the next text to ensure that we have the * default settings */ MPI_Win_free(&win); MPI_Win_allocate(sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &base, &win); /* Test#2: setting and getting invalid key */ win_info_set(win, invalid_key, "true"); MPI_Win_get_info(win, &info_out); MPI_Info_get(info_out, invalid_key, MPI_MAX_INFO_VAL, buf, &flag); #ifndef USE_STRICT_MPI /* Check if our invalid key was ignored. Note, this check's MPICH's * behavior, but this behavior may not be required for a standard * conforming MPI implementation. */ if (flag) { printf("%d: %s was not ignored\n", rank, invalid_key); errors++; } #endif MPI_Info_free(&info_out); /* Test#3: setting info key "no_lock" (no default value) */ win_info_set(win, "no_locks", "false"); errors += check_win_info_get(win, "no_locks", "false"); win_info_set(win, "no_locks", "true"); errors += check_win_info_get(win, "no_locks", "true"); /* Test#4: getting/setting "accumulate_ordering" */ /* #4.1: is the default "rar,raw,war,waw" as stated in the standard? */ errors += check_win_info_get(win, "accumulate_ordering", "rar,raw,war,waw"); /* #4.2: setting "accumulate_ordering" to "none" */ win_info_set(win, "accumulate_ordering", "none"); errors += check_win_info_get(win, "accumulate_ordering", "none"); /* #4.3: setting "accumulate_ordering" to "rar,waw" */ win_info_set(win, "accumulate_ordering", "rar,waw"); errors += check_win_info_get(win, "accumulate_ordering", "rar,waw"); /* Test#5: getting/setting "accumulate_ops" */ /* #5.1: is the default "same_op_no_op" as stated in the standard? */ errors += check_win_info_get(win, "accumulate_ops", "same_op_no_op"); /* #5.2: setting "accumulate_ops" to "same_op" */ win_info_set(win, "accumulate_ops", "same_op"); errors += check_win_info_get(win, "accumulate_ops", "same_op"); /* Test#6: setting "same_size" (no default value) */ win_info_set(win, "same_size", "false"); errors += check_win_info_get(win, "same_size", "false"); win_info_set(win, "same_size", "true"); errors += check_win_info_get(win, "same_size", "true"); /* Test#7: setting "same_disp_unit" (no default value) */ win_info_set(win, "same_disp_unit", "false"); errors += check_win_info_get(win, "same_disp_unit", "false"); win_info_set(win, "same_disp_unit", "true"); errors += check_win_info_get(win, "same_disp_unit", "true"); /* TODO: check alloc_shm as implementation-specific test */ /* Test#8: setting "alloc_shared_noncontig" (no default value) in shared window. 
*/ MPI_Win_free(&win); /* #8.1: setting at window allocation */ MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shm_comm); MPI_Info_create(&info_in); MPI_Info_set(info_in, "alloc_shared_noncontig", "true"); MPI_Win_allocate_shared(sizeof(int), sizeof(int), info_in, shm_comm, &base, &win); errors += check_win_info_get(win, "alloc_shared_noncontig", "true"); MPI_Info_free(&info_in); /* #8.2: setting info */ win_info_set(win, "alloc_shared_noncontig", "false"); errors += check_win_info_get(win, "alloc_shared_noncontig", "false"); MPI_Comm_free(&shm_comm); MPI_Win_free(&win); MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0 && all_errors == 0) printf(" No Errors\n"); MPI_Finalize(); return 0; }
int main(int argc, char **argv) { int i, j, rank, nproc; int shm_rank, shm_nproc; MPI_Aint size; int errors = 0, all_errors = 0; int *base, *my_base; int disp_unit; MPI_Win shm_win; MPI_Comm shm_comm; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shm_comm); MPI_Comm_rank(shm_comm, &shm_rank); MPI_Comm_size(shm_comm, &shm_nproc); /* Allocate ELEM_PER_PROC integers for each process */ MPI_Win_allocate_shared(sizeof(int)*ELEM_PER_PROC, sizeof(int), MPI_INFO_NULL, shm_comm, &my_base, &shm_win); /* Locate absolute base */ MPI_Win_shared_query(shm_win, MPI_PROC_NULL, &size, &disp_unit, &base); /* make sure the query returned the right values */ if (disp_unit != sizeof(int)) errors++; if (size != ELEM_PER_PROC * sizeof(int)) errors++; if ((shm_rank == 0) && (base != my_base)) errors++; if (shm_rank && (base == my_base)) errors++; if (verbose) printf("%d -- size = %d baseptr = %p my_baseptr = %p\n", shm_rank, (int) size, (void*) base, (void*) my_base); MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win); /* Write to all my data */ for (i = 0; i < ELEM_PER_PROC; i++) { my_base[i] = i; } MPI_Win_sync(shm_win); MPI_Barrier(shm_comm); MPI_Win_sync(shm_win); /* Read and verify everyone's data */ for (i = 0; i < shm_nproc; i++) { for (j = 0; j < ELEM_PER_PROC; j++) { if ( base[i*ELEM_PER_PROC + j] != j ) { errors++; printf("%d -- Got %d at rank %d index %d, expected %d\n", shm_rank, base[i*ELEM_PER_PROC + j], i, j, j); } } } MPI_Win_unlock_all(shm_win); MPI_Win_free(&shm_win); MPI_Comm_free(&shm_comm); MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0 && all_errors == 0) printf(" No Errors\n"); MPI_Finalize(); return 0; }
int main(int argc, char **argv) { int i, rank, nproc; int shm_rank, shm_nproc; MPI_Aint size; int errors = 0, all_errors = 0; int **bases = NULL, *my_base = NULL; int disp_unit; MPI_Win shm_win = MPI_WIN_NULL, win = MPI_WIN_NULL; MPI_Comm shm_comm = MPI_COMM_NULL; MPI_Group shm_group = MPI_GROUP_NULL, world_group = MPI_GROUP_NULL; int dst_shm_rank, dst_world_rank; MPI_Info create_info = MPI_INFO_NULL; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shm_comm); MPI_Comm_rank(shm_comm, &shm_rank); MPI_Comm_size(shm_comm, &shm_nproc); /* Platform does not support shared memory, just return. */ if (shm_nproc < 2) { goto exit; } /* Specify the last process in the node as the target process */ dst_shm_rank = shm_nproc - 1; MPI_Comm_group(shm_comm, &shm_group); MPI_Comm_group(MPI_COMM_WORLD, &world_group); MPI_Group_translate_ranks(shm_group, 1, &dst_shm_rank, world_group, &dst_world_rank); bases = calloc(shm_nproc, sizeof(int *)); /* Allocate shm window among local processes, then create a global window with * those shm window buffers */ MPI_Win_allocate_shared(sizeof(int) * ELEM_PER_PROC, sizeof(int), MPI_INFO_NULL, shm_comm, &my_base, &shm_win); if (verbose) printf("%d -- allocate shared: my_base = %p, absolute base\n", shm_rank, my_base); for (i = 0; i < shm_nproc; i++) { MPI_Win_shared_query(shm_win, i, &size, &disp_unit, &bases[i]); if (verbose) printf("%d -- shared query: base[%d]=%p, size %ld, unit %d\n", shm_rank, i, bases[i], size, disp_unit); } #ifdef USE_INFO_ALLOC_SHM MPI_Info_create(&create_info); MPI_Info_set(create_info, "alloc_shm", "true"); #else create_info = MPI_INFO_NULL; #endif MPI_Win_create(my_base, sizeof(int) * ELEM_PER_PROC, sizeof(int), create_info, MPI_COMM_WORLD, &win); /* Reset data */ for (i = 0; i < ELEM_PER_PROC; i++) { my_base[i] = 0; local_buf[i] = i + 1; } /* Do RMA through global window, then check value through shared window */ MPI_Win_lock_all(0, win); MPI_Win_lock_all(0, shm_win); if (shm_rank == 0) { MPI_Put(&local_buf[0], 1, MPI_INT, dst_world_rank, 0, 1, MPI_INT, win); MPI_Put(&local_buf[ELEM_PER_PROC - 1], 1, MPI_INT, dst_world_rank, ELEM_PER_PROC - 1, 1, MPI_INT, win); MPI_Win_flush(dst_world_rank, win); } MPI_Win_sync(shm_win); MPI_Barrier(shm_comm); MPI_Win_sync(shm_win); if (bases[dst_shm_rank][0] != local_buf[0]) { errors++; printf("%d -- Got %d at rank %d index %d, expected %d\n", rank, bases[dst_shm_rank][0], dst_shm_rank, 0, local_buf[0]); } if (bases[dst_shm_rank][ELEM_PER_PROC - 1] != local_buf[ELEM_PER_PROC - 1]) { errors++; printf("%d -- Got %d at rank %d index %d, expected %d\n", rank, bases[dst_shm_rank][ELEM_PER_PROC - 1], dst_shm_rank, ELEM_PER_PROC - 1, local_buf[ELEM_PER_PROC - 1]); } MPI_Win_unlock_all(shm_win); MPI_Win_unlock_all(win); MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Win_free(&win); MPI_Win_free(&shm_win); exit: if (rank == 0 && all_errors == 0) printf(" No Errors\n"); if (create_info != MPI_INFO_NULL) MPI_Info_free(&create_info); if (shm_comm != MPI_COMM_NULL) MPI_Comm_free(&shm_comm); if (shm_group != MPI_GROUP_NULL) MPI_Group_free(&shm_group); if (world_group != MPI_GROUP_NULL) MPI_Group_free(&world_group); MPI_Finalize(); if (bases) free(bases); return 0; }
/** * TODO: Differentiate units belonging to team_id and that not * belonging to team_id * within the function or outside it? * FIX: Outside it. * * The teamid stands for a superteam related to the new generated * newteam. */ dart_ret_t dart_team_create( dart_team_t teamid, const dart_group_t* group, dart_team_t *newteam) { MPI_Comm comm; MPI_Comm subcomm; MPI_Win win; uint16_t index, unique_id; size_t size; dart_team_t max_teamid = -1; dart_unit_t sub_unit, unit; dart_myid (&unit); dart_size (&size); dart_team_myid (teamid, &sub_unit); int result = dart_adapt_teamlist_convert (teamid, &unique_id); if (result == -1) { return DART_ERR_INVAL; } comm = dart_teams[unique_id]; subcomm = MPI_COMM_NULL; MPI_Comm_create (comm, group -> mpi_group, &subcomm); *newteam = DART_TEAM_NULL; /* Get the maximum next_availteamid among all the units belonging to * the parent team specified by 'teamid'. */ MPI_Allreduce( &dart_next_availteamid, &max_teamid, 1, MPI_INT32_T, MPI_MAX, comm); dart_next_availteamid = max_teamid + 1; if (subcomm != MPI_COMM_NULL) { int result = dart_adapt_teamlist_alloc(max_teamid, &index); if (result == -1) { return DART_ERR_OTHER; } /* max_teamid is thought to be the new created team ID. */ *newteam = max_teamid; dart_teams[index] = subcomm; MPI_Win_create_dynamic(MPI_INFO_NULL, subcomm, &win); dart_win_lists[index] = win; } #if 0 /* Another way of generating the available teamID for the newly crated team. */ if (subcomm != MPI_COMM_NULL) { /* Get the maximum next_availteamid among all the units belonging to the * created sub-communicator. */ MPI_Allreduce (&next_availteamid, &max_teamid, 1, MPI_INT, MPI_MAX, subcomm); int result = dart_adapt_teamlist_alloc (max_teamid, &index); if (result == -1) { return DART_ERR_OTHER; } *newteam = max_teamid; teams[index] = subcomm; MPI_Comm_rank (subcomm, &rank); if (rank == 0) { root = sub_unit; if (sub_unit != 0) { MPI_Send (&root, 1, MPI_INT, 0, 0, comm); } } next_availteamid = max_teamid + 1; } if (sub_unit == 0) { if (root == -1) { MPI_Recv (&root, 1, MPI_INT, MPI_ANY_SOURCE, 0, comm, MPI_STATUS_IGNORE); } } MPI_Bcast (&root, 1, MPI_INT, 0, comm); /* Broadcast the calculated max_teamid to all the units not belonging to the * sub-communicator. 
*/ MPI_Bcast (&max_teamid, 1, MPI_INT, root, comm); if (subcomm == MPI_COMM_NULL) { /* 'Next_availteamid' is changed iff it is smaller than 'max_teamid + 1' */ if (max_teamid + 1 > next_availteamid) { next_availteamid = max_teamid + 1; } } #endif if (subcomm != MPI_COMM_NULL) { #if !defined(DART_MPI_DISABLE_SHARED_WINDOWS) int i; size_t n; MPI_Comm sharedmem_comm; MPI_Group sharedmem_group, group_all; MPI_Comm_split_type( subcomm, MPI_COMM_TYPE_SHARED, 1, MPI_INFO_NULL, &sharedmem_comm); dart_sharedmem_comm_list[index] = sharedmem_comm; if (sharedmem_comm != MPI_COMM_NULL) { MPI_Comm_size( sharedmem_comm, &(dart_sharedmemnode_size[index])); // dart_unit_mapping[index] = (int*)malloc ( // dart_sharedmem_size[index] * sizeof (int)); MPI_Comm_group(sharedmem_comm, &sharedmem_group); MPI_Comm_group(MPI_COMM_WORLD, &group_all); int* dart_unit_mapping = (int *)malloc ( dart_sharedmemnode_size[index] * sizeof (int)); int* sharedmem_ranks = (int*)malloc ( dart_sharedmemnode_size[index] * sizeof (int)); dart_sharedmem_table[index] = (int*)malloc(size * sizeof(int)); for (i = 0; i < dart_sharedmemnode_size[index]; i++) { sharedmem_ranks[i] = i; } // MPI_Group_translate_ranks (sharedmem_group, dart_sharedmem_size[index], // sharedmem_ranks, group_all, dart_unit_mapping[index]); MPI_Group_translate_ranks( sharedmem_group, dart_sharedmemnode_size[index], sharedmem_ranks, group_all, dart_unit_mapping); for (n = 0; n < size; n++) { dart_sharedmem_table[index][n] = -1; } for (i = 0; i < dart_sharedmemnode_size[index]; i++) { dart_sharedmem_table[index][dart_unit_mapping[i]] = i; } free (sharedmem_ranks); free (dart_unit_mapping); } #endif MPI_Win_lock_all(0, win); DART_LOG_DEBUG ("%2d: TEAMCREATE - create team %d out of parent team %d", unit, *newteam, teamid); } return DART_OK; }
int main(int argc, char* argv[]) { MPI_Init(&argc, &argv); int nprocs, rank; MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm team_comm; MPI_Group team_grp; #ifdef MANUAL_SPLIT int team_size = nprocs/2; // There will be two teams. assert(team_size*2 == nprocs); MPI_Group grp; MPI_Comm_group(MPI_COMM_WORLD, &grp); int team_ranks[team_size]; if (rank < team_size) { // Team 0 for (int i = 0; i < team_size; ++i) { team_ranks[i] = i; } } else { // Team 1 for (int i = 0; i < team_size; ++i) { team_ranks[i] = i + team_size; } } MPI_Group_incl(grp, team_size, team_ranks, &team_grp); MPI_Comm_create(MPI_COMM_WORLD, team_grp, &team_comm); #else int r = MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &team_comm); if (r != MPI_SUCCESS) std::cout << "MPI_Comm_split_type failed" << std::endl; #endif int real_team_size; int real_team_rank; MPI_Comm_size(team_comm, &real_team_size); MPI_Comm_rank(team_comm, &real_team_rank); #if 0 for (int i = 0; i < nprocs; ++i) { if ( i == rank) { std::cout << "rank " << rank << ", team_comm size " << real_team_size << ", rank in team " << real_team_rank << std::endl; } MPI_Barrier(MPI_COMM_WORLD); } #endif const int N = 8; const int NN = (rank == 0) ? N : 0; MPI_Win win_shared; double* p; MPI_Win_allocate_shared(NN*sizeof(double), sizeof(double), MPI_INFO_NULL, team_comm, &p, &win_shared); if (rank != 0) { MPI_Aint sz; int disp; MPI_Win_shared_query(win_shared, MPI_PROC_NULL, &sz, &disp, &p); } std::atomic_thread_fence(std::memory_order_release); MPI_Barrier(team_comm); std::atomic_thread_fence(std::memory_order_acquire); if (rank == 1) { for (int i = 0; i < N; ++i) { p[i] = (double) (i*(rank+1)); } } std::atomic_thread_fence(std::memory_order_release); MPI_Barrier(team_comm); std::atomic_thread_fence(std::memory_order_acquire); for (int i = 0; i < nprocs; ++i) { if ( i == rank) { std::cout << "rank " << rank << ", data ="; for (int j = 0; j < N; ++j) { std::cout << " " << p[j]; } std::cout << std::endl; } MPI_Barrier(MPI_COMM_WORLD); } MPI_Win_free(&win_shared); #ifdef MANUAL_SPLIT MPI_Group_free(&team_grp); #endif MPI_Comm_free(&team_comm); MPI_Finalize(); }
int main(int argc, char **argv) {
    int rank, nranks, rank_world, nranks_world;
    int i, j, peer, bufsize, errors;
    double *win_buf, *src_buf, *dst_buf;
    MPI_Win buf_win;
    MPI_Comm shr_comm;

    MTest_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks_world);

    /* use the already-initialized world rank as the ordering key ('rank' is not set yet here) */
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank_world, MPI_INFO_NULL, &shr_comm);
    MPI_Comm_rank(shr_comm, &rank);
    MPI_Comm_size(shr_comm, &nranks);

    bufsize = XDIM * YDIM * sizeof(double);
    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &dst_buf);
    MPI_Win_allocate_shared(bufsize, 1, MPI_INFO_NULL, shr_comm, &win_buf, &buf_win);

    MPI_Win_fence(0, buf_win);
    for (i = 0; i < XDIM*YDIM; i++) {
        *(win_buf + i) = -1.0;
        *(src_buf + i) = 1.0 + rank;
    }
    MPI_Win_fence(0, buf_win);

    peer = (rank+1) % nranks;

    /* Perform ITERATIONS strided accumulate operations */
    for (i = 0; i < ITERATIONS; i++) {
        int idx_rem[SUB_YDIM];
        int blk_len[SUB_YDIM];
        MPI_Datatype src_type, dst_type;

        for (j = 0; j < SUB_YDIM; j++) {
            idx_rem[j] = j*XDIM;
            blk_len[j] = SUB_XDIM;
        }

        MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &src_type);
        MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &dst_type);
        MPI_Type_commit(&src_type);
        MPI_Type_commit(&dst_type);

        /* PUT */
        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
        MPI_Get_accumulate(src_buf, 1, src_type, dst_buf, 1, src_type, peer, 0, 1, dst_type, MPI_REPLACE, buf_win);
        MPI_Win_unlock(peer, buf_win);

        /* GET */
        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
        MPI_Get_accumulate(src_buf, 1, src_type, dst_buf, 1, src_type, peer, 0, 1, dst_type, MPI_NO_OP, buf_win);
        MPI_Win_unlock(peer, buf_win);

        MPI_Type_free(&src_type);
        MPI_Type_free(&dst_type);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* Verify that the results are correct */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
    errors = 0;
    for (i = 0; i < SUB_XDIM; i++) {
        for (j = 0; j < SUB_YDIM; j++) {
            const double actual = *(win_buf + i + j*XDIM);
            const double expected = (1.0 + ((rank+nranks-1)%nranks));
            if (fabs(actual - expected) > 1.0e-10) {
                SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n", rank, j, i, expected, actual); );
                errors++;
                fflush(stdout);
            }
        }
int main(int argc, char **argv) { int my_rank, shared_rank; void *mybase = NULL; MPI_Win win; MPI_Info win_info; MPI_Comm shared_comm; int i; int shm_win_size = 1024 * 1024 * 1024 * sizeof(char); /* 1GB */ MTest_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); for (i = 0; i < 2; i++) { if (i == 0) { MPI_Info_create(&win_info); MPI_Info_set(win_info, (char *) "alloc_shm", (char *) "true"); } else { win_info = MPI_INFO_NULL; } MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, my_rank, MPI_INFO_NULL, &shared_comm); MPI_Comm_rank(shared_comm, &shared_rank); /* every processes allocate 1GB window memory */ MPI_Win_allocate(shm_win_size, sizeof(char), win_info, MPI_COMM_WORLD, &mybase, &win); MPI_Win_free(&win); MPI_Win_allocate_shared(shm_win_size, sizeof(char), win_info, shared_comm, &mybase, &win); MPI_Win_free(&win); /* some processes allocate 1GB and some processes allocate zero bytes */ if (my_rank % 2 == 0) MPI_Win_allocate(shm_win_size, sizeof(char), win_info, MPI_COMM_WORLD, &mybase, &win); else MPI_Win_allocate(0, sizeof(char), win_info, MPI_COMM_WORLD, &mybase, &win); MPI_Win_free(&win); if (shared_rank % 2 == 0) MPI_Win_allocate_shared(shm_win_size, sizeof(char), win_info, shared_comm, &mybase, &win); else MPI_Win_allocate_shared(0, sizeof(char), win_info, shared_comm, &mybase, &win); MPI_Win_free(&win); /* some processes allocate 1GB and some processes allocate smaller bytes */ if (my_rank % 2 == 0) MPI_Win_allocate(shm_win_size, sizeof(char), win_info, MPI_COMM_WORLD, &mybase, &win); else MPI_Win_allocate(shm_win_size / 2, sizeof(char), win_info, MPI_COMM_WORLD, &mybase, &win); MPI_Win_free(&win); /* some processes allocate 1GB and some processes allocate smaller bytes */ if (shared_rank % 2 == 0) MPI_Win_allocate_shared(shm_win_size, sizeof(char), win_info, shared_comm, &mybase, &win); else MPI_Win_allocate_shared(shm_win_size / 2, sizeof(char), win_info, shared_comm, &mybase, &win); MPI_Win_free(&win); MPI_Comm_free(&shared_comm); if (i == 0) MPI_Info_free(&win_info); } MTest_Finalize(0); return 0; }
int main(int argc, char *argv[]){ MPI_Init(&argc, &argv); MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm); MPI_Comm_size(MPI_COMM_WORLD, &numprocs); MPI_Comm_rank(MPI_COMM_WORLD, &myid); double t_begin; int i, j, k, ii, id, id1; t_begin = MPI_Wtime(); // record the begining CPU time read_param(); lattice_vec(); allocate(); p_allocate(); if (flow_on!=0) { build_stream(); } if (Q_on!=0) { build_neighbor(); } init_surf(); add_patch(); p_init(); p_iden(); init(); if (flow_on!=0) cal_fequ(f); if (Q_on!=0) cal_dQ(); if (Q_on!=0 && flow_on!=0 && newrun_on!=0) { while(qconverge==0) { t_current++; for (ii=0; ii<n_evol_Q; ii++) { cal_dQ(); evol_Q(); } if (t_current%t_print==0) monitor(); } e_tot =-1; k_eng =-1; qconverge = 0; uconverge = 0; t_current =-1; } if (Q_on!=0 && flow_on!=0) { cal_W(); cal_stress(); cal_sigma_p(); } MPI_Barrier(MPI_COMM_WORLD); output1(1,'z',Nx/2,Ny/2); output3(1); if(myid==0) printf("Q initialized\n"); MPI_Barrier(MPI_COMM_WORLD); while (t_current<t_max && uconverge*qconverge==0) { e_toto=e_tot; if (Q_on!=0 && qconverge==0) { if (flow_on!=0 && uconverge==0) cal_W(); for (ii=0; ii<n_evol_Q; ii++) { cal_dQ(); evol_Q(); } } if (flow_on!=0 && uconverge==0) { if (Q_on!=0 && qconverge==0) { cal_stress(); cal_sigma_p(); } evol_f(f,f2); } if (Q_on!=0 && qconverge==0) { if (flow_on!=0 && uconverge==0) cal_W(); for (ii=0; ii<n_evol_Q; ii++) { cal_dQ(); evol_Q(); } } if (flow_on!=0 && uconverge==0) { if (Q_on!=0 && qconverge==0) { cal_stress(); cal_sigma_p(); } evol_f(f2,f); } if (t_current%t_print==0) monitor(); if (t_current%t_write==0) { output1(0,'z',Nx/2,Ny/2); output3(0); fflush(stdout); } t_current++; } output_time(t_begin); output1(0,'z',Nx/2,Ny/2); output3(0); write_restart(); p_deallocate(); deallocate(); return 0; }
int main(int argc, char ** argv) { int Num_procs; /* number of ranks */ int Num_procsx, Num_procsy; /* number of ranks in each coord direction */ int Num_groupsx, Num_groupsy; /* number of blocks in each coord direction */ int my_group; /* sequence number of shared memory block */ int my_group_IDx, my_group_IDy; /* coordinates of block within block grid */ int group_size; /* number of ranks in shared memory group */ int group_sizex, group_sizey; /* number of ranks in block in each coord direction */ int my_ID; /* MPI rank */ int my_global_IDx, my_global_IDy; /* coordinates of rank in overall rank grid */ int my_local_IDx, my_local_IDy; /* coordinates of rank within shared memory block */ int right_nbr; /* global rank of right neighboring tile */ int left_nbr; /* global rank of left neighboring tile */ int top_nbr; /* global rank of top neighboring tile */ int bottom_nbr; /* global rank of bottom neighboring tile */ int local_nbr[4]; /* list of synchronizing local neighbors */ int num_local_nbrs; /* number of synchronizing local neighbors */ int dummy; DTYPE *top_buf_out; /* communication buffer */ DTYPE *top_buf_in; /* " " */ DTYPE *bottom_buf_out; /* " " */ DTYPE *bottom_buf_in; /* " " */ DTYPE *right_buf_out; /* " " */ DTYPE *right_buf_in; /* " " */ DTYPE *left_buf_out; /* " " */ DTYPE *left_buf_in; /* " " */ int root = 0; long n, width, height;/* linear global and block grid dimension */ int width_rank, height_rank; /* linear local dimension */ int iter, leftover; /* dummies */ int istart_rank, iend_rank; /* bounds of grid tile assigned to calling rank */ int jstart_rank, jend_rank; /* bounds of grid tile assigned to calling rank */ int istart, iend; /* bounds of grid block containing tile */ int jstart, jend; /* bounds of grid block containing tile */ DTYPE norm, /* L1 norm of solution */ local_norm, /* contribution of calling rank to L1 norm */ reference_norm; /* value to be matched by computed norm */ DTYPE f_active_points; /* interior of grid with respect to stencil */ DTYPE flops; /* floating point ops per iteration */ int iterations; /* number of times to run the algorithm */ double local_stencil_time,/* timing parameters */ stencil_time, avgtime; int stencil_size; /* number of points in stencil */ DTYPE * RESTRICT in; /* input grid values */ DTYPE * RESTRICT out; /* output grid values */ long total_length_in; /* total required length to store input array */ long total_length_out;/* total required length to store output array */ int error=0; /* error flag */ DTYPE weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ MPI_Request request[8]; /* requests for sends & receives in 4 coord directions */ MPI_Win shm_win_in; /* shared memory window object for IN array */ MPI_Win shm_win_out; /* shared memory window object for OUT array */ MPI_Comm shm_comm_prep; /* preparatory shared memory communicator */ MPI_Comm shm_comm; /* Shared Memory Communicator */ int shm_procs; /* # of rankes in shared domain */ int shm_ID; /* MPI rank in shared memory domain */ MPI_Aint size_in; /* size of the IN array in shared memory window */ MPI_Aint size_out; /* size of the OUT array in shared memory window */ int size_mul; /* one for shm_comm root, zero for the other ranks */ int disp_unit; /* ignored */ /******************************************************************************* ** Initialize the MPI environment ********************************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, 
&Num_procs); /******************************************************************************* ** process, test, and broadcast input parameters ********************************************************************************/ if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("MPI+SHM stencil execution on 2D grid\n"); #if !STAR printf("ERROR: Compact stencil not supported\n"); error = 1; goto ENDOFTESTS; #endif if (argc != 4){ printf("Usage: %s <#ranks per coherence domain><# iterations> <array dimension> \n", *argv); error = 1; goto ENDOFTESTS; } group_size = atoi(*++argv); if (group_size < 1) { printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size); error = 1; goto ENDOFTESTS; } if (Num_procs%group_size) { printf("ERROR: total # %d ranks not divisible by ranks per coherence domain %d\n", Num_procs, group_size); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if (iterations < 0){ printf("ERROR: iterations must be >= 0 : %d \n",iterations); error = 1; goto ENDOFTESTS; } n = atol(*++argv); long nsquare = n * n; if (nsquare < Num_procs){ printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare); error = 1; goto ENDOFTESTS; } if (RADIUS < 0) { printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS); error = 1; goto ENDOFTESTS; } if (2*RADIUS +1 > n) { printf("ERROR: Stencil radius %d exceeds grid size %ld\n", RADIUS, n); error = 1; goto ENDOFTESTS; } ENDOFTESTS:; } bail_out(error); MPI_Bcast(&n, 1, MPI_LONG, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD); /* determine best way to create a 2D grid of ranks (closest to square, for best surface/volume ratio); we do this brute force for now. 
The decomposition needs to be such that shared memory groups can evenly tessellate the rank grid */ for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) { if (!(Num_procs%Num_procsx)) { Num_procsy = Num_procs/Num_procsx; for (group_sizex=(int)(sqrt(group_size+1)); group_sizex>0; group_sizex--) { if (!(group_size%group_sizex) && !(Num_procsx%group_sizex)) { group_sizey=group_size/group_sizex; break; } } if (!(Num_procsy%group_sizey)) break; } } if (my_ID == root) { printf("Number of ranks = %d\n", Num_procs); printf("Grid size = %ld\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy); printf("Tiles per shared memory domain = %d\n", group_size); printf("Tiles in x/y-direction in group = %d/%d\n", group_sizex, group_sizey); printf("Type of stencil = star\n"); #if LOCAL_BARRIER_SYNCH printf("Local synchronization = barrier\n"); #else printf("Local synchronization = point to point\n"); #endif #if DOUBLE printf("Data type = double precision\n"); #else printf("Data type = single precision\n"); #endif #if LOOPGEN printf("Script used to expand stencil loop body\n"); #else printf("Compact representation of stencil loop body\n"); #endif printf("Number of iterations = %d\n", iterations); } /* Setup for Shared memory regions */ /* first divide WORLD in groups of size group_size */ MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep); /* derive from that an SHM communicator */ MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm); MPI_Comm_rank(shm_comm, &shm_ID); MPI_Comm_size(shm_comm, &shm_procs); /* do sanity check, making sure groups did not shrink in second comm split */ if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666); Num_groupsx = Num_procsx/group_sizex; Num_groupsy = Num_procsy/group_sizey; my_group = my_ID/group_size; my_group_IDx = my_group%Num_groupsx; my_group_IDy = my_group/Num_groupsx; my_local_IDx = my_ID%group_sizex; my_local_IDy = (my_ID%group_size)/group_sizex; my_global_IDx = my_group_IDx*group_sizex+my_local_IDx; my_global_IDy = my_group_IDy*group_sizey+my_local_IDy; /* set all neighboring ranks to -1 (no communication with those ranks) */ left_nbr = right_nbr = top_nbr = bottom_nbr = -1; /* keep track of local neighbors for local synchronization */ num_local_nbrs = 0; if (my_local_IDx == group_sizex-1 && my_group_IDx != (Num_groupsx-1)) { right_nbr = (my_group+1)*group_size+shm_ID-group_sizex+1; } if (my_local_IDx != group_sizex-1) { local_nbr[num_local_nbrs++] = shm_ID + 1; } if (my_local_IDx == 0 && my_group_IDx != 0) { left_nbr = (my_group-1)*group_size+shm_ID+group_sizex-1; } if (my_local_IDx != 0) { local_nbr[num_local_nbrs++] = shm_ID - 1; } if (my_local_IDy == group_sizey-1 && my_group_IDy != (Num_groupsy-1)) { top_nbr = (my_group+Num_groupsx)*group_size + my_local_IDx; } if (my_local_IDy != group_sizey-1) { local_nbr[num_local_nbrs++] = shm_ID + group_sizex; } if (my_local_IDy == 0 && my_group_IDy != 0) { bottom_nbr = (my_group-Num_groupsx)*group_size + group_sizex*(group_sizey-1)+my_local_IDx; } if (my_local_IDy != 0) { local_nbr[num_local_nbrs++] = shm_ID - group_sizex; } /* compute amount of space required for input and solution arrays for the block, and also compute index sets */ width = n/Num_groupsx; leftover = n%Num_groupsx; if (my_group_IDx<leftover) { istart = (width+1) * my_group_IDx; iend = istart + width; } else { istart = (width+1) * leftover + width * (my_group_IDx-leftover); iend = istart + width - 1; } 
width = iend - istart + 1; if (width == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); height = n/Num_groupsy; leftover = n%Num_groupsy; if (my_group_IDy<leftover) { jstart = (height+1) * my_group_IDy; jend = jstart + height; } else { jstart = (height+1) * leftover + height * (my_group_IDy-leftover); jend = jstart + height - 1; } height = jend - jstart + 1; if (height == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); if (width < RADIUS || height < RADIUS) { printf("ERROR: rank %d has work tile smaller then stencil radius; w=%ld,h=%ld\n", my_ID, width, height); error = 1; } bail_out(error); total_length_in = (width+2*RADIUS)*(height+2*RADIUS)*sizeof(DTYPE); total_length_out = width*height*sizeof(DTYPE); /* only the root of each SHM domain specifies window of nonzero size */ size_mul = (shm_ID==0); size_in= total_length_in*size_mul; MPI_Win_allocate_shared(size_in, sizeof(double), MPI_INFO_NULL, shm_comm, (void *) &in, &shm_win_in); MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_in); MPI_Win_shared_query(shm_win_in, MPI_PROC_NULL, &size_in, &disp_unit, (void *)&in); if (in == NULL){ printf("Error allocating space for input array by group %d\n",my_group); error = 1; } bail_out(error); size_out= total_length_out*size_mul; MPI_Win_allocate_shared(size_out, sizeof(double), MPI_INFO_NULL, shm_comm, (void *) &out, &shm_win_out); MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_out); MPI_Win_shared_query(shm_win_out, MPI_PROC_NULL, &size_out, &disp_unit, (void *)&out); if (out == NULL){ printf("Error allocating space for output array by group %d\n", my_group); error = 1; } bail_out(error); /* determine index set assigned to each rank */ width_rank = width/group_sizex; leftover = width%group_sizex; if (my_local_IDx<leftover) { istart_rank = (width_rank+1) * my_local_IDx; iend_rank = istart_rank + width_rank; } else { istart_rank = (width_rank+1) * leftover + width_rank * (my_local_IDx-leftover); iend_rank = istart_rank + width_rank - 1; } istart_rank += istart; iend_rank += istart; width_rank = iend_rank - istart_rank + 1; height_rank = height/group_sizey; leftover = height%group_sizey; if (my_local_IDy<leftover) { jstart_rank = (height_rank+1) * my_local_IDy; jend_rank = jstart_rank + height_rank; } else { jstart_rank = (height_rank+1) * leftover + height_rank * (my_local_IDy-leftover); jend_rank = jstart_rank + height_rank - 1; } jstart_rank+=jstart; jend_rank+=jstart; height_rank = jend_rank - jstart_rank + 1; if (height_rank*width_rank==0) { error = 1; printf("Rank %d has no work to do\n", my_ID); } bail_out(error); /* allocate communication buffers for halo values */ top_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*width_rank); if (!top_buf_out) { printf("ERROR: Rank %d could not allocated comm buffers for y-direction\n", my_ID); error = 1; } bail_out(error); top_buf_in = top_buf_out + RADIUS*width_rank; bottom_buf_out = top_buf_out + 2*RADIUS*width_rank; bottom_buf_in = top_buf_out + 3*RADIUS*width_rank; right_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*height_rank); if (!right_buf_out) { printf("ERROR: Rank %d could not allocated comm buffers for x-direction\n", my_ID); error = 1; } bail_out(error); right_buf_in = right_buf_out + RADIUS*height_rank; left_buf_out = right_buf_out + 2*RADIUS*height_rank; left_buf_in = right_buf_out + 3*RADIUS*height_rank; /* fill the stencil weights to reflect a discrete divergence operator */ for (int jj=-RADIUS; jj<=RADIUS; jj++) for (int ii=-RADIUS; ii<=RADIUS; ii++) 
WEIGHT(ii,jj) = (DTYPE) 0.0; stencil_size = 4*RADIUS+1; for (int ii=1; ii<=RADIUS; ii++) { WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS)); WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS)); } norm = (DTYPE) 0.0; f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS); /* intialize the input and output arrays */ for (int j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) { IN(i,j) = COEFX*i+COEFY*j; OUT(i,j) = (DTYPE)0.0; } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); MPI_Win_sync(shm_win_out); MPI_Barrier(shm_comm); for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_stencil_time = wtime(); } /* need to fetch ghost point data from neighbors in y-direction */ if (top_nbr != -1) { MPI_Irecv(top_buf_in, RADIUS*width_rank, MPI_DTYPE, top_nbr, 101, MPI_COMM_WORLD, &(request[1])); for (int kk=0,j=jend_rank-RADIUS+1; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) { top_buf_out[kk++]= IN(i,j); } MPI_Isend(top_buf_out, RADIUS*width_rank,MPI_DTYPE, top_nbr, 99, MPI_COMM_WORLD, &(request[0])); } if (bottom_nbr != -1) { MPI_Irecv(bottom_buf_in,RADIUS*width_rank, MPI_DTYPE, bottom_nbr, 99, MPI_COMM_WORLD, &(request[3])); for (int kk=0,j=jstart_rank; j<=jstart_rank+RADIUS-1; j++) for (int i=istart_rank; i<=iend_rank; i++) { bottom_buf_out[kk++]= IN(i,j); } MPI_Isend(bottom_buf_out, RADIUS*width_rank,MPI_DTYPE, bottom_nbr, 101, MPI_COMM_WORLD, &(request[2])); } if (top_nbr != -1) { MPI_Wait(&(request[0]), MPI_STATUS_IGNORE); MPI_Wait(&(request[1]), MPI_STATUS_IGNORE); for (int kk=0,j=jend_rank+1; j<=jend_rank+RADIUS; j++) for (int i=istart_rank; i<=iend_rank; i++) { IN(i,j) = top_buf_in[kk++]; } } if (bottom_nbr != -1) { MPI_Wait(&(request[2]), MPI_STATUS_IGNORE); MPI_Wait(&(request[3]), MPI_STATUS_IGNORE); for (int kk=0,j=jstart_rank-RADIUS; j<=jstart_rank-1; j++) for (int i=istart_rank; i<=iend_rank; i++) { IN(i,j) = bottom_buf_in[kk++]; } } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); /* need to fetch ghost point data from neighbors in x-direction */ if (right_nbr != -1) { MPI_Irecv(right_buf_in, RADIUS*height_rank, MPI_DTYPE, right_nbr, 1010, MPI_COMM_WORLD, &(request[1+4])); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=iend_rank-RADIUS+1; i<=iend_rank; i++) { right_buf_out[kk++]= IN(i,j); } MPI_Isend(right_buf_out, RADIUS*height_rank, MPI_DTYPE, right_nbr, 990, MPI_COMM_WORLD, &(request[0+4])); } if (left_nbr != -1) { MPI_Irecv(left_buf_in, RADIUS*height_rank, MPI_DTYPE, left_nbr, 990, MPI_COMM_WORLD, &(request[3+4])); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=istart_rank+RADIUS-1; i++) { left_buf_out[kk++]= IN(i,j); } MPI_Isend(left_buf_out, RADIUS*height_rank, MPI_DTYPE, left_nbr, 1010, MPI_COMM_WORLD, &(request[2+4])); } if (right_nbr != -1) { MPI_Wait(&(request[0+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request[1+4]), MPI_STATUS_IGNORE); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=iend_rank+1; i<=iend_rank+RADIUS; i++) { IN(i,j) = right_buf_in[kk++]; } } if (left_nbr != -1) { MPI_Wait(&(request[2+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request[3+4]), MPI_STATUS_IGNORE); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank-RADIUS; i<=istart_rank-1; i++) { IN(i,j) = left_buf_in[kk++]; } } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); /* Apply the stencil operator */ for (int j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) { for (int 
i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) { #if LOOPGEN #include "loop_body_star.incl" #else for (int jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (int ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (int ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_out); #if LOCAL_BARRIER_SYNCH MPI_Barrier(shm_comm); // needed to avoid writing IN while other ranks are reading it #else for (int i=0; i<num_local_nbrs; i++) { MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i])); MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm); } MPI_Waitall(num_local_nbrs, request, MPI_STATUSES_IGNORE); #endif /* add constant to solution to force refresh of neighbor data, if any */ for (int j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) IN(i,j)+= 1.0; /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); #if LOCAL_BARRIER_SYNCH MPI_Barrier(shm_comm); // needed to avoid reading IN while other ranks are writing it #else for (int i=0; i<num_local_nbrs; i++) { MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i])); MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm); } MPI_Waitall(num_local_nbrs, request, MPI_STATUSES_IGNORE); #endif } /* end of iterations */ local_stencil_time = wtime() - local_stencil_time; MPI_Reduce(&local_stencil_time, &stencil_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); /* compute L1 norm in parallel */ local_norm = (DTYPE) 0.0; for (int j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) { for (int i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) { local_norm += (DTYPE)ABS(OUT(i,j)); } } MPI_Reduce(&local_norm, &norm, 1, MPI_DTYPE, MPI_SUM, root, MPI_COMM_WORLD); /******************************************************************************* ** Analyze and output results. ********************************************************************************/ /* verify correctness */ if (my_ID == root) { norm /= f_active_points; if (RADIUS > 0) { reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY); } else { reference_norm = (DTYPE) 0.0; } if (ABS(norm-reference_norm) > EPSILON) { printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm, reference_norm); error = 1; } else { printf("Solution validates\n"); #if VERBOSE printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm); #endif } } bail_out(error); MPI_Win_unlock_all(shm_win_in); MPI_Win_unlock_all(shm_win_out); MPI_Win_free(&shm_win_in); MPI_Win_free(&shm_win_out); if (my_ID == root) { /* flops/stencil: 2 flops (fma) for each point in the stencil, plus one flop for the update of the input of the array */ flops = (DTYPE) (2*stencil_size+1) * f_active_points; avgtime = stencil_time/iterations; printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); } MPI_Finalize(); exit(EXIT_SUCCESS); }
int OSPU_Comm_split_node(MPI_Comm oldcomm, MPI_Comm * newcomm) { int rc = MPI_SUCCESS; #if MPI_VERSION >= 3 rc = MPI_Comm_split_type(oldcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, newcomm); if (rc!=MPI_SUCCESS) return rc; #elif defined(MPICH2) && (MPICH2_NUMVERSION>10500000) rc = MPIX_Comm_split_type(oldcomm, MPIX_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, newcomm); if (rc!=MPI_SUCCESS) return rc; #else /* This code was authored by Jim Dinan */ char my_name[MPI_MAX_PROCESSOR_NAME]; MPI_Comm node_comm = MPI_COMM_NULL; MPI_Comm parent_comm; int len; /* Dup so we don't leak communicators */ rc = MPI_Comm_dup(oldcomm, &parent_comm); if (rc!=MPI_SUCCESS) return rc; rc = MPI_Get_processor_name(my_name, &len); if (rc!=MPI_SUCCESS) return rc; while (node_comm == MPI_COMM_NULL) { char root_name[MPI_MAX_PROCESSOR_NAME]; int rank; MPI_Comm old_parent; rc = MPI_Comm_rank(parent_comm, &rank); if (rc!=MPI_SUCCESS) return rc; if (rank == 0) { rc = MPI_Bcast(my_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, parent_comm); if (rc!=MPI_SUCCESS) return rc; strncpy(root_name, my_name, MPI_MAX_PROCESSOR_NAME); } else { rc = MPI_Bcast(root_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, parent_comm); if (rc!=MPI_SUCCESS) return rc; } old_parent = parent_comm; if (strncmp(my_name, root_name, MPI_MAX_PROCESSOR_NAME) == 0) { /* My group splits off, I'm done after this */ rc = MPI_Comm_split(parent_comm, 1, rank, &node_comm); if (rc!=MPI_SUCCESS) return rc; } else { /* My group keeps going, separate from the others */ rc = MPI_Comm_split(parent_comm, 0, rank, &parent_comm); if (rc!=MPI_SUCCESS) return rc; } /* Old parent is no longer needed */ rc = MPI_Comm_free(&old_parent); if (rc!=MPI_SUCCESS) return rc; } *newcomm = node_comm; #endif return rc = MPI_SUCCESS; }
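A minimal caller for OSPU_Comm_split_node might look like the sketch below; everything in the driver (prints, error handling, the final MPI_Comm_free) is an assumption, only the call itself comes from the routine above.

#include <mpi.h>
#include <stdio.h>

int OSPU_Comm_split_node(MPI_Comm oldcomm, MPI_Comm * newcomm); /* defined above */

int main(int argc, char * argv[])
{
  MPI_Init(&argc, &argv);
  MPI_Comm node_comm = MPI_COMM_NULL;
  int rc = OSPU_Comm_split_node(MPI_COMM_WORLD, &node_comm);
  if (rc != MPI_SUCCESS) MPI_Abort(MPI_COMM_WORLD, rc);
  int wrank, nrank, nsize;
  MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
  MPI_Comm_rank(node_comm, &nrank);
  MPI_Comm_size(node_comm, &nsize);
  printf("world rank %d is node-local rank %d of %d\n", wrank, nrank, nsize);
  MPI_Comm_free(&node_comm);
  MPI_Finalize();
  return 0;
}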
int main(int argc, char **argv) { int rank, nproc, i; double t_mcs_mtx; MPI_Comm mtx_comm; MCS_Mutex mcs_mtx; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); #ifdef USE_WIN_SHARED MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &mtx_comm); #else mtx_comm = MPI_COMM_WORLD; #endif MCS_Mutex_create(0, mtx_comm, &mcs_mtx); MPI_Barrier(MPI_COMM_WORLD); t_mcs_mtx = MPI_Wtime(); for (i = 0; i < NUM_ITER; i++) { /* Combining trylock and lock here is helpful for testing because it makes * CAS and Fetch-and-op contend for the tail pointer. */ #ifdef USE_CONTIGUOUS_RANK if (rank < nproc / 2) { #else if (rank % 2) { #endif int success = 0; while (!success) { MCS_Mutex_trylock(mcs_mtx, &success); } } else { MCS_Mutex_lock(mcs_mtx); } MCS_Mutex_unlock(mcs_mtx); } MPI_Barrier(MPI_COMM_WORLD); t_mcs_mtx = MPI_Wtime() - t_mcs_mtx; MCS_Mutex_free(&mcs_mtx); if (rank == 0) { if (verbose) { printf("Nproc %d, MCS Mtx = %f us\n", nproc, t_mcs_mtx / NUM_ITER * 1.0e6); } } if (mtx_comm != MPI_COMM_WORLD) MPI_Comm_free(&mtx_comm); MTest_Finalize(0); MPI_Finalize(); return 0; }
/* MAIN */ int main (int argc, char *argv[]) { /* to be used for hello world exchanges */ int rank, numtasks, namelen; char name[MPI_MAX_PROCESSOR_NAME]; /* related to MPI-3 shm*/ MPI_Comm shmcomm; /* shm communicator */ MPI_Win win; /* shm window object */ int *mem; /* shm memory to be allocated on each node */ int i0, i1; int* i2; /* for reading back from shm */ /* current rank exchanges hello world info with partners */ int partners[n_partners]; int *partners_map; /* mapping in shm communicator */ int **partners_ptrs; /* ptrs to shared mem window for each partner*/ int j, partner, alloc_len; int n_node_partners=0, n_inter_partners=0; /* non-blocking inter-node */ MPI_Request *reqs, *rq; int rbuf[n_partners]; /* recv buffer */ int req_num = 2; /* each inter-node exchange needs a pair of MPI_Irecv and MPI_Isend */ if (getenv("1DRING_VERBOSE")) verbose = 1; /* Switch on/off printfs through env */ MPI_Init (&argc, &argv); MPI_Comm_size (MPI_COMM_WORLD, &numtasks); MPI_Comm_rank (MPI_COMM_WORLD, &rank); MPI_Get_processor_name (name, &namelen); /* if (verbose) printf ("Hello world from COMM_WORLD: rank %d of %d is running on %s\n", rank, numtasks, name); */ /* The 1D ring is defined in partners array. It can easily be extended to higher-order stencils. The current rank has 2 neighbours: previous and next, i.e., prev-rank-next */ partners[0] = rank-1; /* prev */ partners[1] = rank+1; /* next */ /* We will use periodic boundary conditions here */ if (rank == 0) partners[0] = numtasks - 1; if (rank == (numtasks - 1)) partners[1] = 0; /* MPI-3 SHM collective creates shm communicator */ MPI_Comm_split_type (MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm); /* mapping: global rank -> shmcomm rank is in partners_map */ partners_map = (int*)malloc(n_partners*sizeof(int)); /* allocate partners_map */ translate_ranks(shmcomm, partners, partners_map); /* number of inter and intra node partners */ get_n_partners (rank, partners, partners_map, &n_node_partners, &n_inter_partners); if (verbose) print_n_partners (rank, partners, partners_map, n_node_partners, n_inter_partners); alloc_len = 2*sizeof(int) + namelen+1; /* the size of hello world info: 2 int and string; +1 for the terminating '\0' */ if (n_node_partners > 0) { /* allocate shared memory windows on each node for intra-node partners */ MPI_Win_allocate_shared (alloc_len, 1, MPI_INFO_NULL, shmcomm, /* inputs to MPI-3 SHM collective */ &mem, &win); /* outputs: mem - initial address of window; win - window object */ /* pointers to mem windows */ partners_ptrs = (int **)malloc(n_partners*sizeof(int*)); get_partners_ptrs (win, partners, partners_map, partners_ptrs ); } else { mem = (int *)malloc(alloc_len); } /* allocate MPI Request resources for inter-node comm. */ if(n_inter_partners > 0) { reqs = (MPI_Request*)malloc(req_num*n_inter_partners*sizeof(MPI_Request)); rq = reqs; } /* start halo exchange */ if (n_node_partners > 0) { /* Entering MPI-3 RMA access epoch required for MPI-3 shm */ MPI_Win_lock_all (MPI_MODE_NOCHECK, win); /* alternatively, MPI_Win_lock_all, MPI_Win_sync and MPI_Barrier can be replaced with 2 MPI_Win_fence calls surrounding update of shared memory. 
*/ /* MPI_Win_fence(0, win); */ /* -- alternative */ } /* update MPI-3 shared memory (or local memory in case of lack of node partners) * by writing hello_world info into mem */ mem[0] = rank; mem[1] = numtasks; memcpy(mem+2, name, namelen+1); /* include the terminating '\0' so neighbours can print the string */ if (n_node_partners > 0) { /* MPI_Win_fence (0, win); */ /* -- alternative end */ MPI_Win_sync (win); /* memory fence to sync node exchanges */ MPI_Barrier (shmcomm); /* time barrier to make sure all ranks have updated their info */ } for (j=0; j<n_partners; j++) { if(partners_map[j] != MPI_UNDEFINED) /* partner j is on the same node */ { i0 = partners_ptrs[j][0]; /* load from MPI-3/SHM ops! */ i1 = partners_ptrs[j][1]; i2 = partners_ptrs[j]+2; if(verbose) printf ("load MPI/SHM values from neighbour => rank %d, numtasks %d on %s\n", i0, i1, (char *)i2); } else /* inter-node non-blocking MPI-1 */ { MPI_Irecv (&rbuf[j], 1, MPI_INT, partners[j], 1 , MPI_COMM_WORLD, rq++); MPI_Isend (&rank, 1, MPI_INT, partners[j], 1 , MPI_COMM_WORLD, rq++); } } /* sync inter-node exchanges and print out receive buffer rbuf*/ if(n_inter_partners > 0) { MPI_Waitall (req_num*n_inter_partners, reqs, MPI_STATUSES_IGNORE); if(verbose){ for (j =0; j< n_partners;j++) if (partners_map[j] == MPI_UNDEFINED) printf("Received from my inter-node partner %d\n", rbuf[j]); } } if (n_node_partners > 0) { MPI_Win_unlock_all (win); /* close RMA epoch */ /* free resources */ MPI_Win_free (&win); free (partners_ptrs); } if (n_inter_partners) free (reqs); free (partners_map); MPI_Finalize (); return (0); }
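The hello-world exchange above follows the usual MPI-3 SHM recipe: open a passive access epoch with MPI_Win_lock_all, store locally, then pair MPI_Win_sync (memory fence) with MPI_Barrier (time fence) before neighbours load. A self-contained sketch of just that recipe, with the ring topology and helper routines stripped away (buffer size, tag-free collectives and prints are assumptions):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char * argv[])
{
  MPI_Init(&argc, &argv);
  MPI_Comm shmcomm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &shmcomm);
  int rank, size;
  MPI_Comm_rank(shmcomm, &rank);
  MPI_Comm_size(shmcomm, &size);

  /* every rank contributes one int to the node-wide shared window */
  int *mem = NULL;
  MPI_Win win;
  MPI_Win_allocate_shared(sizeof(int), sizeof(int), MPI_INFO_NULL,
                          shmcomm, &mem, &win);

  MPI_Win_lock_all(MPI_MODE_NOCHECK, win);  /* open the access epoch */
  *mem = rank;                              /* local store           */
  MPI_Win_sync(win);                        /* memory fence          */
  MPI_Barrier(shmcomm);                     /* time fence            */

  /* read the value stored by the next rank on the node */
  int nbr = (rank + 1) % size;
  int *nbr_mem;
  MPI_Aint nbr_size;
  int disp_unit;
  MPI_Win_shared_query(win, nbr, &nbr_size, &disp_unit, &nbr_mem);
  printf("rank %d read %d from rank %d\n", rank, *nbr_mem, nbr);

  MPI_Win_unlock_all(win);
  MPI_Win_free(&win);
  MPI_Comm_free(&shmcomm);
  MPI_Finalize();
  return 0;
}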
int main(int argc, char ** argv) { int Block_order; size_t Block_size; size_t Colblock_size; int Tile_order=32; int tiling; int Num_procs; /* Number of ranks */ int order; /* overall matrix order */ int send_to, recv_from; /* communicating ranks */ size_t bytes; /* total amount of data to be moved */ int my_ID; /* rank */ int root=0; /* root rank of a communicator */ int iterations; /* number of times to run the transpose */ int i, j, it, jt, ID;/* dummies */ int iter; /* index of iteration */ int phase; /* phase in the staged communication */ size_t colstart; /* sequence number of first column owned by calling rank */ int error=0; /* error flag */ double *A_p; /* original matrix column block */ double *B_p; /* transposed matrix column block */ double *Work_in_p; /* workspace for the transpose function */ double *Work_out_p;/* workspace for the transpose function */ double abserr, abserr_tot; /* computed error */ double epsilon = 1.e-8; /* error tolerance */ double local_trans_time, /* timing parameters */ trans_time, avgtime; MPI_Status status; /* completion status of message */ MPI_Win shm_win_A; /* Shared Memory window object */ MPI_Win shm_win_B; /* Shared Memory window object */ MPI_Win shm_win_Work_in; /* Shared Memory window object */ MPI_Win shm_win_Work_out; /* Shared Memory window object */ MPI_Info rma_winfo;/* info for window */ MPI_Comm shm_comm_prep;/* Shared Memory prep Communicator */ MPI_Comm shm_comm; /* Shared Memory Communicator */ int shm_procs; /* # of ranks in shared domain */ int shm_ID; /* MPI rank within coherence domain */ int group_size; /* number of ranks per shared memory group */ int Num_groups; /* number of shared memory groups */ int group_ID; /* sequence number of shared memory group */ int size_mul; /* size multiplier; 0 for non-root ranks in coherence domain*/ int istart; MPI_Request send_req, recv_req; /********************************************************************************* ** Initialize the MPI environment **********************************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); root = 0; /********************************************************************* ** process, test and broadcast input parameter *********************************************************************/ if (my_ID == root){ if (argc != 4 && argc !=5){ printf("Usage: %s <#ranks per coherence domain> <# iterations> <matrix order> [tile size]\n", *argv); error = 1; goto ENDOFTESTS; } group_size = atoi(*++argv); if (group_size < 1) { printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size); error = 1; goto ENDOFTESTS; } if (Num_procs%group_size) { printf("ERROR: total # %d ranks not divisible by ranks per coherence domain %d\n", Num_procs, group_size); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if (iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } order = atoi(*++argv); if (order < Num_procs) { printf("ERROR: matrix order %d should be at least # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (order%Num_procs) { printf("ERROR: matrix order %d should be divisible by # procs %d\n", order, Num_procs); error = 1; goto ENDOFTESTS; } if (argc == 5) Tile_order = atoi(*++argv); ENDOFTESTS:; } bail_out(error); /* Broadcast input data to all ranks */ MPI_Bcast(&order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); 
MPI_Bcast(&Tile_order, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD); if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("MPI+SHM Matrix transpose: B = A^T\n"); printf("Number of ranks = %d\n", Num_procs); printf("Rank group size = %d\n", group_size); printf("Matrix order = %d\n", order); printf("Number of iterations = %d\n", iterations); if ((Tile_order > 0) && (Tile_order < order)) printf("Tile size = %d\n", Tile_order); else printf("Untiled\n"); #ifndef SYNCHRONOUS printf("Non-"); #endif printf("Blocking messages\n"); } /* Setup for Shared memory regions */ /* first divide WORLD in groups of size group_size */ MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep); /* derive from that a SHM communicator */ MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm); MPI_Comm_rank(shm_comm, &shm_ID); MPI_Comm_size(shm_comm, &shm_procs); /* do sanity check, making sure groups did not shrink in second comm split */ if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666); /* a non-positive tile size means no tiling of the local transpose */ tiling = (Tile_order > 0) && (Tile_order < order); bytes = 2 * sizeof(double) * order * order; /********************************************************************* ** The matrix is broken up into column blocks that are mapped one to a ** rank. Each column block is made up of Num_procs smaller square ** blocks of order block_order. *********************************************************************/ Num_groups = Num_procs/group_size; Block_order = order/Num_groups; group_ID = my_ID/group_size; colstart = Block_order * group_ID; Colblock_size = order * Block_order; Block_size = Block_order * Block_order; /********************************************************************* ** Create the column block of the test matrix, the column block of the ** transposed matrix, and workspace (workspace only if #procs>1) *********************************************************************/ /* RMA win info */ MPI_Info_create(&rma_winfo); /* This key indicates that passive target RMA will not be used. * It is the one info key that MPICH actually uses for optimization. 
*/ MPI_Info_set(rma_winfo, "no_locks", "true"); /* only the root of each SHM domain specifies window of nonzero size */ size_mul = (shm_ID==0); int offset = 32; MPI_Aint size= (Colblock_size+offset)*sizeof(double)*size_mul; int disp_unit; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &A_p, &shm_win_A); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_A); MPI_Win_shared_query(shm_win_A, MPI_PROC_NULL, &size, &disp_unit, (void *)&A_p); if (A_p == NULL){ printf(" Error allocating space for original matrix on node %d\n",my_ID); error = 1; } bail_out(error); A_p += offset; /* recompute memory size (overwritten by prior query */ size= (Colblock_size+offset)*sizeof(double)*size_mul; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &B_p, &shm_win_B); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_B); MPI_Win_shared_query(shm_win_B, MPI_PROC_NULL, &size, &disp_unit, (void *)&B_p); if (B_p == NULL){ printf(" Error allocating space for transposed matrix by group %d\n",group_ID); error = 1; } bail_out(error); B_p += offset; if (Num_groups>1) { size = Block_size*sizeof(double)*size_mul; MPI_Win_allocate_shared(size, sizeof(double),rma_winfo, shm_comm, (void *) &Work_in_p, &shm_win_Work_in); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_Work_in); MPI_Win_shared_query(shm_win_Work_in, MPI_PROC_NULL, &size, &disp_unit, (void *)&Work_in_p); if (Work_in_p == NULL){ printf(" Error allocating space for in block by group %d\n",group_ID); error = 1; } bail_out(error); /* recompute memory size (overwritten by prior query */ size = Block_size*sizeof(double)*size_mul; MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, (void *) &Work_out_p, &shm_win_Work_out); MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_Work_out); MPI_Win_shared_query(shm_win_Work_out, MPI_PROC_NULL, &size, &disp_unit, (void *)&Work_out_p); if (Work_out_p == NULL){ printf(" Error allocating space for out block by group %d\n",group_ID); error = 1; } bail_out(error); } /* Fill the original column matrix */ istart = 0; int chunk_size = Block_order/group_size; if (tiling) { for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j+=Tile_order) { for (i=0;i<order; i+=Tile_order) for (jt=j; jt<MIN((shm_ID+1)*chunk_size,j+Tile_order); jt++) for (it=i; it<MIN(order,i+Tile_order); it++) { A(it,jt) = (double) ((double)order*(jt+colstart) + it); B(it,jt) = -1.0; } } } else { for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j++) for (i=0;i<order; i++) { A(i,j) = (double)((double)order*(j+colstart) + i); B(i,j) = -1.0; } } /* NEED A STORE FENCE HERE */ MPI_Win_sync(shm_win_A); MPI_Win_sync(shm_win_B); MPI_Barrier(shm_comm); for (iter=0; iter<=iterations; iter++) { /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_trans_time = wtime(); } /* do the local transpose */ istart = colstart; if (!tiling) { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) { for (j=0; j<Block_order; j++) B(j,i) = A(i,j); } } else { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order) { for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { B(jt,it) = A(it,jt); } } } for (phase=1; phase<Num_groups; phase++){ recv_from = ((group_ID + phase )%Num_groups); send_to = ((group_ID - phase + Num_groups)%Num_groups); istart = send_to*Block_order; if (!tiling) { for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) for (j=0; j<Block_order; j++){ Work_out(j,i) = A(i,j); } } else { for 
(i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order) for (j=0; j<Block_order; j+=Tile_order) for (it=i; it<MIN(Block_order,i+Tile_order); it++) for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) { Work_out(jt,it) = A(it,jt); } } /* NEED A LOAD/STORE FENCE HERE */ MPI_Win_sync(shm_win_Work_in); MPI_Win_sync(shm_win_Work_out); MPI_Barrier(shm_comm); if (shm_ID==0) { #ifndef SYNCHRONOUS /* if we place the Irecv outside this block, it would not be protected by a local barrier, which creates a race */ MPI_Irecv(Work_in_p, Block_size, MPI_DOUBLE, recv_from*group_size, phase, MPI_COMM_WORLD, &recv_req); MPI_Isend(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size, phase, MPI_COMM_WORLD, &send_req); MPI_Wait(&recv_req, &status); MPI_Wait(&send_req, &status); #else MPI_Sendrecv(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size, phase, Work_in_p, Block_size, MPI_DOUBLE, recv_from*group_size, phase, MPI_COMM_WORLD, &status); #endif } /* NEED A LOAD FENCE HERE */ MPI_Win_sync(shm_win_Work_in); MPI_Win_sync(shm_win_Work_out); MPI_Barrier(shm_comm); istart = recv_from*Block_order; /* scatter received block to transposed matrix; no need to tile */ for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++) for (i=0; i<Block_order; i++) B(i,j) = Work_in(i,j); } /* end of phase loop */ } /* end of iterations */ local_trans_time = wtime() - local_trans_time; MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); abserr = 0.0; istart = 0; /* for (j=shm_ID;j<Block_order;j+=group_size) for (i=0;i<order; i++) { */ for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++) for (i=0;i<order; i++) { abserr += ABS(B(i,j) - (double)((double)order*i + j+colstart)); } MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD); if (my_ID == root) { if (abserr_tot < epsilon) { printf("Solution validates\n"); avgtime = trans_time/(double)iterations; printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime); #ifdef VERBOSE printf("Summed errors: %f \n", abserr_tot); #endif } else { printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr_tot, epsilon); error = 1; } } bail_out(error); MPI_Win_unlock_all(shm_win_A); MPI_Win_unlock_all(shm_win_B); MPI_Win_free(&shm_win_A); MPI_Win_free(&shm_win_B); if (Num_groups>1) { MPI_Win_unlock_all(shm_win_Work_in); MPI_Win_unlock_all(shm_win_Work_out); MPI_Win_free(&shm_win_Work_in); MPI_Win_free(&shm_win_Work_out); } MPI_Info_free(&rma_winfo); MPI_Finalize(); exit(EXIT_SUCCESS); } /* end of main */
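The transpose above requests a nonzero window size only on rank 0 of each SHM communicator and lets every other rank recover the common base address with MPI_Win_shared_query, passing MPI_PROC_NULL to mean "the lowest rank that allocated a nonzero segment". A stripped-down sketch of that allocation idiom (segment length and the printed check are assumptions):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char * argv[])
{
  MPI_Init(&argc, &argv);
  MPI_Comm shm_comm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &shm_comm);
  int shm_id;
  MPI_Comm_rank(shm_comm, &shm_id);

  const MPI_Aint elems = 1024;                 /* assumed segment length */
  MPI_Aint size = (shm_id == 0) ? elems * (MPI_Aint)sizeof(double) : 0;
  double *base = NULL;
  int disp_unit;
  MPI_Win win;
  MPI_Win_allocate_shared(size, sizeof(double), MPI_INFO_NULL,
                          shm_comm, &base, &win);
  /* every rank maps the segment owned by the lowest rank that allocated
   * a nonzero size (rank 0 here) */
  MPI_Win_shared_query(win, MPI_PROC_NULL, &size, &disp_unit, &base);

  MPI_Win_lock_all(MPI_MODE_NOCHECK, win);
  if (shm_id == 0) base[0] = 42.0;             /* single writer */
  MPI_Win_sync(win);                           /* memory fence  */
  MPI_Barrier(shm_comm);                       /* time fence    */
  printf("rank %d sees base[0] = %f\n", shm_id, base[0]);
  MPI_Win_unlock_all(win);

  MPI_Win_free(&win);
  MPI_Comm_free(&shm_comm);
  MPI_Finalize();
  return 0;
}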
int main(int argc, char *argv[]){ MPI_Init(&argc, &argv); MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm); MPI_Comm_size(MPI_COMM_WORLD, &numprocs); MPI_Comm_rank(MPI_COMM_WORLD, &myid); int i, j, k, n, l, indx, signal; bool flag; double deltat; double time_spend; double begin, end; FILE* energy; FILE* grid; begin = MPI_Wtime(); //read in parameters if(!read_param()){ MPI_Comm_free(&shmcomm); MPI_Finalize(); return 1; } flag = true; dE = 1; el_old = 1; cycle = 0; deltat = (0.1 - dt) / increment; S = 0.25 * (1 + 3 * sqrt(1 - 8 / (3 * U))); // printf("Theoretical value is %lf.\n", third * (1 - third * U) * S * S - 2 * third * third * third * S * S * S * U + U / 9 * S * S * S * S ); if(myid == root){ printf("S is %lf.\n\n", S); //define droplet and boundary, introduce particles, initialize qtensor; if(!initial()){ flag = false; } energy = fopen("energy.out", "w"); fprintf(energy,"cycle\tEnergy_diff\tEnergy_ldg\tEnergy_el\tEnergy_ch\tEnergy_surf\tEnergy_tot\n"); fclose(energy); grid = fopen("grid.bin", "wb"); l = 0; for(l = 0; l < tot; l++){ if(boundary[l] || nboundary[l]) signal = 1; else if(drop[l]) signal = 0; else signal = -1; fwrite(&signal, sizeof(int), 1, grid); } fclose(grid); } //For all information initialized in the root processor, scatter it to all other processors. if(!scatter()){ flag = false; return 1; } //Evolution while(flag){ //Every 1000 steps calculate energies. if(cycle % 1000 == 0){ free_energy(); if(fabs(dE) < accuracy){ flag = false; } } if(cycle % 10000 == 0){ //Every 10000 steps check the trace of Qtensor if(myid == root){ for(i = 0; i < droplet; i++){ if(!checktr(&q[i * 6])){ flag = false; printf("Error in the trace of q.\n"); } if(flag == false) break; } //print output file output(); } MPI_Bcast(&flag, 1, MPI_BYTE, root, MPI_COMM_WORLD); } //Wait until all the processors are ready and relax the system, first bulk and then boundary. MPI_Barrier(MPI_COMM_WORLD); MPI_Win_fence(0, win); if(flag) relax_bulk(); MPI_Barrier(MPI_COMM_WORLD); MPI_Win_fence(0, win); if(flag) relax_surf(); if(dt < 0.1){ dt += deltat; if(dt >= 0.1) dt = 0.1; } cycle ++; MPI_Barrier(MPI_COMM_WORLD); MPI_Win_fence(0, win); } free_energy(); end = MPI_Wtime(); if(myid == root){ output(); //Calculate time used. time_spend = (double)(end - begin) / 60.0; energy = fopen("energy.out", "a"); if(time_spend < 60){ fprintf(energy, "\nTime used: %lf min.\n", time_spend); printf("\nTime used: %lf min.\n", time_spend); } else{ fprintf(energy, "\nTime used: %lf h.\n", time_spend / 60.0); printf("\nTime used: %lf h.\n", time_spend / 60.0); } fclose(energy); } //deallocate dynamic arrays free_q(); MPI_Win_free(&win); MPI_Comm_free(&shmcomm); MPI_Finalize(); return 0; }
int main(int argc, char ** argv) { int my_ID; /* rank */ int root; int m, n; /* grid dimensions */ double local_pipeline_time, /* timing parameters */ pipeline_time, avgtime; double epsilon = 1.e-8; /* error tolerance */ double corner_val; /* verification value at top right corner of grid */ int i, j, iter, ID;/* dummies */ int iterations; /* number of times to run the pipeline algorithm */ int *start, *end; /* starts and ends of grid slices */ int segment_size; int error=0; /* error flag */ int Num_procs; /* Number of ranks */ double *vector; /* array holding grid values */ long total_length; /* total required length to store grid values */ MPI_Status status; /* completion status of message */ MPI_Group shm_group, origin_group, target_group; int origin_ranks[1], target_ranks[1]; MPI_Aint nbr_segment_size; MPI_Win shm_win; /* Shared Memory window object */ MPI_Info rma_winfo; /* info for window */ MPI_Comm shm_comm; /* Shared Memory Communicator */ int shm_procs; /* # of ranks in shared domain */ int shm_ID; /* MPI rank */ int source_disp; double *source_ptr; int p2pbuf; int width, nbr_width; /********************************************************************************* ** Initialize the MPI environment **********************************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); /* we set root equal to highest rank, because this is also the rank that reports on the verification value */ root = Num_procs-1; /* Setup for Shared memory regions */ MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm); MPI_Comm_rank(shm_comm, &shm_ID); MPI_Comm_size(shm_comm, &shm_procs); /********************************************************************* ** process, test and broadcast input parameter *********************************************************************/ if (my_ID == root){ if (argc != 4){ printf("Usage: %s <#iterations> <1st array dimension> <2nd array dimension>\n", *argv); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if (iterations < 1){ printf("ERROR: iterations must be >= 1 : %d \n",iterations); error = 1; goto ENDOFTESTS; } m = atoi(*++argv); n = atoi(*++argv); if (m < 1 || n < 1){ printf("ERROR: grid dimensions must be positive: %d, %d \n", m, n); error = 1; goto ENDOFTESTS; } if (m<Num_procs) { printf("ERROR: First grid dimension %d smaller than number of ranks %d\n", m, Num_procs); error = 1; goto ENDOFTESTS; } ENDOFTESTS:; } bail_out(error); if (my_ID == root) { printf("MPI+SHM pipeline execution on 2D grid\n"); printf("Number of ranks = %i\n",Num_procs); printf("Grid sizes = %d, %d\n", m, n); printf("Number of iterations = %d\n", iterations); #ifdef VERBOSE printf("Synchronizations/iteration = %d\n", (Num_procs-1)*(n-1)); #endif } /* Broadcast benchmark data to all ranks */ MPI_Bcast(&m, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&n, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); start = (int *) malloc(2*Num_procs*sizeof(int)); if (!start) { printf("ERROR: Could not allocate space for array of slice boundaries\n"); exit(EXIT_FAILURE); } end = start + Num_procs; start[0] = 0; for (ID=0; ID<Num_procs; ID++) { segment_size = m/Num_procs; if (ID < (m%Num_procs)) segment_size++; if (ID>0) start[ID] = end[ID-1]+1; end[ID] = start[ID]+segment_size-1; } /* now set segment_size to the value needed by the calling rank */ segment_size = end[my_ID] - start[my_ID] + 1; /* RMA win info */ 
MPI_Info_create(&rma_winfo); /* This key indicates that passive target RMA will not be used. * It is the one info key that MPICH actually uses for optimization. */ MPI_Info_set(rma_winfo, "no_locks", "true"); /* total_length takes into account one ghost cell on left side of segment */ if (shm_ID == 0) { total_length = ((end[my_ID]-start[my_ID]+1)+1)*n; width = segment_size+1; } else { total_length = (end[my_ID]-start[my_ID]+1)*n; width = segment_size; } MPI_Win_allocate_shared(total_length*sizeof(double), sizeof(double), rma_winfo, shm_comm, (void *) &vector, &shm_win); if (vector == NULL) { printf("Could not allocate space for grid slice of %d by %d points", segment_size, n); printf(" on rank %d\n", my_ID); error = 1; } bail_out(error); /* Get left neighbor base address */ if (shm_ID > 0) { MPI_Win_shared_query(shm_win, shm_ID-1, &nbr_segment_size, &source_disp, &source_ptr); nbr_segment_size = end[my_ID-1] - start[my_ID-1] + 1; nbr_width = nbr_segment_size; } /* clear the array */ for (j=0; j<n; j++) for (i=start[my_ID]-1; i<=end[my_ID]; i++) { ARRAY(i-start[my_ID],j) = 0.0; } /* set boundary values (bottom and left side of grid */ if (my_ID==0) for (j=0; j<n; j++) ARRAY(0,j) = (double) j; for (i=start[my_ID]-1; i<=end[my_ID]; i++) ARRAY(i-start[my_ID],0) = (double) i; /* redefine start and end for calling rank to reflect local indices */ if (my_ID==0) start[my_ID] = 1; else start[my_ID] = 0; end[my_ID] = segment_size-1; for (iter=0; iter<=iterations; iter++) { /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_pipeline_time = wtime(); } /* execute pipeline algorithm for grid lines 1 through n-1 (skip bottom line) */ for (j=1; j<n; j++) { /* if I am not at the left boundary, I need to wait for my left neighbor to send data */ if (my_ID > 0) { if (shm_ID > 0) { MPI_Recv(&p2pbuf, 0, MPI_INT, shm_ID-1, 1, shm_comm, &status); } else { MPI_Recv(&(ARRAY(start[my_ID]-1,j)), 1, MPI_DOUBLE, my_ID-1, j, MPI_COMM_WORLD, &status); } } i = start[my_ID]; if (shm_ID != 0) { ARRAY(i,j) = source_ptr[NBR_INDEX(end[my_ID],j)] + ARRAY(i,j-1) - source_ptr[NBR_INDEX(end[my_ID],j-1)]; i++; } for (; i<= end[my_ID]; i++) { ARRAY(i,j) = ARRAY(i-1,j) + ARRAY(i,j-1) - ARRAY(i-1,j-1); } /* if I am not on the right boundary, send data to my right neighbor */ if (my_ID != Num_procs-1) { if (shm_ID != shm_procs-1) { MPI_Send(&p2pbuf, 0, MPI_INT, shm_ID+1, 1, shm_comm); } else { MPI_Send(&(ARRAY(end[my_ID],j)), 1, MPI_DOUBLE, my_ID+1, j, MPI_COMM_WORLD); } } } /* copy top right corner value to bottom left corner to create dependency */ if (Num_procs >1) { if (my_ID==root) { corner_val = -ARRAY(end[my_ID],n-1); MPI_Send(&corner_val,1,MPI_DOUBLE,0,888,MPI_COMM_WORLD); } if (my_ID==0) { MPI_Recv(&(ARRAY(0,0)),1,MPI_DOUBLE,root,888,MPI_COMM_WORLD,&status); } } else ARRAY(0,0)= -ARRAY(end[my_ID],n-1); } local_pipeline_time = wtime() - local_pipeline_time; MPI_Reduce(&local_pipeline_time, &pipeline_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); /******************************************************************************* ** Analyze and output results. 
********************************************************************************/ /* verify correctness, using top right value */ corner_val = (double) ((iterations+1)*(m+n-2)); if (my_ID == root) { if (fabs(ARRAY(end[my_ID],n-1)-corner_val)/corner_val >= epsilon) { printf("ERROR: checksum %lf does not match verification value %lf\n", ARRAY(end[my_ID],n-1), corner_val); error = 1; } } bail_out(error); if (my_ID == root) { avgtime = pipeline_time/iterations; #ifdef VERBOSE printf("Solution validates; verification value = %lf\n", corner_val); printf("Point-to-point synchronizations/s: %lf\n", ((float)((n-1)*(Num_procs-1)))/(avgtime)); #else printf("Solution validates\n"); #endif printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 1.0E-06 * 2 * ((double)((m-1)*(n-1)))/avgtime, avgtime); } MPI_Finalize(); exit(EXIT_SUCCESS); } /* end of main */
/// Set up MPI communication. This should be called in all processes /// before doing anything with this object, either directly or through /// the constructor that calls it. void MPIConnection::init( int * argc_p, char ** argv_p[] ) { int initialized = 0; MPI_CHECK( MPI_Initialized( &initialized ) ); if( !initialized ) { // // MPI Boilerplate derived from Grappa // MPI_CHECK( MPI_Init( argc_p, argv_p ) ); // get locale-local MPI communicator MPI_CHECK( MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &locale_communicator_ ) ); MPI_CHECK( MPI_Comm_set_errhandler( locale_communicator_, MPI_ERRORS_RETURN ) ); MPI_CHECK( MPI_Comm_rank( locale_communicator_, &locale_rank_ ) ); MPI_CHECK( MPI_Comm_size( locale_communicator_, &locale_size_ ) ); // get count of locales int32_t localesint = locale_rank_ == 0; // count one per locale and broadcast MPI_CHECK( MPI_Allreduce( MPI_IN_PLACE, &localesint, 1, MPI_INT32_T, MPI_SUM, MPI_COMM_WORLD ) ); locales_ = localesint; // get my locale int32_t mylocaleint = locale_rank_ == 0; // count one per locale and sum MPI_CHECK( MPI_Scan( MPI_IN_PLACE, &mylocaleint, 1, MPI_INT32_T, MPI_SUM, MPI_COMM_WORLD ) ); // copy to all cores in locale MPI_CHECK( MPI_Bcast( &mylocaleint, 1, MPI_INT32_T, 0, locale_communicator_ ) ); mylocaleint -= 1; // make zero-indexed locale_ = mylocaleint; // make new communicator with ranks laid out so that nodes hold adjacent ranks MPI_CHECK( MPI_Comm_split( MPI_COMM_WORLD, 0, mylocaleint, &main_communicator_ ) ); MPI_CHECK( MPI_Comm_set_errhandler( main_communicator_, MPI_ERRORS_RETURN ) ); int main_mycoreint = -1; int main_coresint = -1; MPI_CHECK( MPI_Comm_rank( main_communicator_, &main_mycoreint ) ); MPI_CHECK( MPI_Comm_size( main_communicator_, &main_coresint ) ); rank_ = main_mycoreint; size_ = main_coresint; // verify locale numbering is consistent with locales int32_t localemin = std::numeric_limits<int32_t>::max(); int32_t localemax = std::numeric_limits<int32_t>::min(); MPI_CHECK( MPI_Reduce( &mylocaleint, &localemin, 1, MPI_INT32_T, MPI_MIN, 0, locale_communicator_ ) ); MPI_CHECK( MPI_Reduce( &mylocaleint, &localemax, 1, MPI_INT32_T, MPI_MAX, 0, locale_communicator_ ) ); if( (0 == locale_rank_) && (localemin != localemax) ) { std::cerr << "Locale ID is not consistent across locale!\n"; exit(1); } // verify locale core count is the same across job int32_t locale_coresmin = std::numeric_limits<int32_t>::max(); int32_t locale_coresmax = std::numeric_limits<int32_t>::min(); MPI_CHECK( MPI_Reduce( &locale_size_, &locale_coresmin, 1, MPI_INT32_T, MPI_MIN, 0, main_communicator_ ) ); MPI_CHECK( MPI_Reduce( &locale_size_, &locale_coresmax, 1, MPI_INT32_T, MPI_MAX, 0, main_communicator_ ) ); if( 0 == rank_ && ( locale_coresmin != locale_coresmax ) ) { std::cerr << "Number of cores per locale is not the same across job!\n"; exit(1); } barrier(); } }
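The locale bookkeeping above boils down to three collectives: count one contribution per node leader with MPI_Allreduce, number the leaders with an inclusive MPI_Scan, and broadcast the resulting node index over the node-local communicator. The sketch below is a plain-C rendering of that scheme under assumed variable names; it is not the class's actual interface.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char * argv[])
{
  MPI_Init(&argc, &argv);
  MPI_Comm node_comm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                      MPI_INFO_NULL, &node_comm);
  int node_rank;
  MPI_Comm_rank(node_comm, &node_rank);

  int is_leader = (node_rank == 0);   /* exactly one "1" per node */

  /* total number of nodes in the job */
  int num_nodes = is_leader;
  MPI_Allreduce(MPI_IN_PLACE, &num_nodes, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  /* inclusive prefix sum over the leaders numbers the nodes 1..num_nodes;
   * the leader's value is then shared with the rest of its node */
  int my_node = is_leader;
  MPI_Scan(MPI_IN_PLACE, &my_node, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  MPI_Bcast(&my_node, 1, MPI_INT, 0, node_comm);
  my_node -= 1;                       /* zero-based node index */

  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  printf("rank %d is on node %d of %d\n", world_rank, my_node, num_nodes);

  MPI_Comm_free(&node_comm);
  MPI_Finalize();
  return 0;
}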