void benchmark(long *msg_buffer, int me, int pairs, int nxtpe, MPI_Win win)
{
    static double mr, mr_sum;
    int iters;

    if (msg_buffer == NULL) {
        printf("Input buffer is NULL, no reason to proceed\n");
        exit(-1);
    }

    /*
     * Warmup
     */
    if (me < pairs) {
        for (int i = 0; i < ITERS_LARGE; i += 1) {
            MPI_Put((msg_buffer + i*MAX_MSG_SZ), MAX_MSG_SZ, MPI_LONG, nxtpe,
                    i*MAX_MSG_SZ, MAX_MSG_SZ, MPI_LONG, win);
            MPI_Win_flush_local(nxtpe, win);
        }
    }
    MPI_Win_flush_all(win);
    MPI_Barrier(MPI_COMM_WORLD);

    /*
     * Benchmark
     */
    for (long size = 1; size <= MAX_MSG_SZ; size <<= 1) {
        iters = size < LARGE_THRESHOLD ? ITERS_SMALL : ITERS_LARGE;
        mr = message_rate(msg_buffer, size, iters, me, pairs, nxtpe, win);
        MPI_Reduce(&mr, &mr_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        print_message_rate(size, mr_sum, me);
    }
}
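The message_rate() helper called above is not shown in this example. Below is a minimal sketch of what such a helper could look like, assuming it times iters consecutive MPI_Put calls to the partner rank and completes them with one MPI_Win_flush; the body is a plausible reconstruction for illustration, not the original implementation.

/* Hypothetical sketch of a message_rate() helper: NOT the original code,
 * just one plausible way to measure puts per second. Assumes a passive-target
 * epoch has already been opened on 'win' (e.g. via MPI_Win_lock_all) and that
 * msg_buffer has room for iters slots of 'size' longs. */
static double message_rate(long *msg_buffer, long size, int iters,
                           int me, int pairs, int nxtpe, MPI_Win win)
{
    double t_start, t_end;

    if (me >= pairs)        /* only the active side of each pair issues puts */
        return 0.0;

    t_start = MPI_Wtime();
    for (int i = 0; i < iters; i++) {
        MPI_Put(msg_buffer + i*size, size, MPI_LONG, nxtpe,
                i*size, size, MPI_LONG, win);
    }
    MPI_Win_flush(nxtpe, win);   /* complete all outstanding puts at the target */
    t_end = MPI_Wtime();

    return iters / (t_end - t_start);   /* messages per second */
}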
/** Execute sync_memory */
void _XMP_mpi_sync_memory()
{
  if(_XMP_flag_multi_win){
    int num = 0;
    _XMP_coarray_t **coarrays = _XMP_coarray_get_list(&num);
    for(int i = 0; i < num; i++){
      MPI_Win win = coarrays[i]->win;
      if(win != MPI_WIN_NULL){
        XACC_DEBUG("flush_all for host a coarray (%ld)", (long)win);
        MPI_Win_flush_all(win);
        XACC_DEBUG("sync for host a coarray (%ld)", (long)win);
        MPI_Win_sync(win);
      }
#ifdef _XMP_XACC
      MPI_Win win_acc = coarrays[i]->win_acc;
      if(win_acc != MPI_WIN_NULL){
        XACC_DEBUG("flush_all for acc a coarray (%ld)", (long)win_acc);
        MPI_Win_flush_all(win_acc);
        XACC_DEBUG("sync for acc a coarray (%ld)", (long)win_acc);
        MPI_Win_sync(win_acc);
      }
#endif
    }
  }else{
    if(! _is_coarray_win_flushed){
      XACC_DEBUG("flush_all for host single coarray(%ld)", (long)_xmp_mpi_onesided_win);
      MPI_Win_flush_all(_xmp_mpi_onesided_win);
      _is_coarray_win_flushed = true;
    }
    if(! _is_distarray_win_flushed){
      XACC_DEBUG("flush_all for host single distarray(%ld)", (long)_xmp_mpi_distarray_win);
      MPI_Win_flush_all(_xmp_mpi_distarray_win);
      _is_distarray_win_flushed = true;
    }
#ifdef _XMP_XACC
    if(! _is_coarray_win_acc_flushed){
      XACC_DEBUG("flush_all for acc single coarray(%ld)", (long)_xmp_mpi_onesided_win_acc);
      MPI_Win_flush_all(_xmp_mpi_onesided_win_acc);
      _is_coarray_win_acc_flushed = true;
    }
    if(! _is_distarray_win_acc_flushed){
      XACC_DEBUG("flush_all for acc single distarray(%ld)", (long)_xmp_mpi_distarray_win_acc);
      MPI_Win_flush_all(_xmp_mpi_distarray_win_acc);
      _is_distarray_win_acc_flushed = true;
    }
#endif
    _win_sync();
  }
}
MTEST_THREAD_RETURN_TYPE run_test(void *arg)
{
    int i;
    double *local_b;

    MPI_Alloc_mem(COUNT * sizeof(double), MPI_INFO_NULL, &local_b);

    for (i = 0; i < LOOPS; i++) {
        MPI_Get(local_b, COUNT, MPI_DOUBLE, 0, 0, COUNT, MPI_DOUBLE, win);
        MPI_Win_flush_all(win);
    }

    MPI_Free_mem(local_b);

    return (MTEST_THREAD_RETURN_TYPE) NULL;
}
dart_ret_t dart_flush_all(
  dart_gptr_t gptr)
{
  int16_t seg_id;
  seg_id = gptr.segid;
  MPI_Win win;
  DART_LOG_DEBUG("dart_flush_all() gptr: "
                 "unitid:%d offset:%"PRIu64" segid:%d index:%d",
                 gptr.unitid, gptr.addr_or_offs.offset,
                 gptr.segid, gptr.flags);
  if (seg_id) {
    uint16_t index = gptr.flags;
    win = dart_win_lists[index];
  } else {
    win = dart_win_local_alloc;
  }
  DART_LOG_TRACE("dart_flush_all: MPI_Win_flush_all");
  MPI_Win_flush_all(win);
  DART_LOG_DEBUG("dart_flush_all > finished");
  return DART_OK;
}
/* garray_put()
 */
int64_t garray_put(garray_t *ga, int64_t *lo, int64_t *hi, void *buf_)
{
    int64_t count = (hi[0] - lo[0]) + 1, length = count * ga->elem_size,
            tlonid, tloidx, thinid, thiidx, tnid, tidx, n, oidx = 0;
    int8_t *buf = (int8_t *)buf_;

    calc_target(ga, lo[0], &tlonid, &tloidx);
    calc_target(ga, hi[0], &thinid, &thiidx);

    /* is all the data going to the same target? */
    if (tlonid == thinid) {
        LOG_DEBUG(ga->g->glog, "[%d] garray put %ld-%ld, single target %ld.%ld\n",
                  ga->g->nid, lo[0], hi[0], tlonid, tloidx);
        //MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tlonid, 0, ga->win);
        MPI_Put(buf, length, MPI_INT8_T, tlonid, (tloidx * ga->elem_size),
                length, MPI_INT8_T, ga->win);
        //MPI_Win_unlock(tlonid, ga->win);
        MPI_Win_flush(tlonid, ga->win);

        return 0;
    }

    /* put the data into the lo nid */
    n = ga->nelems_per_node + (tlonid < ga->nextra_elems ? 1 : 0) - tloidx;
    LOG_DEBUG(ga->g->glog, "[%d] garray putting %ld elements into %ld.%ld\n",
              ga->g->nid, n, tlonid, tloidx);
    //MPI_Win_lock(MPI_LOCK_SHARED, tlonid, 0, ga->win);
    /* origin and target counts must describe the same amount of data, so only
       the first n elements are sent to the lo nid (not the whole buffer) */
    MPI_Put(buf, (n * ga->elem_size), MPI_INT8_T, tlonid, (tloidx * ga->elem_size),
            (n * ga->elem_size), MPI_INT8_T, ga->win);
    //MPI_Win_unlock(tlonid, ga->win);
    oidx = (n * ga->elem_size);

    /* put the data into the in-between nids */
    tidx = 0;
    for (tnid = tlonid + 1; tnid < thinid; ++tnid) {
        n = ga->nelems_per_node + (tnid < ga->nextra_elems ? 1 : 0);
        LOG_DEBUG(ga->g->glog, "[%d] garray putting %ld elements into %ld.%ld\n",
                  ga->g->nid, n, tnid, tidx);
        //MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tnid, 0, ga->win);
        MPI_Put(&buf[oidx], (n * ga->elem_size), MPI_INT8_T, tnid, 0,
                (n * ga->elem_size), MPI_INT8_T, ga->win);
        //MPI_Win_unlock(tnid, ga->win);
        oidx += (n * ga->elem_size);
    }

    /* put the data into the hi nid */
    n = thiidx + 1;
    LOG_DEBUG(ga->g->glog, "[%d] garray putting %ld elements up to %ld.%ld\n",
              ga->g->nid, n, thinid, thiidx);
    //MPI_Win_lock(MPI_LOCK_EXCLUSIVE, thinid, 0, ga->win);
    MPI_Put(&buf[oidx], (n * ga->elem_size), MPI_INT8_T, thinid, 0,
            (n * ga->elem_size), MPI_INT8_T, ga->win);
    //MPI_Win_unlock(thinid, ga->win);
    MPI_Win_flush_all(ga->win);

    return 0;
}
static int run_test(int nop)
{
    int i, x, errs = 0, errs_total = 0;
    MPI_Status stat;
    int dst;
    int winbuf_offset = 0;
    double t0, avg_total_time = 0.0, t_total = 0.0;
    double sum = 0.0;

    if (nprocs <= NPROCS_M) {
        ITER = ITER_S;
    } else {
        ITER = ITER_L;
    }

    target_computation_init();
    MPI_Win_lock_all(0, win);

    t0 = MPI_Wtime();
    for (x = 0; x < ITER; x++) {
        // send to all the left processes in a ring style
        for (dst = (rank + 1) % nprocs; dst != rank; dst = (dst + 1) % nprocs) {
            MPI_Accumulate(&locbuf[0], 1, MPI_DOUBLE, dst, rank, 1, MPI_DOUBLE, MPI_SUM, win);
        }
        MPI_Win_flush_all(win);

        target_computation();

        for (dst = (rank + 1) % nprocs; dst != rank; dst = (dst + 1) % nprocs) {
            for (i = 1; i < nop; i++) {
                MPI_Accumulate(&locbuf[i], 1, MPI_DOUBLE, dst, rank, 1, MPI_DOUBLE, MPI_SUM, win);
            }
        }
        MPI_Win_flush_all(win);

        debug_printf("[%d]MPI_Win_flush all done\n", x);
    }
    t_total += MPI_Wtime() - t0;
    t_total /= ITER;

    MPI_Win_unlock_all(win);
    MPI_Barrier(MPI_COMM_WORLD);

    target_computation_exit();

#ifdef CHECK
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
    sum = 0.0;
    for (i = 0; i < nop; i++) {
        sum += locbuf[i];
    }
    sum *= ITER;
    for (i = 0; i < nprocs; i++) {
        if (i == rank)
            continue;
        if (winbuf[i] != sum) {
            fprintf(stderr, "[%d]computation error : winbuf[%d] %.2lf != %.2lf, nop %d\n",
                    rank, i, winbuf[i], sum, nop);
            errs += 1;
        }
    }
    MPI_Win_unlock(rank, win);
#endif

    MPI_Reduce(&t_total, &avg_total_time, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Allreduce(&errs, &errs_total, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    if (rank == 0) {
        avg_total_time /= nprocs;
#ifdef MTCORE
        fprintf(stdout, "mtcore: comp_size %d num_op %d nprocs %d total_time %lf\n",
                DGEMM_SIZE, nop, nprocs, avg_total_time);
#else
        fprintf(stdout, "orig: comp_size %d num_op %d nprocs %d total_time %lf\n",
                DGEMM_SIZE, nop, nprocs, avg_total_time);
#endif
    }

    return errs_total;
}
int main(int argc, char ** argv)
{
  long Block_order;         /* number of columns owned by rank       */
  long Block_size;          /* size of a single block                */
  long Colblock_size;       /* size of column block                  */
  int Tile_order=32;        /* default Tile order                    */
  int tiling;               /* boolean: true if tiling is used       */
  int Num_procs;            /* number of ranks                       */
  long order;               /* order of overall matrix               */
  int send_to, recv_from;   /* ranks with which to communicate       */
  long bytes;               /* combined size of matrices             */
  int my_ID;                /* rank                                  */
  int root=0;               /* rank of root                          */
  int iterations;           /* number of times to do the transpose   */
  int i, j, it, jt, istart; /* dummies                               */
  int iter;                 /* index of iteration                    */
  int phase;                /* phase inside staged communication     */
  int colstart;             /* starting column for owning rank       */
  int error;                /* error flag                            */
  double RESTRICT *A_p;     /* original matrix column block          */
  double RESTRICT *B_p;     /* transposed matrix column block        */
  double RESTRICT *Work_in_p;  /* workspace for transpose function   */
  double RESTRICT *Work_out_p; /* workspace for transpose function   */
  double abserr,            /* absolute error                        */
         abserr_tot;        /* aggregate absolute error              */
  double epsilon = 1.e-8;   /* error tolerance                       */
  double local_trans_time,  /* timing parameters                     */
         trans_time, avgtime;
  MPI_Win rma_win = MPI_WIN_NULL;
  MPI_Info rma_winfo = MPI_INFO_NULL;
  int passive_target = 0;   /* use passive target RMA sync           */
#if MPI_VERSION >= 3
  int flush_local = 1;      /* flush local (or remote) after put     */
  int flush_bundle = 1;     /* flush every <bundle> put calls        */
#endif

  /*********************************************************************
  ** Initialize the MPI environment
  *********************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

  /*********************************************************************
  ** process, test and broadcast input parameters
  *********************************************************************/
  error = 0;
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPIRMA matrix transpose: B = A^T\n");

    if (argc <= 3){
      printf("Usage: %s <# iterations> <matrix order> [Tile size] "
             "[sync (0=fence, 1=flush)] [flush local?] [flush bundle]\n", *argv);
      error = 1; goto ENDOFTESTS;
    }

    iterations = atoi(*++argv);
    if(iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1; goto ENDOFTESTS;
    }

    order = atol(*++argv);
    if (order < Num_procs) {
      printf("ERROR: matrix order %ld should be at least # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }
    if (order%Num_procs) {
      printf("ERROR: matrix order %ld should be divisible by # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    if (argc >= 4) Tile_order     = atoi(*++argv);
    if (argc >= 5) passive_target = atoi(*++argv);
#if MPI_VERSION >= 3
    if (argc >= 6) flush_local    = atoi(*++argv);
    if (argc >= 7) flush_bundle   = atoi(*++argv);
#endif

    ENDOFTESTS:;
  }
  bail_out(error);

  if (my_ID == root) {
    printf("Number of ranks = %d\n", Num_procs);
    printf("Matrix order = %ld\n", order);
    printf("Number of iterations = %d\n", iterations);
    if ((Tile_order > 0) && (Tile_order < order))
      printf("Tile size = %d\n", Tile_order);
    else
      printf("Untiled\n");
    if (passive_target) {
#if MPI_VERSION < 3
      printf("Synchronization = MPI_Win_(un)lock\n");
#else
      printf("Synchronization = MPI_Win_flush%s (bundle=%d)\n",
             flush_local ? "_local" : "", flush_bundle);
#endif
    } else {
      printf("Synchronization = MPI_Win_fence\n");
    }
  }

  /* Broadcast input data to all ranks */
  MPI_Bcast (&order,          1, MPI_LONG, root, MPI_COMM_WORLD);
  MPI_Bcast (&iterations,     1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast (&Tile_order,     1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast (&passive_target, 1, MPI_INT,  root, MPI_COMM_WORLD);
#if MPI_VERSION >= 3
  MPI_Bcast (&flush_local,    1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast (&flush_bundle,   1, MPI_INT,  root, MPI_COMM_WORLD);
#endif

  /* a non-positive tile size means no tiling of the local transpose */
  tiling = (Tile_order > 0) && (Tile_order < order);
  bytes = 2 * sizeof(double) * order * order;

  /*********************************************************************
  ** The matrix is broken up into column blocks that are mapped one to a
  ** rank. Each column block is made up of Num_procs smaller square
  ** blocks of order block_order.
  *********************************************************************/
  Block_order   = order/Num_procs;
  colstart      = Block_order * my_ID;
  Colblock_size = order * Block_order;
  Block_size    = Block_order * Block_order;

  /* debug message size effects */
  if (my_ID == root) {
    printf("Block_size = %ld\n", Block_size);
  }

  /*********************************************************************
  ** Create the column block of the test matrix, the row block of the
  ** transposed matrix, and workspace (workspace only if #procs>1)
  *********************************************************************/
  A_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (A_p == NULL){
    printf(" Error allocating space for original matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  MPI_Info_create (&rma_winfo);
  MPI_Info_set (rma_winfo, "no locks", "true");

  B_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (B_p == NULL){
    printf(" Error allocating space for transpose matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  if (Num_procs>1) {
    Work_out_p = (double *) prk_malloc(Block_size*(Num_procs-1)*sizeof(double));
    if (Work_out_p == NULL){
      printf(" Error allocating space for work_out on node %d\n",my_ID);
      error = 1;
    }
    bail_out(error);

    PRK_Win_allocate(Block_size*(Num_procs-1)*sizeof(double), sizeof(double),
                     rma_winfo, MPI_COMM_WORLD, &Work_in_p, &rma_win);
    if (Work_in_p == NULL){
      printf(" Error allocating space for work on node %d\n",my_ID);
      error = 1;
    }
    bail_out(error);
  }

#if MPI_VERSION >= 3
  if (passive_target && Num_procs>1) {
    MPI_Win_lock_all(MPI_MODE_NOCHECK,rma_win);
  }
#endif

  /* Fill the original column matrix */
  istart = 0;
  for (j=0;j<Block_order;j++) {
    for (i=0;i<order; i++) {
      A(i,j) = (double) (order*(j+colstart) + i);
      B(i,j) = 0.0;
    }
  }

  MPI_Barrier(MPI_COMM_WORLD);

  for (iter = 0; iter<=iterations; iter++) {

    /* start timer after a warmup iteration */
    if (iter == 1) {
      MPI_Barrier(MPI_COMM_WORLD);
      local_trans_time = wtime();
    }

    /* do the local transpose */
    istart = colstart;
    if (!tiling) {
      for (i=0; i<Block_order; i++) {
        for (j=0; j<Block_order; j++) {
          B(j,i) += A(i,j);
          A(i,j) += 1.0;
        }
      }
    } else {
      for (i=0; i<Block_order; i+=Tile_order) {
        for (j=0; j<Block_order; j+=Tile_order) {
          for (it=i; it<MIN(Block_order,i+Tile_order); it++) {
            for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
              B(jt,it) += A(it,jt);
              A(it,jt) += 1.0;
            }
          }
        }
      }
    }

    if (!passive_target && Num_procs>1) {
      MPI_Win_fence(MPI_MODE_NOSTORE | MPI_MODE_NOPRECEDE, rma_win);
    }

    for (phase=1; phase<Num_procs; phase++){
      send_to = (my_ID - phase + Num_procs)%Num_procs;

      istart = send_to*Block_order;
      if (!tiling) {
        for (i=0; i<Block_order; i++) {
          for (j=0; j<Block_order; j++) {
            Work_out(phase-1,j,i) = A(i,j);
            A(i,j) += 1.0;
          }
        }
      } else {
        for (i=0; i<Block_order; i+=Tile_order) {
          for (j=0; j<Block_order; j+=Tile_order) {
            for (it=i; it<MIN(Block_order,i+Tile_order); it++) {
              for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
                Work_out(phase-1,jt,it) = A(it,jt);
                A(it,jt) += 1.0;
              }
            }
          }
        }
      }

#if MPI_VERSION < 3
      if (passive_target) {
        MPI_Win_lock(MPI_LOCK_SHARED, send_to, MPI_MODE_NOCHECK, rma_win);
      }
#endif

      MPI_Put(Work_out_p+Block_size*(phase-1), Block_size, MPI_DOUBLE, send_to,
              Block_size*(phase-1), Block_size, MPI_DOUBLE, rma_win);

      if (passive_target) {
#if MPI_VERSION < 3
        MPI_Win_unlock(send_to, rma_win);
#else
        if (flush_bundle==1) {
          if (flush_local==1) {
            MPI_Win_flush_local(send_to, rma_win);
          } else {
            MPI_Win_flush(send_to, rma_win);
          }
        } else if ( (phase%flush_bundle) == 0) {
          /* Too lazy to record all targets, so let MPI do it internally (hopefully) */
          if (flush_local==1) {
            MPI_Win_flush_local_all(rma_win);
          } else {
            MPI_Win_flush_all(rma_win);
          }
        }
#endif
      }
    } /* end of phase loop for puts */

    if (Num_procs>1) {
      if (passive_target) {
#if MPI_VERSION >= 3
        MPI_Win_flush_all(rma_win);
#endif
        MPI_Barrier(MPI_COMM_WORLD);
      } else {
        MPI_Win_fence(MPI_MODE_NOSTORE, rma_win);
      }
    }

    for (phase=1; phase<Num_procs; phase++) {
      recv_from = (my_ID + phase)%Num_procs;
      istart = recv_from*Block_order;
      /* scatter received block to transposed matrix; no need to tile */
      for (j=0; j<Block_order; j++) {
        for (i=0; i<Block_order; i++) {
          B(i,j) += Work_in(phase-1,i,j);
        }
      }
    } /* end of phase loop for scatters */

    /* for the flush case we need to make sure we have consumed Work_in
       before overwriting it in the next iteration */
    if (Num_procs>1 && passive_target) {
      MPI_Barrier(MPI_COMM_WORLD);
    }

  } /* end of iterations */

  local_trans_time = wtime() - local_trans_time;
  MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  abserr = 0.0;
  istart = 0;
  double addit = ((double)(iterations+1) * (double) (iterations))/2.0;
  for (j=0;j<Block_order;j++) {
    for (i=0;i<order; i++) {
      abserr += ABS(B(i,j) - ((double)(order*i + j+colstart)*(iterations+1)+addit));
    }
  }

  MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    if (abserr_tot < epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
    } else {
      printf("ERROR: Aggregate absolute error %lf exceeds threshold %e\n",
             abserr_tot, epsilon);
      error = 1;
    }
  }
  bail_out(error);

  if (rma_win!=MPI_WIN_NULL) {
#if MPI_VERSION >=3
    if (passive_target) {
      MPI_Win_unlock_all(rma_win);
    }
#endif
    PRK_Win_free(&rma_win);
  }

  MPI_Finalize();
  exit(EXIT_SUCCESS);

} /* end of main */
int main(int argc, char *argv[])
{
    int err, errs = 0;
    int rank, size;
    int minsize = 2, count;
    int i, j;
    MPI_Aint origcount, targetcount;
    MPI_Aint bufsize;
    MPI_Comm comm;
    MPI_Win win;
    MPI_Aint lb, extent;
    MPI_Datatype origtype, targettype;
    DTP_t orig_dtp, target_dtp;
    void *origbuf, *targetbuf;

    MTest_Init(&argc, &argv);

#ifndef USE_DTP_POOL_TYPE__STRUCT       /* set in 'test/mpi/structtypetest.txt' to split tests */
    MPI_Datatype basic_type;
    int len;
    char type_name[MPI_MAX_OBJECT_NAME] = { 0 };

    err = MTestInitBasicSignature(argc, argv, &count, &basic_type);
    if (err)
        return MTestReturnValue(1);

    /* compute bufsize to limit number of comm in test */
    MPI_Type_get_extent(basic_type, &lb, &extent);
    bufsize = extent * count;

    err = DTP_pool_create(basic_type, count, &orig_dtp);
    if (err != DTP_SUCCESS) {
        MPI_Type_get_name(basic_type, type_name, &len);
        fprintf(stdout, "Error while creating orig pool (%s,%d)\n", type_name, count);
        fflush(stdout);
    }

    err = DTP_pool_create(basic_type, count, &target_dtp);
    if (err != DTP_SUCCESS) {
        MPI_Type_get_name(basic_type, type_name, &len);
        fprintf(stdout, "Error while creating target pool (%s,%d)\n", type_name, count);
        fflush(stdout);
    }
#else
    MPI_Datatype *basic_types = NULL;
    int *basic_type_counts = NULL;
    int basic_type_num;

    err = MTestInitStructSignature(argc, argv, &basic_type_num, &basic_type_counts, &basic_types);
    if (err)
        return MTestReturnValue(1);

    /* TODO: ignore bufsize for structs for now;
     *       we need to compute bufsize also for
     *       this case */
    bufsize = 0;

    err = DTP_pool_create_struct(basic_type_num, basic_types, basic_type_counts, &orig_dtp);
    if (err != DTP_SUCCESS) {
        fprintf(stdout, "Error while creating struct pool\n");
        fflush(stdout);
    }

    err = DTP_pool_create_struct(basic_type_num, basic_types, basic_type_counts, &target_dtp);
    if (err != DTP_SUCCESS) {
        fprintf(stdout, "Error while creating struct pool\n");
        fflush(stdout);
    }

    /* this is ignored */
    count = 0;
#endif

    while (MTestGetIntracommGeneral(&comm, minsize, 1)) {
        if (comm == MPI_COMM_NULL)
            continue;

        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);
        int orig = 0;

        for (i = 0; i < target_dtp->DTP_num_objs; i++) {
            err = DTP_obj_create(target_dtp, i, 0, 0, 0);
            if (err != DTP_SUCCESS) {
                errs++;
                break;
            }

            targetcount = target_dtp->DTP_obj_array[i].DTP_obj_count;
            targettype = target_dtp->DTP_obj_array[i].DTP_obj_type;
            targetbuf = target_dtp->DTP_obj_array[i].DTP_obj_buf;

            MPI_Type_get_extent(targettype, &lb, &extent);
            MPI_Win_create(targetbuf, lb + targetcount * extent,
                           (int) extent, MPI_INFO_NULL, comm, &win);

            for (j = 0; j < orig_dtp->DTP_num_objs; j++) {
                err = DTP_obj_create(orig_dtp, j, 0, 1, count);
                if (err != DTP_SUCCESS) {
                    errs++;
                    break;
                }

                origcount = orig_dtp->DTP_obj_array[j].DTP_obj_count;
                origtype = orig_dtp->DTP_obj_array[j].DTP_obj_type;
                origbuf = orig_dtp->DTP_obj_array[j].DTP_obj_buf;

                if (rank == orig) {
                    int target;

                    MPI_Win_lock_all(0, win);

                    for (target = 0; target < size; target++)
                        if (target != orig) {
                            MPI_Accumulate(origbuf, origcount, origtype, target, 0,
                                           targetcount, targettype, MPI_REPLACE, win);
                        }
                    MPI_Win_flush_all(win);

                    /* signal to target that the ops are flushed so that it starts checking the result */
                    MPI_Barrier(comm);
                    /* make sure target finishes checking the result before issuing unlock */
                    MPI_Barrier(comm);

                    MPI_Win_unlock_all(win);

                    char *resbuf = (char *) calloc(lb + extent * targetcount, sizeof(char));

                    /* wait for the destination to finish checking and reinitializing the buffer */
                    MPI_Barrier(comm);

                    MPI_Win_lock_all(0, win);
                    for (target = 0; target < size; target++)
                        if (target != orig) {
                            MPI_Get_accumulate(origbuf, origcount, origtype, resbuf,
                                               targetcount, targettype, target, 0,
                                               targetcount, targettype, MPI_REPLACE, win);
                        }
                    MPI_Win_flush_all(win);

                    /* signal to target that the ops are flushed so that it starts checking the result */
                    MPI_Barrier(comm);
                    /* make sure target finishes checking the result before issuing unlock */
                    MPI_Barrier(comm);

                    MPI_Win_unlock_all(win);
                    free(resbuf);
                } else {
                    /* TODO: add a DTP_buf_set() function to replace this */
                    char *tmp = (char *) calloc(lb + extent * targetcount, sizeof(char));
                    memcpy(tmp, targetbuf, lb + extent * targetcount);

                    MPI_Barrier(comm);
                    MPI_Win_lock(MPI_LOCK_SHARED, rank, 0, win);
                    err = DTP_obj_buf_check(target_dtp, i, 0, 1, count);
                    if (err != DTP_SUCCESS) {
                        errs++;
                    }

                    /* restore target buffer */
                    memcpy(targetbuf, tmp, lb + extent * targetcount);
                    free(tmp);

                    MPI_Barrier(comm);
                    MPI_Win_unlock(rank, win);

                    /* signal the source that checking and reinitialization is done */
                    MPI_Barrier(comm);

                    MPI_Barrier(comm);
                    MPI_Win_lock(MPI_LOCK_SHARED, rank, 0, win);
                    err = DTP_obj_buf_check(target_dtp, i, 0, 1, count);
                    if (err != DTP_SUCCESS) {
                        errs++;
                    }
                    MPI_Barrier(comm);
                    MPI_Win_unlock(rank, win);
                }
                DTP_obj_free(orig_dtp, j);
            }
            MPI_Win_free(&win);
            DTP_obj_free(target_dtp, i);
        }
        MTestFreeComm(&comm);

        /* for large buffers only do one communicator */
        if (MAX_COUNT_SIZE * MAX_TYPE_SIZE < bufsize) {
            break;
        }
    }

    DTP_pool_free(orig_dtp);
    DTP_pool_free(target_dtp);

#ifdef USE_DTP_POOL_TYPE__STRUCT
    /* cleanup array if any */
    if (basic_types) {
        free(basic_types);
    }
    if (basic_type_counts) {
        free(basic_type_counts);
    }
#endif

    MTest_Finalize(errs);
    return MTestReturnValue(errs);
}
int main(int argc, char **argv)
{
    FILE   *fp, *fp2;
    char   testName[32] = "MPI_Rget", file1[64], file2[64];
    int    dblSize, proc, nprocs, npairs, partner;
    unsigned int i, j, k, size, localSize, NLOOP = NLOOP_MAX;
    unsigned int smin = MIN_P2P_SIZE, smed = MED_P2P_SIZE, smax = MAX_P2P_SIZE;
    double tScale = USEC, bwScale = MB_8;
    double tStart, timeMin, timeMinGlobal, overhead, threshold_lo, threshold_hi;
    double msgBytes, sizeBytes, localMax, UsedMem;
    double tElapsed[NREPS], tElapsedGlobal[NREPS];
    double *A, *B;
    MPI_Win     win;
    MPI_Status  stat;
    MPI_Request req;

    // Initialize parallel environment
    MPI_Init(&argc, &argv);
    MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
    MPI_Comm_rank( MPI_COMM_WORLD, &proc );

    // Test input parameters
    if( nprocs%2 != 0 && proc == 0 )
        fatalError( "P2P test requires an even number of processors" );

    // Check for user defined limits
    checkEnvP2P( proc, &NLOOP, &smin, &smed, &smax );

    // Initialize local variables
    localMax = 0.0;
    npairs   = nprocs/2;
    if( proc < npairs  ) partner = proc + npairs;
    if( proc >= npairs ) partner = proc - npairs;
    UsedMem = (double)smax*(double)sizeof(double)*2.0;

    // Allocate and initialize arrays
    srand( SEED );
    A = doubleVector( smax );
    B = doubleVector( smax );

    // Open output file and write header
    if( proc == 0 ){
        // Check timer overhead in seconds
        timerTest( &overhead, &threshold_lo, &threshold_hi );
        // Open output files and write headers
        sprintf( file1, "rget_time-np_%.4d.dat", nprocs );
        sprintf( file2, "rget_bw-np_%.4d.dat",   nprocs );
        fp  = fopen( file1, "a" );
        fp2 = fopen( file2, "a" );
        printHeaders( fp, fp2, testName, UsedMem, overhead, threshold_lo );
    }

    // Get type size
    MPI_Type_size( MPI_DOUBLE, &dblSize );
    // Set up a window for RMA
    MPI_Win_create( A, smax*dblSize, dblSize, MPI_INFO_NULL, MPI_COMM_WORLD, &win );
    MPI_Win_lock_all( 0, win );

    //================================================================
    // Single loop with minimum size to verify that inner loop length
    // is long enough for the timings to be accurate
    //================================================================
    // Warmup with a medium size message
    if( proc < npairs ){
        MPI_Rget( B, smed, MPI_DOUBLE, partner, 0, smed, MPI_DOUBLE, win, &req );
        MPI_Wait( &req, &stat );
        MPI_Win_flush_all( win );
    }

    // Test if current NLOOP is enough to capture fastest test cases
    MPI_Barrier( MPI_COMM_WORLD );
    tStart = benchTimer();
    if( proc < npairs ){
        for(j = 0; j < NLOOP; j++){
            MPI_Rget( B, smin, MPI_DOUBLE, partner, 0, smin, MPI_DOUBLE, win, &req );
            MPI_Wait( &req, &stat );
            MPI_Win_flush_all( win );
        }
    }
    timeMin = benchTimer() - tStart;
    MPI_Reduce( &timeMin, &timeMinGlobal, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD );
    if( proc == 0 ) resetInnerLoop( timeMinGlobal, threshold_lo, &NLOOP );
    MPI_Bcast( &NLOOP, 1, MPI_INT, 0, MPI_COMM_WORLD );

    //================================================================
    // Execute test for each requested size
    //================================================================
    for( size = smin; size <= smax; size = size*2 ){

        // Warmup with a medium size message
        if( proc < npairs ){
            MPI_Rget( B, smed, MPI_DOUBLE, partner, 0, smed, MPI_DOUBLE, win, &req );
            MPI_Wait( &req, &stat );
            MPI_Win_flush_all( win );
        }

        // Repeat NREPS to collect statistics
        for(i = 0; i < NREPS; i++){
            MPI_Barrier( MPI_COMM_WORLD );
            tStart = benchTimer();
            if( proc < npairs ){
                for(j = 0; j < NLOOP; j++){
                    MPI_Rget( B, size, MPI_DOUBLE, partner, 0, size, MPI_DOUBLE, win, &req );
                    MPI_Wait( &req, &stat );
                    MPI_Win_flush_all( win );
                }
            }
            tElapsed[i] = benchTimer() - tStart;
        }
        MPI_Reduce( tElapsed, tElapsedGlobal, NREPS, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );

        // Only task 0 needs to do the analysis of the collected data
        if( proc == 0 ){
            // sizeBytes is size to write to file
            // msgBytes is actual data exchanged on the wire
            msgBytes  = (double)size*(double)npairs*(double)dblSize;
            sizeBytes = (double)size*(double)dblSize;
            post_process( fp, fp2, threshold_hi, tElapsedGlobal, tScale, bwScale,
                          size*dblSize, sizeBytes, msgBytes, &NLOOP, &localMax, &localSize );
        }
        MPI_Bcast( &NLOOP, 1, MPI_INT, 0, MPI_COMM_WORLD );
    }
    MPI_Win_unlock_all( win );
    MPI_Win_free( &win );
    MPI_Barrier( MPI_COMM_WORLD );
    free( A );
    free( B );

    //================================================================
    // Print completion message, free memory and exit
    //================================================================
    if( proc == 0 ){
        printSummary( fp2, testName, localMax, localSize );
        fclose( fp2 );
        fclose( fp );
    }

    MPI_Finalize();
    return 0;
}
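All of the examples above follow the same passive-target pattern: open an access epoch with MPI_Win_lock_all, issue one-sided operations, and complete them with MPI_Win_flush / MPI_Win_flush_all without closing the epoch. The self-contained sketch below distills that pattern into a minimal program; the NELEMS constant, the ring-neighbor target choice, and the buffer initialization are illustrative assumptions, not taken from any of the codes above.

/* Minimal self-contained illustration of the lock_all / put / flush_all /
 * unlock_all pattern used throughout the examples above. NELEMS and the
 * ring-neighbor target are arbitrary choices for this sketch. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define NELEMS 1024

int main(int argc, char **argv)
{
    int rank, nprocs;
    double *winbuf, *src;
    MPI_Win win;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    /* expose NELEMS doubles per rank and allocate a private source buffer */
    MPI_Win_allocate(NELEMS * sizeof(double), sizeof(double), MPI_INFO_NULL,
                     MPI_COMM_WORLD, &winbuf, &win);
    src = (double *) malloc(NELEMS * sizeof(double));
    for (int i = 0; i < NELEMS; i++) {
        winbuf[i] = 0.0;
        src[i] = (double) rank;
    }

    int target = (rank + 1) % nprocs;   /* simple ring neighbor */

    MPI_Win_lock_all(0, win);           /* one shared epoch covering all targets */

    MPI_Put(src, NELEMS, MPI_DOUBLE, target, 0, NELEMS, MPI_DOUBLE, win);
    MPI_Win_flush_all(win);             /* puts are now complete at origin and targets */

    MPI_Barrier(MPI_COMM_WORLD);        /* every rank has flushed; safe to read winbuf */
    MPI_Win_sync(win);                  /* synchronize public and private window copies */

    if (winbuf[0] != (double) ((rank - 1 + nprocs) % nprocs))
        fprintf(stderr, "[%d] unexpected value %f\n", rank, winbuf[0]);

    MPI_Win_unlock_all(win);
    MPI_Win_free(&win);
    free(src);
    MPI_Finalize();
    return 0;
}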