dart_ret_t dart_flush_local(dart_gptr_t gptr)
{
  dart_unit_t target_unitid_abs;
  int16_t     seg_id = gptr.segid;
  MPI_Win     win;
  target_unitid_abs = gptr.unitid;
  DART_LOG_DEBUG("dart_flush_local() gptr: "
                 "unitid:%d offset:%"PRIu64" segid:%d index:%d",
                 gptr.unitid, gptr.addr_or_offs.offset, gptr.segid, gptr.flags);
  if (seg_id) {
    uint16_t    index = gptr.flags;
    dart_unit_t target_unitid_rel;
    win = dart_win_lists[index];
    DART_LOG_DEBUG("dart_flush_local() win:%"PRIu64" seg:%d unit:%d",
                   (uint64_t)win, seg_id, target_unitid_abs);
    unit_g2l(index, target_unitid_abs, &target_unitid_rel);
    DART_LOG_TRACE("dart_flush_local: MPI_Win_flush_local");
    MPI_Win_flush_local(target_unitid_rel, win);
  } else {
    win = dart_win_local_alloc;
    DART_LOG_DEBUG("dart_flush_local() lwin:%"PRIu64" seg:%d unit:%d",
                   (uint64_t)win, seg_id, target_unitid_abs);
    DART_LOG_TRACE("dart_flush_local: MPI_Win_flush_local");
    MPI_Win_flush_local(target_unitid_abs, win);
  }
  DART_LOG_DEBUG("dart_flush_local > finished");
  return DART_OK;
}
int main(int argc, char **argv)
{
  int i, me, target;
  unsigned int size;
  double t, t_max;
  MPI_Win win;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Win_create(&send_buf, sizeof(char)*MAX_SIZE, 1, MPI_INFO_NULL,
                 MPI_COMM_WORLD, &win);
  target = 1 - me;
  MPI_Win_lock_all(0, win);
  init_buf(send_buf, me);
  if (me == 0) print_items();

  for (size = 1; size < MAX_SIZE+1; size *= 2) {
    MPI_Barrier(MPI_COMM_WORLD);
    for (i = 0; i < LOOP + WARMUP; i++) {
      if (WARMUP == i) t = wtime();

      if (me == 0) {
        MPI_Put(send_buf, size, MPI_CHAR, target, 0, size, MPI_CHAR, win);
        MPI_Win_flush_local(target, win);
        while (send_buf[0] == '0' || send_buf[size-1] == '0') {
          MPI_Win_flush(me, win);
        }
        send_buf[0] = '0';
        send_buf[size-1] = '0';
      } else {
        while (send_buf[0] == '1' || send_buf[size-1] == '1') {
          MPI_Win_flush(me, win);
        }
        send_buf[0] = '1';
        send_buf[size-1] = '1';
        MPI_Put(send_buf, size, MPI_CHAR, target, 0, size, MPI_CHAR, win);
        MPI_Win_flush_local(target, win);
      }
    } //end of LOOP

    t = wtime() - t;
    MPI_Reduce(&t, &t_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if (me == 0) print_results(size, t_max);
  }

  MPI_Win_unlock_all(win);
  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}
void benchmark(long *msg_buffer, int me, int pairs, int nxtpe, MPI_Win win)
{
  static double mr, mr_sum;
  int iters;

  if (msg_buffer == NULL) {
    printf("Input buffer is NULL, no reason to proceed\n");
    exit(-1);
  }
  /*
   * Warmup
   */
  if (me < pairs) {
    for (int i = 0; i < ITERS_LARGE; i += 1) {
      MPI_Put((msg_buffer + i*MAX_MSG_SZ), MAX_MSG_SZ, MPI_LONG, nxtpe,
              i*MAX_MSG_SZ, MAX_MSG_SZ, MPI_LONG, win);
      MPI_Win_flush_local(nxtpe, win);
    }
  }
  MPI_Win_flush_all(win);
  MPI_Barrier(MPI_COMM_WORLD);
  /*
   * Benchmark
   */
  for (long size = 1; size <= MAX_MSG_SZ; size <<= 1) {
    iters = size < LARGE_THRESHOLD ? ITERS_SMALL : ITERS_LARGE;
    mr = message_rate(msg_buffer, size, iters, me, pairs, nxtpe, win);
    MPI_Reduce(&mr, &mr_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    print_message_rate(size, mr_sum, me);
  }
}
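The message_rate() helper called above is not shown in this listing. Below is a minimal sketch of what such a helper might look like, assuming the same put-and-flush-local pattern as the warmup loop and the wtime() timer used elsewhere in these listings; the body is an assumption, only the name and signature come from the call site.

/* Hypothetical sketch: times `iters` MPI_Put operations of `size` longs each
 * and returns the achieved message rate. Only origin-side completion is
 * enforced inside the loop (MPI_Win_flush_local); remote completion is
 * enforced once, after the loop, with MPI_Win_flush. */
static double message_rate(long *msg_buffer, long size, int iters,
                           int me, int pairs, int nxtpe, MPI_Win win)
{
  if (me >= pairs)                     /* only the first `pairs` ranks drive traffic */
    return 0.0;

  double t_start = wtime();
  for (int i = 0; i < iters; i++) {
    MPI_Put(msg_buffer + (MPI_Aint)i * size, size, MPI_LONG, nxtpe,
            (MPI_Aint)i * size, size, MPI_LONG, win);
    MPI_Win_flush_local(nxtpe, win);   /* origin buffer reusable; op may still be in flight */
  }
  MPI_Win_flush(nxtpe, win);           /* wait for remote completion before stopping the clock */
  double t_elapsed = wtime() - t_start;

  return (double)iters / t_elapsed;    /* messages per second */
}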
void ompi_win_flush_local_f(MPI_Fint *rank, MPI_Fint *win, MPI_Fint *ierr)
{
    int c_ierr;
    MPI_Win c_win = MPI_Win_f2c(*win);

    c_ierr = MPI_Win_flush_local(OMPI_FINT_2_INT(*rank), c_win);
    if (NULL != ierr) *ierr = OMPI_INT_2_FINT(c_ierr);
}
static inline void _wait_gets(const int target_rank, const MPI_Win win)
{
  if (_is_get_blocking) {
    XACC_DEBUG("flush_local(%d) for [host|acc]", target_rank);
    MPI_Win_flush_local(target_rank, win);
  }
}
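The helper above relies on the fact that MPI_Win_flush_local completes outstanding gets at the origin, so the destination buffer is safe to read without waiting for anything else on the window. The following is a minimal, self-contained sketch of that pattern under a passive-target epoch opened with MPI_Win_lock_all; the window layout and peer choice are illustrative assumptions, not taken from the code above.

/* Minimal sketch: each rank exposes one int, reads its neighbor's value with
 * MPI_Get, and uses MPI_Win_flush_local so the local result buffer is valid
 * before it is printed. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, nprocs, value = 0;
  int *base;
  MPI_Win win;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MPI_Win_allocate(sizeof(int), sizeof(int), MPI_INFO_NULL,
                   MPI_COMM_WORLD, &base, &win);

  MPI_Win_lock_all(0, win);
  *base = rank;                      /* each rank exposes its own rank id */
  MPI_Win_sync(win);                 /* make the local store publicly visible */
  MPI_Barrier(MPI_COMM_WORLD);       /* everyone has initialized its window */

  int peer = (rank + 1) % nprocs;
  MPI_Get(&value, 1, MPI_INT, peer, 0, 1, MPI_INT, win);
  MPI_Win_flush_local(peer, win);    /* after this, `value` is safe to read */
  printf("rank %d read %d from rank %d\n", rank, value, peer);

  MPI_Win_unlock_all(win);
  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}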
long shmemx_ct_get(shmemx_ct_t ct)
{
#ifdef ENABLE_SMP_OPTIMIZATIONS
    if (shmem_world_is_smp) {
        return __sync_fetch_and_add(ct, 0);
    } else
#endif
    {
        shmem_offset_t win_offset = (ptrdiff_t)((intptr_t)ct - (intptr_t)shmem_sheap_base_ptr);
        long output;
        MPI_Fetch_and_op(NULL, &output, MPI_LONG,
                         shmem_world_rank, win_offset, MPI_NO_OP, shmem_sheap_win);
        MPI_Win_flush_local(shmem_world_rank, shmem_sheap_win);
        return output;
    }
}
/* Run Get_accumulate with flush local */
void run_get_acc_with_flush_local(int rank, WINDOW type)
{
    int size, i;
    MPI_Aint disp = 0;
    MPI_Win win;

    for (size = 0; size <= MAX_SIZE; size = (size ? size * 2 : size + 1)) {
        allocate_memory(rank, rbuf, size, type, &win);

        if (type == WIN_DYNAMIC) {
            disp = sdisp_remote;
        }

        if (size > LARGE_MESSAGE_SIZE) {
            loop = LOOP_LARGE;
            skip = SKIP_LARGE;
        }

        if (rank == 0) {
            MPI_CHECK(MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 1, 0, win));
            for (i = 0; i < skip + loop; i++) {
                if (i == skip) {
                    t_start = MPI_Wtime();
                }
                MPI_CHECK(MPI_Get_accumulate(sbuf, size, MPI_CHAR, cbuf, size, MPI_CHAR,
                                             1, disp, size, MPI_CHAR, MPI_SUM, win));
                MPI_CHECK(MPI_Win_flush_local(1, win));
            }
            t_end = MPI_Wtime();
            MPI_CHECK(MPI_Win_unlock(1, win));
        }

        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

        print_latency(rank, size);

        MPI_Win_free(&win);
    }
}
static void _mpi_scalar_mget(const int target_rank,
                             void *dst, const _XMP_coarray_t *src_desc,
                             const size_t dst_offset, const size_t src_offset,
                             const int dst_dims,
                             const _XMP_array_section_t *dst_info,
                             const bool is_src_on_acc)
{
  char *laddr = (char*)dst + dst_offset;
  char *raddr = get_remote_addr(src_desc, target_rank, is_src_on_acc) + src_offset;
  MPI_Win win = get_window(src_desc, is_src_on_acc);
  size_t transfer_size = src_desc->elmt_size;

  XACC_DEBUG("scalar_mget(local_p=%p, size=%zd, target=%d, remote_p=%p, is_acc=%d)",
             laddr, transfer_size, target_rank, raddr, is_src_on_acc);
  MPI_Get((void*)laddr, transfer_size, MPI_BYTE, target_rank,
          (MPI_Aint)raddr, transfer_size, MPI_BYTE, win);

  /* we have to wait for completion of the get before unpacking */
  XACC_DEBUG("flush_local(%d) for [host|acc]", target_rank);
  MPI_Win_flush_local(target_rank, win);

  _unpack_scalar((char*)dst, dst_dims, laddr, dst_info);
}
/* Run FOP with flush local */
void run_fop_with_flush_local(int rank, WINDOW type)
{
    int i;
    MPI_Win win;
    MPI_Aint disp = 0;

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    allocate_atomic_memory(rank, sbuf_original, rbuf_original,
                           tbuf_original, NULL, (char **)&sbuf, (char **)&rbuf,
                           (char **)&tbuf, NULL, (char **)&rbuf, MAX_MSG_SIZE,
                           type, &win);

    if (rank == 0) {
        if (type == WIN_DYNAMIC) {
            disp = disp_remote;
        }
        MPI_CHECK(MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win));
        for (i = 0; i < skip + loop; i++) {
            if (i == skip) {
                t_start = MPI_Wtime();
            }
            MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, MPI_LONG_LONG, 1, disp, MPI_SUM, win));
            MPI_CHECK(MPI_Win_flush_local(1, win));
        }
        t_end = MPI_Wtime();
        MPI_CHECK(MPI_Win_unlock(1, win));
    }

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    print_latency(rank, 8);

    free_atomic_memory(sbuf, rbuf, tbuf, NULL, win, rank);
}
int main(int argc, char *argv[])
{
    int rank, nproc;
    int i;
    MPI_Win win;
    int *tar_buf = NULL;
    int *orig_buf = NULL;
    MPI_Datatype derived_dtp;
    int errors = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (nproc < 3) {
        fprintf(stderr, "Run this program with at least 3 processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Alloc_mem(sizeof(int) * DATA_SIZE, MPI_INFO_NULL, &orig_buf);
    MPI_Alloc_mem(sizeof(int) * DATA_SIZE, MPI_INFO_NULL, &tar_buf);

    for (i = 0; i < DATA_SIZE; i++) {
        orig_buf[i] = 1;
        tar_buf[i] = 0;
    }

    MPI_Type_vector(COUNT, BLOCKLENGTH - 1, STRIDE, MPI_INT, &derived_dtp);
    MPI_Type_commit(&derived_dtp);

    MPI_Win_create(tar_buf, sizeof(int) * DATA_SIZE, sizeof(int),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    /***** test between rank 0 and rank 1 *****/

    if (rank == 1) {
        MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
        for (i = 0; i < OPS_NUM; i++) {
            MPI_Accumulate(orig_buf, 1, derived_dtp,
                           0, 0, DATA_SIZE - COUNT, MPI_INT, MPI_SUM, win);
            MPI_Win_flush_local(0, win);
        }
        MPI_Win_unlock(0, win);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* check results */
    if (rank == 0) {
        for (i = 0; i < DATA_SIZE - COUNT; i++) {
            if (tar_buf[i] != OPS_NUM) {
                printf("tar_buf[%d] = %d, expected %d\n", i, tar_buf[i], OPS_NUM);
                errors++;
            }
        }
    }

    for (i = 0; i < DATA_SIZE; i++) {
        tar_buf[i] = 0;
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /***** test between rank 0 and rank 2 *****/

    if (rank == 2) {
        MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
        for (i = 0; i < OPS_NUM; i++) {
            MPI_Accumulate(orig_buf, 1, derived_dtp,
                           0, 0, DATA_SIZE - COUNT, MPI_INT, MPI_SUM, win);
            MPI_Win_flush_local(0, win);
        }
        MPI_Win_unlock(0, win);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* check results */
    if (rank == 0) {
        for (i = 0; i < DATA_SIZE - COUNT; i++) {
            if (tar_buf[i] != OPS_NUM) {
                printf("tar_buf[%d] = %d, expected %d\n", i, tar_buf[i], OPS_NUM);
                errors++;
            }
        }

        if (errors == 0)
            printf(" No Errors\n");
    }

    MPI_Win_free(&win);
    MPI_Type_free(&derived_dtp);
    MPI_Free_mem(orig_buf);
    MPI_Free_mem(tar_buf);

    MPI_Finalize();
    return 0;
}
/* garray_get() */
int64_t garray_get(garray_t *ga, int64_t *lo, int64_t *hi, void *buf_)
{
    int64_t count = (hi[0] - lo[0]) + 1, length = count * ga->elem_size,
            tlonid, tloidx, thinid, thiidx, tnid, tidx, n, oidx = 0;
    int8_t *buf = (int8_t *)buf_;

    calc_target(ga, lo[0], &tlonid, &tloidx);
    calc_target(ga, hi[0], &thinid, &thiidx);

    /* is all requested data on the same target? */
    if (tlonid == thinid) {
        LOG_DEBUG(ga->g->glog, "[%d] garray getting %ld-%ld, single target %ld.%ld\n",
                  ga->g->nid, lo[0], hi[0], tlonid, tloidx);

        //MPI_Win_lock(MPI_LOCK_SHARED, tlonid, 0, ga->win);
        MPI_Get(buf, length, MPI_INT8_T, tlonid, (tloidx * ga->elem_size),
                length, MPI_INT8_T, ga->win);
        //MPI_Win_unlock(tlonid, ga->win);
        MPI_Win_flush_local(tlonid, ga->win);

        return 0;
    }

    /* get the data in the lo nid */
    n = ga->nelems_per_node + (tlonid < ga->nextra_elems ? 1 : 0) - tloidx;
    LOG_DEBUG(ga->g->glog, "[%d] garray getting %ld elements from %ld.%ld\n",
              ga->g->nid, n, tlonid, tloidx);

    //MPI_Win_lock(MPI_LOCK_SHARED, tlonid, 0, ga->win);
    MPI_Get(buf, (n * ga->elem_size), MPI_INT8_T, tlonid, (tloidx * ga->elem_size),
            (n * ga->elem_size), MPI_INT8_T, ga->win);
    //MPI_Win_unlock(tlonid, ga->win);

    oidx = (n * ga->elem_size);

    /* get the data in the in-between nids */
    tidx = 0;
    for (tnid = tlonid + 1; tnid < thinid; ++tnid) {
        n = ga->nelems_per_node + (tnid < ga->nextra_elems ? 1 : 0);
        LOG_DEBUG(ga->g->glog, "[%d] garray getting %ld elements from %ld.%ld\n",
                  ga->g->nid, n, tnid, tidx);

        //MPI_Win_lock(MPI_LOCK_SHARED, tnid, 0, ga->win);
        MPI_Get(&buf[oidx], (n * ga->elem_size), MPI_INT8_T, tnid, 0,
                (n * ga->elem_size), MPI_INT8_T, ga->win);
        //MPI_Win_unlock(tnid, ga->win);

        oidx += (n * ga->elem_size);
    }

    /* get the data in the hi nid */
    n = thiidx + 1;
    LOG_DEBUG(ga->g->glog, "[%d] garray getting %ld elements up to %ld.%ld\n",
              ga->g->nid, n, thinid, thiidx);

    //MPI_Win_lock(MPI_LOCK_SHARED, thinid, 0, ga->win);
    MPI_Get(&buf[oidx], (n * ga->elem_size), MPI_INT8_T, thinid, 0,
            (n * ga->elem_size), MPI_INT8_T, ga->win);
    //MPI_Win_unlock(thinid, ga->win);

    MPI_Win_flush_local_all(ga->win);

    return 0;
}
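garray_get() above shows the multi-target variant of the idiom: after issuing gets to several targets, a single MPI_Win_flush_local_all completes all of them locally instead of a per-target MPI_Win_flush_local loop. A minimal sketch of that pattern follows; it assumes `win` is already exposed with MPI_Win_lock_all and that each rank's window holds at least one int at displacement 0 (the function name and layout are illustrative, not from the code above).

/* Hypothetical sketch: gather one int from every rank into `results`, then
 * complete all outstanding gets at the origin with one call. */
static void gather_from_all(MPI_Win win, int nprocs, int *results)
{
    for (int target = 0; target < nprocs; target++) {
        MPI_Get(&results[target], 1, MPI_INT, target, 0, 1, MPI_INT, win);
    }
    /* one call instead of nprocs MPI_Win_flush_local calls */
    MPI_Win_flush_local_all(win);
    /* results[] is now safe to read; remote completion is not implied */
}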
int main(int argc, char ** argv)
{
  long Block_order;        /* number of columns owned by rank       */
  long Block_size;         /* size of a single block                */
  long Colblock_size;      /* size of column block                  */
  int Tile_order=32;       /* default Tile order                    */
  int tiling;              /* boolean: true if tiling is used       */
  int Num_procs;           /* number of ranks                       */
  long order;              /* order of overall matrix               */
  int send_to, recv_from;  /* ranks with which to communicate       */
  long bytes;              /* combined size of matrices             */
  int my_ID;               /* rank                                  */
  int root=0;              /* rank of root                          */
  int iterations;          /* number of times to do the transpose   */
  int i, j, it, jt, istart;/* dummies                               */
  int iter;                /* index of iteration                    */
  int phase;               /* phase inside staged communication     */
  int colstart;            /* starting column for owning rank       */
  int error;               /* error flag                            */
  double RESTRICT *A_p;    /* original matrix column block          */
  double RESTRICT *B_p;    /* transposed matrix column block        */
  double RESTRICT *Work_in_p;  /* workspace for transpose function  */
  double RESTRICT *Work_out_p; /* workspace for transpose function  */
  double abserr,           /* absolute error                        */
         abserr_tot;       /* aggregate absolute error              */
  double epsilon = 1.e-8;  /* error tolerance                       */
  double local_trans_time, /* timing parameters                     */
         trans_time, avgtime;
  MPI_Win rma_win = MPI_WIN_NULL;
  MPI_Info rma_winfo = MPI_INFO_NULL;
  int passive_target = 0;  /* use passive target RMA sync           */
#if MPI_VERSION >= 3
  int flush_local = 1;     /* flush local (or remote) after put     */
  int flush_bundle = 1;    /* flush every <bundle> put calls        */
#endif

/*********************************************************************
** Initialize the MPI environment
*********************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

/*********************************************************************
** process, test and broadcast input parameters
*********************************************************************/
  error = 0;
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPIRMA matrix transpose: B = A^T\n");

    if (argc <= 3){
      printf("Usage: %s <# iterations> <matrix order> [Tile size] "
             "[sync (0=fence, 1=flush)] [flush local?] [flush bundle]\n", *argv);
      error = 1; goto ENDOFTESTS;
    }

    iterations = atoi(*++argv);
    if(iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1; goto ENDOFTESTS;
    }

    order = atol(*++argv);
    if (order < Num_procs) {
      printf("ERROR: matrix order %ld should be at least # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }
    if (order%Num_procs) {
      printf("ERROR: matrix order %ld should be divisible by # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    if (argc >= 4) Tile_order     = atoi(*++argv);
    if (argc >= 5) passive_target = atoi(*++argv);
#if MPI_VERSION >= 3
    if (argc >= 6) flush_local    = atoi(*++argv);
    if (argc >= 7) flush_bundle   = atoi(*++argv);
#endif

    ENDOFTESTS:;
  }
  bail_out(error);

  if (my_ID == root) {
    printf("Number of ranks      = %d\n", Num_procs);
    printf("Matrix order         = %ld\n", order);
    printf("Number of iterations = %d\n", iterations);
    if ((Tile_order > 0) && (Tile_order < order))
      printf("Tile size            = %d\n", Tile_order);
    else
      printf("Untiled\n");
    if (passive_target) {
#if MPI_VERSION < 3
      printf("Synchronization      = MPI_Win_(un)lock\n");
#else
      printf("Synchronization      = MPI_Win_flush%s (bundle=%d)\n",
             flush_local ? "_local" : "", flush_bundle);
#endif
    } else {
      printf("Synchronization      = MPI_Win_fence\n");
    }
  }

  /* Broadcast input data to all ranks */
  MPI_Bcast(&order,          1, MPI_LONG, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations,     1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast(&Tile_order,     1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast(&passive_target, 1, MPI_INT,  root, MPI_COMM_WORLD);
#if MPI_VERSION >= 3
  MPI_Bcast(&flush_local,    1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast(&flush_bundle,   1, MPI_INT,  root, MPI_COMM_WORLD);
#endif

  /* a non-positive tile size means no tiling of the local transpose */
  tiling = (Tile_order > 0) && (Tile_order < order);
  bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks that are mapped one to a
** rank. Each column block is made up of Num_procs smaller square
** blocks of order block_order.
*********************************************************************/
  Block_order   = order/Num_procs;
  colstart      = Block_order * my_ID;
  Colblock_size = order * Block_order;
  Block_size    = Block_order * Block_order;

  /* debug message size effects */
  if (my_ID == root) {
    printf("Block_size           = %ld\n", Block_size);
  }

/*********************************************************************
** Create the column block of the test matrix, the row block of the
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/
  A_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (A_p == NULL){
    printf(" Error allocating space for original matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  MPI_Info_create(&rma_winfo);
  MPI_Info_set(rma_winfo, "no locks", "true");

  B_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (B_p == NULL){
    printf(" Error allocating space for transpose matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  if (Num_procs>1) {
    Work_out_p = (double *) prk_malloc(Block_size*(Num_procs-1)*sizeof(double));
    if (Work_out_p == NULL){
      printf(" Error allocating space for work_out on node %d\n",my_ID);
      error = 1;
    }
    bail_out(error);

    PRK_Win_allocate(Block_size*(Num_procs-1)*sizeof(double), sizeof(double),
                     rma_winfo, MPI_COMM_WORLD, &Work_in_p, &rma_win);
    if (Work_in_p == NULL){
      printf(" Error allocating space for work on node %d\n",my_ID);
      error = 1;
    }
    bail_out(error);
  }

#if MPI_VERSION >= 3
  if (passive_target && Num_procs>1) {
    MPI_Win_lock_all(MPI_MODE_NOCHECK,rma_win);
  }
#endif

  /* Fill the original column matrix */
  istart = 0;
  for (j=0;j<Block_order;j++) {
    for (i=0;i<order; i++) {
      A(i,j) = (double) (order*(j+colstart) + i);
      B(i,j) = 0.0;
    }
  }

  MPI_Barrier(MPI_COMM_WORLD);

  for (iter = 0; iter<=iterations; iter++) {

    /* start timer after a warmup iteration */
    if (iter == 1) {
      MPI_Barrier(MPI_COMM_WORLD);
      local_trans_time = wtime();
    }

    /* do the local transpose */
    istart = colstart;
    if (!tiling) {
      for (i=0; i<Block_order; i++) {
        for (j=0; j<Block_order; j++) {
          B(j,i) += A(i,j);
          A(i,j) += 1.0;
        }
      }
    } else {
      for (i=0; i<Block_order; i+=Tile_order) {
        for (j=0; j<Block_order; j+=Tile_order) {
          for (it=i; it<MIN(Block_order,i+Tile_order); it++) {
            for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
              B(jt,it) += A(it,jt);
              A(it,jt) += 1.0;
            }
          }
        }
      }
    }

    if (!passive_target && Num_procs>1) {
      MPI_Win_fence(MPI_MODE_NOSTORE | MPI_MODE_NOPRECEDE, rma_win);
    }

    for (phase=1; phase<Num_procs; phase++){
      send_to = (my_ID - phase + Num_procs)%Num_procs;

      istart = send_to*Block_order;
      if (!tiling) {
        for (i=0; i<Block_order; i++) {
          for (j=0; j<Block_order; j++) {
            Work_out(phase-1,j,i) = A(i,j);
            A(i,j) += 1.0;
          }
        }
      } else {
        for (i=0; i<Block_order; i+=Tile_order) {
          for (j=0; j<Block_order; j+=Tile_order) {
            for (it=i; it<MIN(Block_order,i+Tile_order); it++) {
              for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
                Work_out(phase-1,jt,it) = A(it,jt);
                A(it,jt) += 1.0;
              }
            }
          }
        }
      }

#if MPI_VERSION < 3
      if (passive_target) {
        MPI_Win_lock(MPI_LOCK_SHARED, send_to, MPI_MODE_NOCHECK, rma_win);
      }
#endif

      MPI_Put(Work_out_p+Block_size*(phase-1), Block_size, MPI_DOUBLE, send_to,
              Block_size*(phase-1), Block_size, MPI_DOUBLE, rma_win);

      if (passive_target) {
#if MPI_VERSION < 3
        MPI_Win_unlock(send_to, rma_win);
#else
        if (flush_bundle==1) {
          if (flush_local==1) {
            MPI_Win_flush_local(send_to, rma_win);
          } else {
            MPI_Win_flush(send_to, rma_win);
          }
        } else if ( (phase%flush_bundle) == 0) {
          /* Too lazy to record all targets, so let MPI do it internally (hopefully) */
          if (flush_local==1) {
            MPI_Win_flush_local_all(rma_win);
          } else {
            MPI_Win_flush_all(rma_win);
          }
        }
#endif
      }
    }  /* end of phase loop for puts  */

    if (Num_procs>1) {
      if (passive_target) {
#if MPI_VERSION >= 3
        MPI_Win_flush_all(rma_win);
#endif
        MPI_Barrier(MPI_COMM_WORLD);
      } else {
        MPI_Win_fence(MPI_MODE_NOSTORE, rma_win);
      }
    }

    for (phase=1; phase<Num_procs; phase++) {
      recv_from = (my_ID + phase)%Num_procs;

      istart = recv_from*Block_order;
      /* scatter received block to transposed matrix; no need to tile */
      for (j=0; j<Block_order; j++) {
        for (i=0; i<Block_order; i++) {
          B(i,j) += Work_in(phase-1,i,j);
        }
      }
    } /* end of phase loop for scatters */

    /* for the flush case we need to make sure we have consumed Work_in
       before overwriting it in the next iteration */
    if (Num_procs>1 && passive_target) {
      MPI_Barrier(MPI_COMM_WORLD);
    }

  } /* end of iterations */

  local_trans_time = wtime() - local_trans_time;
  MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  abserr = 0.0;
  istart = 0;
  double addit = ((double)(iterations+1) * (double) (iterations))/2.0;
  for (j=0;j<Block_order;j++) {
    for (i=0;i<order; i++) {
      abserr += ABS(B(i,j) - ((double)(order*i + j+colstart)*(iterations+1)+addit));
    }
  }

  MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    if (abserr_tot < epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
    } else {
      printf("ERROR: Aggregate absolute error %lf exceeds threshold %e\n",
             abserr_tot, epsilon);
      error = 1;
    }
  }
  bail_out(error);

  if (rma_win!=MPI_WIN_NULL) {
#if MPI_VERSION >=3
    if (passive_target) {
      MPI_Win_unlock_all(rma_win);
    }
#endif
    PRK_Win_free(&rma_win);
  }

  MPI_Finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */
int main(int argc, char *argv[])
{
  MPI_Win counter, table;
  char commands[MAX_COMMANDS][MAX_COMMAND_LEN] = {{0}};
  int rank, comm_size, i;

  // setenv ("MPICH_ASYNC_PROGRESS", "1", 0);
  MPI_Init (&argc, &argv);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);
  MPI_Comm_size (MPI_COMM_WORLD, &comm_size);

  MPI_Win_create (&commands, MAX_COMMANDS * MAX_COMMAND_LEN, MAX_COMMAND_LEN,
                  MPI_INFO_NULL, MPI_COMM_WORLD, &table);
  MPI_Win_fence (0, table);

  // Distribute command lines to tasks, round-robin, start from task 1
  i = 0;
  if (rank == 0) {
    char line[MAX_COMMAND_LEN + 2];

    MPI_Win_lock_all (MPI_MODE_NOCHECK, table);
    while (fgets (line, MAX_COMMAND_LEN + 2, stdin) != NULL) {
      if (i >= MAX_COMMANDS * comm_size) {
        fprintf (stderr, "MAX_COMMANDS * comm_size (%d) exceeded.\n", i);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      if (strlen (line) > MAX_COMMAND_LEN) {
        fprintf (stderr, "MAX_COMMAND_LEN exceeded, line %d: %s\n", i, line);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }

      MPI_Aint disp = (i / comm_size);
      int target_rank = (i + 1) % comm_size;
      MPI_Put (line, MAX_COMMAND_LEN, MPI_CHAR, target_rank, disp,
               MAX_COMMAND_LEN, MPI_CHAR, table);
      MPI_Win_flush_local (target_rank, table);
      i++;
    }
    MPI_Win_unlock_all (table);
  }
  MPI_Win_fence (0, table);

  // Initialize next_command counter/pointer to the top of the command
  // line stack.
  int next_command = -1;   // stays -1 if this rank received no commands
  MPI_Win_create (&next_command, sizeof(int), sizeof(int), MPI_INFO_NULL,
                  MPI_COMM_WORLD, &counter);
  for (i = MAX_COMMANDS - 1; i >= 0; i--) {
    if (commands[i][0]) {
      next_command = i;
      break;
    }
  }
  MPI_Barrier (MPI_COMM_WORLD);

  // Execute command lines
  //
  // Process commands from own rank + steal_increment
  const int dec = -1;
  int steal_increment = 0;
  int current_command;

  while (steal_increment < comm_size) {
    int current_rank = (rank + steal_increment) % comm_size;

    MPI_Win_lock (MPI_LOCK_SHARED, current_rank, 0, counter);
    MPI_Fetch_and_op (&dec, &current_command, MPI_INT, current_rank, 0,
                      MPI_SUM, counter);
    MPI_Win_unlock (current_rank, counter);

    if (current_command < 0) {
      steal_increment++;
    } else {
      char command[MAX_COMMAND_LEN] = {0};
      MPI_Win_lock (MPI_LOCK_SHARED, current_rank, MPI_MODE_NOCHECK, table);
      MPI_Get (&command, MAX_COMMAND_LEN, MPI_CHAR, current_rank,
               current_command, MAX_COMMAND_LEN, MPI_CHAR, table);
      MPI_Win_unlock (current_rank, table);
      system (command);
    }
  }

  MPI_Win_free (&counter);
  MPI_Win_free (&table);
  MPI_Barrier (MPI_COMM_WORLD);
  MPI_Finalize ();
  exit (0);
}
FORT_DLL_SPEC void FORT_CALL mpi_win_flush_local_ ( MPI_Fint *v1, MPI_Fint *v2, MPI_Fint *ierr )
{
    *ierr = MPI_Win_flush_local( (int)*v1, (MPI_Win)*v2 );
}
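The listings above share one pattern: inside a passive-target epoch, MPI_Win_flush_local completes pending operations at the origin only (so the local buffer can be reused or read), while MPI_Win_flush or the closing unlock is what guarantees remote completion. The following is a minimal, self-contained sketch distilling that pattern; the process roles, buffer sizes, and displacements are illustrative assumptions, not taken from any of the codes above.

/* Minimal sketch: rank 0 streams puts to rank 1, reusing one scratch buffer.
 * MPI_Win_flush_local makes the scratch buffer reusable after each put;
 * MPI_Win_flush enforces remote completion before the epoch is closed. */
#include <mpi.h>
#include <stdio.h>
#include <string.h>

#define NMSG 4

int main(int argc, char **argv)
{
    int rank, nprocs;
    double *win_mem, scratch[NMSG];
    MPI_Win win;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    MPI_Win_allocate(NMSG * sizeof(double), sizeof(double), MPI_INFO_NULL,
                     MPI_COMM_WORLD, &win_mem, &win);
    memset(win_mem, 0, NMSG * sizeof(double));
    MPI_Barrier(MPI_COMM_WORLD);

    if (nprocs > 1 && rank == 0) {
        MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win);
        for (int i = 0; i < NMSG; i++) {
            scratch[0] = (double)i;          /* scratch reused every iteration */
            MPI_Put(scratch, 1, MPI_DOUBLE, 1, i, 1, MPI_DOUBLE, win);
            MPI_Win_flush_local(1, win);     /* scratch may now be overwritten */
        }
        MPI_Win_flush(1, win);               /* all puts are complete at rank 1 */
        MPI_Win_unlock(1, win);
    }
    MPI_Barrier(MPI_COMM_WORLD);

    if (rank == 1)
        printf("rank 1 window: %g %g %g %g\n",
               win_mem[0], win_mem[1], win_mem[2], win_mem[3]);

    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}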